feat: Add new compact MetricInResponse type (#1593)

# What does this PR do? This change adds a compact type to include metrics in response as opposed to the full MetricEvent which is relevant for internal logging purposes. ## Test Plan ``` LLAMA_STACK_CONFIG=~/.llama/distributions/fireworks/fireworks-run.yaml pytest -s -v agents/test_agents.py --safety-shield meta-llama/Llama-Guard-3-8B --text-model meta-llama/Llama-3.1-8B-Instruct llama stack run ~/.llama/distributions/fireworks/fireworks-run.yaml curl --request POST \ --url http://localhost:8321/v1/inference/chat-completion \ --header 'content-type: application/json' \ --data '{ "model_id": "meta-llama/Llama-3.1-70B-Instruct", "messages": [ { "role": "user", "content": { "type": "text", "text": "where do humans live" } } ], "stream": false }' { "metrics": [ { "metric": "prompt_tokens", "value": 10, "unit": null }, { "metric": "completion_tokens", "value": 522, "unit": null }, { "metric": "total_tokens", "value": 532, "unit": null } ], "completion_message": { "role": "assistant", "content": "Humans live in various parts of the world...............", "stop_reason": "out_of_tokens", "tool_calls": [] }, "logprobs": null } ```
2025-03-12 15:45:44 -07:00 · 2025-03-12 15:45:44 -07:00 · 99bbe0e70b
commit 99bbe0e70b
parent ad939c97c3
4 changed files with 150 additions and 80 deletions
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@ -4549,7 +4549,7 @@
                    "metrics": {
                        "type": "array",
                        "items": {
-                            "$ref": "#/components/schemas/MetricEvent"
+                            "$ref": "#/components/schemas/MetricInResponse"
                        }
                    },
                    "completion_message": {
@ -4571,46 +4571,9 @@
                "title": "ChatCompletionResponse",
                "description": "Response from a chat completion request."
            },
-            "MetricEvent": {
+            "MetricInResponse": {
                "type": "object",
                "properties": {
-                    "trace_id": {
-                        "type": "string"
-                    },
-                    "span_id": {
-                        "type": "string"
-                    },
-                    "timestamp": {
-                        "type": "string",
-                        "format": "date-time"
-                    },
-                    "attributes": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "oneOf": [
-                                {
-                                    "type": "string"
-                                },
-                                {
-                                    "type": "integer"
-                                },
-                                {
-                                    "type": "number"
-                                },
-                                {
-                                    "type": "boolean"
-                                },
-                                {
-                                    "type": "null"
-                                }
-                            ]
-                        }
-                    },
-                    "type": {
-                        "type": "string",
-                        "const": "metric",
-                        "default": "metric"
-                    },
                    "metric": {
                        "type": "string"
                    },
@ -4630,15 +4593,10 @@
                },
                "additionalProperties": false,
                "required": [
-                    "trace_id",
-                    "span_id",
-                    "timestamp",
-                    "type",
                    "metric",
-                    "value",
-                    "unit"
+                    "value"
                ],
-                "title": "MetricEvent"
+                "title": "MetricInResponse"
            },
            "TokenLogProbs": {
                "type": "object",
@ -4715,6 +4673,12 @@
            "CompletionResponse": {
                "type": "object",
                "properties": {
+                    "metrics": {
+                        "type": "array",
+                        "items": {
+                            "$ref": "#/components/schemas/MetricInResponse"
+                        }
+                    },
                    "content": {
                        "type": "string",
                        "description": "The generated completion text"
@ -4924,7 +4888,7 @@
                    "metrics": {
                        "type": "array",
                        "items": {
-                            "$ref": "#/components/schemas/MetricEvent"
+                            "$ref": "#/components/schemas/MetricInResponse"
                        }
                    },
                    "event": {
@ -5082,6 +5046,12 @@
            "CompletionResponseStreamChunk": {
                "type": "object",
                "properties": {
+                    "metrics": {
+                        "type": "array",
+                        "items": {
+                            "$ref": "#/components/schemas/MetricInResponse"
+                        }
+                    },
                    "delta": {
                        "type": "string",
                        "description": "New content generated since last chunk. This can be one or more tokens."
@ -8363,6 +8333,75 @@
                ],
                "title": "LogSeverity"
            },
+            "MetricEvent": {
+                "type": "object",
+                "properties": {
+                    "trace_id": {
+                        "type": "string"
+                    },
+                    "span_id": {
+                        "type": "string"
+                    },
+                    "timestamp": {
+                        "type": "string",
+                        "format": "date-time"
+                    },
+                    "attributes": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "oneOf": [
+                                {
+                                    "type": "string"
+                                },
+                                {
+                                    "type": "integer"
+                                },
+                                {
+                                    "type": "number"
+                                },
+                                {
+                                    "type": "boolean"
+                                },
+                                {
+                                    "type": "null"
+                                }
+                            ]
+                        }
+                    },
+                    "type": {
+                        "type": "string",
+                        "const": "metric",
+                        "default": "metric"
+                    },
+                    "metric": {
+                        "type": "string"
+                    },
+                    "value": {
+                        "oneOf": [
+                            {
+                                "type": "integer"
+                            },
+                            {
+                                "type": "number"
+                            }
+                        ]
+                    },
+                    "unit": {
+                        "type": "string"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "trace_id",
+                    "span_id",
+                    "timestamp",
+                    "type",
+                    "metric",
+                    "value",
+                    "unit"
+                ],
+                "title": "MetricEvent"
+            },
            "SpanEndPayload": {
                "type": "object",
                "properties": {
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@ -3101,7 +3101,7 @@ components:
        metrics:
          type: array
          items:
-            $ref: '#/components/schemas/MetricEvent'
+            $ref: '#/components/schemas/MetricInResponse'
        completion_message:
          $ref: '#/components/schemas/CompletionMessage'
          description: The complete response message
@ -3116,29 +3116,9 @@ components:
        - completion_message
      title: ChatCompletionResponse
      description: Response from a chat completion request.
-    MetricEvent:
+    MetricInResponse:
      type: object
      properties:
-        trace_id:
-          type: string
-        span_id:
-          type: string
-        timestamp:
-          type: string
-          format: date-time
-        attributes:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: string
-              - type: integer
-              - type: number
-              - type: boolean
-              - type: 'null'
-        type:
-          type: string
-          const: metric
-          default: metric
        metric:
          type: string
        value:
@ -3149,14 +3129,9 @@ components:
          type: string
      additionalProperties: false
      required:
-        - trace_id
-        - span_id
-        - timestamp
-        - type
        - metric
        - value
-        - unit
-      title: MetricEvent
+      title: MetricInResponse
    TokenLogProbs:
      type: object
      properties:
@ -3213,6 +3188,10 @@ components:
    CompletionResponse:
      type: object
      properties:
+        metrics:
+          type: array
+          items:
+            $ref: '#/components/schemas/MetricInResponse'
        content:
          type: string
          description: The generated completion text
@ -3412,7 +3391,7 @@ components:
        metrics:
          type: array
          items:
-            $ref: '#/components/schemas/MetricEvent'
+            $ref: '#/components/schemas/MetricInResponse'
        event:
          $ref: '#/components/schemas/ChatCompletionResponseEvent'
          description: The event containing the new content
@ -3531,6 +3510,10 @@ components:
    CompletionResponseStreamChunk:
      type: object
      properties:
+        metrics:
+          type: array
+          items:
+            $ref: '#/components/schemas/MetricInResponse'
        delta:
          type: string
          description: >-
@ -5703,6 +5686,47 @@ components:
        - error
        - critical
      title: LogSeverity
+    MetricEvent:
+      type: object
+      properties:
+        trace_id:
+          type: string
+        span_id:
+          type: string
+        timestamp:
+          type: string
+          format: date-time
+        attributes:
+          type: object
+          additionalProperties:
+            oneOf:
+              - type: string
+              - type: integer
+              - type: number
+              - type: boolean
+              - type: 'null'
+        type:
+          type: string
+          const: metric
+          default: metric
+        metric:
+          type: string
+        value:
+          oneOf:
+            - type: integer
+            - type: number
+        unit:
+          type: string
+      additionalProperties: false
+      required:
+        - trace_id
+        - span_id
+        - timestamp
+        - type
+        - metric
+        - value
+        - unit
+      title: MetricEvent
    SpanEndPayload:
      type: object
      properties:
--- a/llama_stack/apis/telemetry/telemetry.py
+++ b/llama_stack/apis/telemetry/telemetry.py
@ -96,6 +96,13 @@ class MetricEvent(EventCommon):
    unit: str


+@json_schema_type
+class MetricInResponse(BaseModel):
+    metric: str
+    value: Union[int, float]
+    unit: Optional[str] = None
+
+
 # This is a short term solution to allow inference API to return metrics
 # The ideal way to do this is to have a way for all response types to include metrics
 # and all metric events logged to the telemetry API to be inlcuded with the response
@ -117,7 +124,7 @@ class MetricEvent(EventCommon):


 class MetricResponseMixin(BaseModel):
-    metrics: Optional[List[MetricEvent]] = None
+    metrics: Optional[List[MetricInResponse]] = None


@json_schema_type
--- a/llama_stack/distribution/routers/routers.py
+++ b/llama_stack/distribution/routers/routers.py
@ -48,7 +48,7 @@ from llama_stack.apis.scoring import (
    ScoringFnParams,
 )
 from llama_stack.apis.shields import Shield
-from llama_stack.apis.telemetry import MetricEvent, Telemetry
+from llama_stack.apis.telemetry import MetricEvent, MetricInResponse, Telemetry
 from llama_stack.apis.tools import (
    RAGDocument,
    RAGQueryConfig,
@ -206,12 +206,12 @@ class InferenceRouter(Inference):
        completion_tokens: int,
        total_tokens: int,
        model: Model,
-    ) -> List[MetricEvent]:
+    ) -> List[MetricInResponse]:
        metrics = self._construct_metrics(prompt_tokens, completion_tokens, total_tokens, model)
        if self.telemetry:
            for metric in metrics:
                await self.telemetry.log_event(metric)
-        return metrics
+        return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in metrics]

    async def _count_tokens(
        self,