From 99bbe0e70b125f93da659ca722a9d5c2f6ef7022 Mon Sep 17 00:00:00 2001
From: Dinesh Yeduguru
Date: Wed, 12 Mar 2025 15:45:44 -0700
Subject: [PATCH] feat: Add new compact MetricInResponse type (#1593)

# What does this PR do?
This change adds a compact MetricInResponse type for returning metrics in
API responses, as opposed to the full MetricEvent, which is relevant only
for internal logging purposes.

## Test Plan

```
LLAMA_STACK_CONFIG=~/.llama/distributions/fireworks/fireworks-run.yaml pytest -s -v agents/test_agents.py --safety-shield meta-llama/Llama-Guard-3-8B --text-model meta-llama/Llama-3.1-8B-Instruct

llama stack run ~/.llama/distributions/fireworks/fireworks-run.yaml

curl --request POST \
  --url http://localhost:8321/v1/inference/chat-completion \
  --header 'content-type: application/json' \
  --data '{
  "model_id": "meta-llama/Llama-3.1-70B-Instruct",
  "messages": [
    {
      "role": "user",
      "content": {
        "type": "text",
        "text": "where do humans live"
      }
    }
  ],
  "stream": false
}'

{
  "metrics": [
    {
      "metric": "prompt_tokens",
      "value": 10,
      "unit": null
    },
    {
      "metric": "completion_tokens",
      "value": 522,
      "unit": null
    },
    {
      "metric": "total_tokens",
      "value": 532,
      "unit": null
    }
  ],
  "completion_message": {
    "role": "assistant",
    "content": "Humans live in various parts of the world...............",
    "stop_reason": "out_of_tokens",
    "tool_calls": []
  },
  "logprobs": null
}
```
---
 docs/_static/llama-stack-spec.html          | 133 +++++++++++++-------
 docs/_static/llama-stack-spec.yaml          |  82 +++++++-----
 llama_stack/apis/telemetry/telemetry.py     |   9 +-
 llama_stack/distribution/routers/routers.py |   6 +-
 4 files changed, 150 insertions(+), 80 deletions(-)

diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 709360ede..dbd530aa3 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -4549,7 +4549,7 @@
           "metrics": {
             "type": "array",
             "items": {
-              "$ref": "#/components/schemas/MetricEvent"
+              "$ref": "#/components/schemas/MetricInResponse"
             }
           },
           "completion_message": {
@@ -4571,46 +4571,9 @@
         "title": "ChatCompletionResponse",
         "description": "Response from a chat completion request."
       },
-      "MetricInResponse": {
+      "MetricInResponse": {
         "type": "object",
         "properties": {
-          "trace_id": {
-            "type": "string"
-          },
-          "span_id": {
-            "type": "string"
-          },
-          "timestamp": {
-            "type": "string",
-            "format": "date-time"
-          },
-          "attributes": {
-            "type": "object",
-            "additionalProperties": {
-              "oneOf": [
-                {
-                  "type": "string"
-                },
-                {
-                  "type": "integer"
-                },
-                {
-                  "type": "number"
-                },
-                {
-                  "type": "boolean"
-                },
-                {
-                  "type": "null"
-                }
-              ]
-            }
-          },
-          "type": {
-            "type": "string",
-            "const": "metric",
-            "default": "metric"
-          },
           "metric": {
             "type": "string"
          },
@@ -4630,15 +4593,10 @@
         },
         "additionalProperties": false,
         "required": [
-          "trace_id",
-          "span_id",
-          "timestamp",
-          "type",
           "metric",
-          "value",
-          "unit"
+          "value"
         ],
-        "title": "MetricEvent"
+        "title": "MetricInResponse"
       },
       "TokenLogProbs": {
         "type": "object",
@@ -4715,6 +4673,12 @@
       "CompletionResponse": {
         "type": "object",
         "properties": {
+          "metrics": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/MetricInResponse"
+            }
+          },
           "content": {
             "type": "string",
             "description": "The generated completion text"
@@ -4924,7 +4888,7 @@
           "metrics": {
             "type": "array",
             "items": {
-              "$ref": "#/components/schemas/MetricEvent"
+              "$ref": "#/components/schemas/MetricInResponse"
             }
           },
           "event": {
@@ -5082,6 +5046,12 @@
       "CompletionResponseStreamChunk": {
         "type": "object",
         "properties": {
+          "metrics": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/MetricInResponse"
+            }
+          },
           "delta": {
             "type": "string",
             "description": "New content generated since last chunk. This can be one or more tokens."
@@ -8363,6 +8333,75 @@
       ],
       "title": "LogSeverity"
     },
+      "MetricEvent": {
+        "type": "object",
+        "properties": {
+          "trace_id": {
+            "type": "string"
+          },
+          "span_id": {
+            "type": "string"
+          },
+          "timestamp": {
+            "type": "string",
+            "format": "date-time"
+          },
+          "attributes": {
+            "type": "object",
+            "additionalProperties": {
+              "oneOf": [
+                {
+                  "type": "string"
+                },
+                {
+                  "type": "integer"
+                },
+                {
+                  "type": "number"
+                },
+                {
+                  "type": "boolean"
+                },
+                {
+                  "type": "null"
+                }
+              ]
+            }
+          },
+          "type": {
+            "type": "string",
+            "const": "metric",
+            "default": "metric"
+          },
+          "metric": {
+            "type": "string"
+          },
+          "value": {
+            "oneOf": [
+              {
+                "type": "integer"
+              },
+              {
+                "type": "number"
+              }
+            ]
+          },
+          "unit": {
+            "type": "string"
+          }
+        },
+        "additionalProperties": false,
+        "required": [
+          "trace_id",
+          "span_id",
+          "timestamp",
+          "type",
+          "metric",
+          "value",
+          "unit"
+        ],
+        "title": "MetricEvent"
+      },
       "SpanEndPayload": {
         "type": "object",
         "properties": {
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 4c00fbe63..cca1872a4 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -3101,7 +3101,7 @@ components:
         metrics:
           type: array
           items:
-            $ref: '#/components/schemas/MetricEvent'
+            $ref: '#/components/schemas/MetricInResponse'
         completion_message:
           $ref: '#/components/schemas/CompletionMessage'
           description: The complete response message
@@ -3116,29 +3116,9 @@ components:
         - completion_message
       title: ChatCompletionResponse
       description: Response from a chat completion request.
-    MetricEvent:
+    MetricInResponse:
       type: object
       properties:
-        trace_id:
-          type: string
-        span_id:
-          type: string
-        timestamp:
-          type: string
-          format: date-time
-        attributes:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: string
-              - type: integer
-              - type: number
-              - type: boolean
-              - type: 'null'
-        type:
-          type: string
-          const: metric
-          default: metric
         metric:
           type: string
         value:
@@ -3149,14 +3129,9 @@
           type: string
       additionalProperties: false
       required:
-        - trace_id
-        - span_id
-        - timestamp
-        - type
         - metric
         - value
-        - unit
-      title: MetricEvent
+      title: MetricInResponse
     TokenLogProbs:
       type: object
       properties:
@@ -3213,6 +3188,10 @@
     CompletionResponse:
       type: object
       properties:
+        metrics:
+          type: array
+          items:
+            $ref: '#/components/schemas/MetricInResponse'
         content:
           type: string
           description: The generated completion text
@@ -3412,7 +3391,7 @@ components:
         metrics:
          type: array
          items:
-            $ref: '#/components/schemas/MetricEvent'
+            $ref: '#/components/schemas/MetricInResponse'
         event:
           $ref: '#/components/schemas/ChatCompletionResponseEvent'
           description: The event containing the new content
@@ -3531,6 +3510,10 @@
    CompletionResponseStreamChunk:
       type: object
       properties:
+        metrics:
+          type: array
+          items:
+            $ref: '#/components/schemas/MetricInResponse'
         delta:
           type: string
           description: >-
@@ -5703,6 +5686,47 @@ components:
         - error
         - critical
       title: LogSeverity
+    MetricEvent:
+      type: object
+      properties:
+        trace_id:
+          type: string
+        span_id:
+          type: string
+        timestamp:
+          type: string
+          format: date-time
+        attributes:
+          type: object
+          additionalProperties:
+            oneOf:
+              - type: string
+              - type: integer
+              - type: number
+              - type: boolean
+              - type: 'null'
+        type:
+          type: string
+          const: metric
+          default: metric
+        metric:
+          type: string
+        value:
+          oneOf:
+            - type: integer
+            - type: number
+        unit:
+          type: string
+      additionalProperties: false
+      required:
+        - trace_id
+        - span_id
+        - timestamp
+        - type
+        - metric
+        - value
+        - unit
+      title: MetricEvent
     SpanEndPayload:
       type: object
       properties:
diff --git a/llama_stack/apis/telemetry/telemetry.py b/llama_stack/apis/telemetry/telemetry.py
index fe75677e7..cbea57e79 100644
--- a/llama_stack/apis/telemetry/telemetry.py
+++ b/llama_stack/apis/telemetry/telemetry.py
@@ -96,6 +96,13 @@ class MetricEvent(EventCommon):
     unit: str
 
 
+@json_schema_type
+class MetricInResponse(BaseModel):
+    metric: str
+    value: Union[int, float]
+    unit: Optional[str] = None
+
+
 # This is a short term solution to allow inference API to return metrics
 # The ideal way to do this is to have a way for all response types to include metrics
 # and all metric events logged to the telemetry API to be included with the response
@@ -117,7 +124,7 @@ class MetricEvent(EventCommon):
 
 
 class MetricResponseMixin(BaseModel):
-    metrics: Optional[List[MetricEvent]] = None
+    metrics: Optional[List[MetricInResponse]] = None
 
 
 @json_schema_type
diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py
index 34102d04b..22a1e46f9 100644
--- a/llama_stack/distribution/routers/routers.py
+++ b/llama_stack/distribution/routers/routers.py
@@ -48,7 +48,7 @@ from llama_stack.apis.scoring import (
     ScoringFnParams,
 )
 from llama_stack.apis.shields import Shield
-from llama_stack.apis.telemetry import MetricEvent, Telemetry
+from llama_stack.apis.telemetry import MetricEvent, MetricInResponse, Telemetry
 from llama_stack.apis.tools import (
     RAGDocument,
     RAGQueryConfig,
@@ -206,12 +206,12 @@ class InferenceRouter(Inference):
         completion_tokens: int,
         total_tokens: int,
         model: Model,
-    ) -> List[MetricEvent]:
+    ) -> List[MetricInResponse]:
         metrics = self._construct_metrics(prompt_tokens, completion_tokens, total_tokens, model)
         if self.telemetry:
             for metric in metrics:
                 await self.telemetry.log_event(metric)
-        return metrics
+        return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in metrics]
 
     async def _count_tokens(
         self,
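
For quick reference, the shape change above boils down to the following standalone sketch. It is illustrative only: the real definitions live in `llama_stack/apis/telemetry/telemetry.py` and the conversion in `llama_stack/distribution/routers/routers.py`; the helper name `to_response_metrics` is invented for this example, and Pydantic v2 is assumed for `model_dump`.

```python
# Illustrative sketch only -- mirrors the new type and the InferenceRouter
# conversion; "to_response_metrics" is a made-up name for this example.
from typing import List, Optional, Union

from pydantic import BaseModel


class MetricInResponse(BaseModel):
    # Compact payload returned inline with inference responses.
    metric: str
    value: Union[int, float]
    unit: Optional[str] = None


class MetricEvent(BaseModel):
    # Trimmed stand-in for the full telemetry event, which also carries
    # span_id, timestamp, attributes, and a const "metric" type tag.
    trace_id: str
    metric: str
    value: Union[int, float]
    unit: str


def to_response_metrics(events: List[MetricEvent]) -> List[MetricInResponse]:
    # Full events still go to telemetry.log_event(); callers only get
    # metric/value back, so unit stays None in the response.
    return [MetricInResponse(metric=e.metric, value=e.value) for e in events]


if __name__ == "__main__":
    events = [MetricEvent(trace_id="t1", metric="prompt_tokens", value=10, unit="tokens")]
    print([m.model_dump() for m in to_response_metrics(events)])
    # [{'metric': 'prompt_tokens', 'value': 10, 'unit': None}]
```

Note that the conversion intentionally drops `unit` (it defaults to `None`), which is why every metric in the Test Plan output shows `"unit": null`.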