diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 709360ede..dbd530aa3 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -4549,7 +4549,7 @@
"metrics": {
"type": "array",
"items": {
- "$ref": "#/components/schemas/MetricEvent"
+ "$ref": "#/components/schemas/MetricInResponse"
}
},
"completion_message": {
@@ -4571,46 +4571,9 @@
"title": "ChatCompletionResponse",
"description": "Response from a chat completion request."
},
- "MetricEvent": {
+ "MetricInResponse": {
"type": "object",
"properties": {
- "trace_id": {
- "type": "string"
- },
- "span_id": {
- "type": "string"
- },
- "timestamp": {
- "type": "string",
- "format": "date-time"
- },
- "attributes": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "string"
- },
- {
- "type": "integer"
- },
- {
- "type": "number"
- },
- {
- "type": "boolean"
- },
- {
- "type": "null"
- }
- ]
- }
- },
- "type": {
- "type": "string",
- "const": "metric",
- "default": "metric"
- },
"metric": {
"type": "string"
},
@@ -4630,15 +4593,10 @@
},
"additionalProperties": false,
"required": [
- "trace_id",
- "span_id",
- "timestamp",
- "type",
"metric",
- "value",
- "unit"
+ "value"
],
- "title": "MetricEvent"
+ "title": "MetricInResponse"
},
"TokenLogProbs": {
"type": "object",
@@ -4715,6 +4673,12 @@
"CompletionResponse": {
"type": "object",
"properties": {
+ "metrics": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/MetricInResponse"
+ }
+ },
"content": {
"type": "string",
"description": "The generated completion text"
@@ -4924,7 +4888,7 @@
"metrics": {
"type": "array",
"items": {
- "$ref": "#/components/schemas/MetricEvent"
+ "$ref": "#/components/schemas/MetricInResponse"
}
},
"event": {
@@ -5082,6 +5046,12 @@
"CompletionResponseStreamChunk": {
"type": "object",
"properties": {
+ "metrics": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/MetricInResponse"
+ }
+ },
"delta": {
"type": "string",
"description": "New content generated since last chunk. This can be one or more tokens."
@@ -8363,6 +8333,75 @@
],
"title": "LogSeverity"
},
+ "MetricEvent": {
+ "type": "object",
+ "properties": {
+ "trace_id": {
+ "type": "string"
+ },
+ "span_id": {
+ "type": "string"
+ },
+ "timestamp": {
+ "type": "string",
+ "format": "date-time"
+ },
+ "attributes": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "integer"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "null"
+ }
+ ]
+ }
+ },
+ "type": {
+ "type": "string",
+ "const": "metric",
+ "default": "metric"
+ },
+ "metric": {
+ "type": "string"
+ },
+ "value": {
+ "oneOf": [
+ {
+ "type": "integer"
+ },
+ {
+ "type": "number"
+ }
+ ]
+ },
+ "unit": {
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "trace_id",
+ "span_id",
+ "timestamp",
+ "type",
+ "metric",
+ "value",
+ "unit"
+ ],
+ "title": "MetricEvent"
+ },
"SpanEndPayload": {
"type": "object",
"properties": {
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 4c00fbe63..cca1872a4 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -3101,7 +3101,7 @@ components:
metrics:
type: array
items:
- $ref: '#/components/schemas/MetricEvent'
+ $ref: '#/components/schemas/MetricInResponse'
completion_message:
$ref: '#/components/schemas/CompletionMessage'
description: The complete response message
@@ -3116,29 +3116,9 @@ components:
- completion_message
title: ChatCompletionResponse
description: Response from a chat completion request.
- MetricEvent:
+ MetricInResponse:
type: object
properties:
- trace_id:
- type: string
- span_id:
- type: string
- timestamp:
- type: string
- format: date-time
- attributes:
- type: object
- additionalProperties:
- oneOf:
- - type: string
- - type: integer
- - type: number
- - type: boolean
- - type: 'null'
- type:
- type: string
- const: metric
- default: metric
metric:
type: string
value:
@@ -3149,14 +3129,9 @@ components:
type: string
additionalProperties: false
required:
- - trace_id
- - span_id
- - timestamp
- - type
- metric
- value
- - unit
- title: MetricEvent
+ title: MetricInResponse
TokenLogProbs:
type: object
properties:
@@ -3213,6 +3188,10 @@ components:
CompletionResponse:
type: object
properties:
+ metrics:
+ type: array
+ items:
+ $ref: '#/components/schemas/MetricInResponse'
content:
type: string
description: The generated completion text
@@ -3412,7 +3391,7 @@ components:
metrics:
type: array
items:
- $ref: '#/components/schemas/MetricEvent'
+ $ref: '#/components/schemas/MetricInResponse'
event:
$ref: '#/components/schemas/ChatCompletionResponseEvent'
description: The event containing the new content
@@ -3531,6 +3510,10 @@ components:
CompletionResponseStreamChunk:
type: object
properties:
+ metrics:
+ type: array
+ items:
+ $ref: '#/components/schemas/MetricInResponse'
delta:
type: string
description: >-
@@ -5703,6 +5686,47 @@ components:
- error
- critical
title: LogSeverity
+ MetricEvent:
+ type: object
+ properties:
+ trace_id:
+ type: string
+ span_id:
+ type: string
+ timestamp:
+ type: string
+ format: date-time
+ attributes:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: string
+ - type: integer
+ - type: number
+ - type: boolean
+ - type: 'null'
+ type:
+ type: string
+ const: metric
+ default: metric
+ metric:
+ type: string
+ value:
+ oneOf:
+ - type: integer
+ - type: number
+ unit:
+ type: string
+ additionalProperties: false
+ required:
+ - trace_id
+ - span_id
+ - timestamp
+ - type
+ - metric
+ - value
+ - unit
+ title: MetricEvent
SpanEndPayload:
type: object
properties:
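
The YAML mirrors the JSON spec one-for-one. Read as Pydantic models, the two schemas now look roughly like the sketch below (a reconstruction from the YAML for comparison, not the library's actual class definitions): `MetricEvent` requires trace/span/timestamp and a unit, while `MetricInResponse` needs only `metric` and `value`.

```python
# A rough Pydantic reconstruction of the two schemas as specified in the YAML
# above: a sketch for comparison, not the library's actual class definitions.
from datetime import datetime
from typing import Dict, Literal, Optional, Union

from pydantic import BaseModel


class MetricEvent(BaseModel):
    # Full telemetry event: tied to a trace/span and always carries a unit.
    trace_id: str
    span_id: str
    timestamp: datetime
    attributes: Optional[Dict[str, Union[str, int, float, bool, None]]] = None
    type: Literal["metric"] = "metric"
    metric: str
    value: Union[int, float]
    unit: str


class MetricInResponse(BaseModel):
    # Slim inline form: just the measurement, no trace context.
    metric: str
    value: Union[int, float]
    unit: Optional[str] = None
```
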
diff --git a/llama_stack/apis/telemetry/telemetry.py b/llama_stack/apis/telemetry/telemetry.py
index fe75677e7..cbea57e79 100644
--- a/llama_stack/apis/telemetry/telemetry.py
+++ b/llama_stack/apis/telemetry/telemetry.py
@@ -96,6 +96,13 @@ class MetricEvent(EventCommon):
unit: str
+@json_schema_type
+class MetricInResponse(BaseModel):
+ metric: str
+ value: Union[int, float]
+ unit: Optional[str] = None
+
+
# This is a short-term solution to allow the inference API to return metrics
# The ideal way to do this is to have a way for all response types to include metrics
# and all metric events logged to the telemetry API to be included with the response
@@ -117,7 +124,7 @@ class MetricEvent(EventCommon):
class MetricResponseMixin(BaseModel):
- metrics: Optional[List[MetricEvent]] = None
+ metrics: Optional[List[MetricInResponse]] = None
@json_schema_type
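
With `MetricResponseMixin.metrics` retyped, any response class that inherits the mixin carries the slim metric objects. A minimal usage sketch, assuming Pydantic v2 and a hypothetical `DemoResponse` standing in for a real response type such as `ChatCompletionResponse`:

```python
# Minimal usage sketch, assuming Pydantic v2. DemoResponse is a hypothetical
# stand-in for a real response type such as ChatCompletionResponse.
from typing import List, Optional, Union

from pydantic import BaseModel


class MetricInResponse(BaseModel):
    metric: str
    value: Union[int, float]
    unit: Optional[str] = None


class MetricResponseMixin(BaseModel):
    metrics: Optional[List[MetricInResponse]] = None


class DemoResponse(MetricResponseMixin):
    content: str


resp = DemoResponse(
    content="Hello!",
    metrics=[MetricInResponse(metric="total_tokens", value=16)],
)
print(resp.model_dump())
# -> {'metrics': [{'metric': 'total_tokens', 'value': 16, 'unit': None}],
#     'content': 'Hello!'}
```
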
diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py
index 34102d04b..22a1e46f9 100644
--- a/llama_stack/distribution/routers/routers.py
+++ b/llama_stack/distribution/routers/routers.py
@@ -48,7 +48,7 @@ from llama_stack.apis.scoring import (
ScoringFnParams,
)
from llama_stack.apis.shields import Shield
-from llama_stack.apis.telemetry import MetricEvent, Telemetry
+from llama_stack.apis.telemetry import MetricEvent, MetricInResponse, Telemetry
from llama_stack.apis.tools import (
RAGDocument,
RAGQueryConfig,
@@ -206,12 +206,12 @@ class InferenceRouter(Inference):
completion_tokens: int,
total_tokens: int,
model: Model,
- ) -> List[MetricEvent]:
+ ) -> List[MetricInResponse]:
metrics = self._construct_metrics(prompt_tokens, completion_tokens, total_tokens, model)
if self.telemetry:
for metric in metrics:
await self.telemetry.log_event(metric)
- return metrics
+ return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in metrics]
async def _count_tokens(
self,
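
Note that the router still logs the full `MetricEvent` to telemetry; only the return value is projected down. Since the comprehension forwards just `metric` and `value`, the optional `unit` comes back as `None` in responses. A standalone sketch of that projection (with a hypothetical `FakeMetricEvent` standing in for the real event type):

```python
# Standalone sketch of the projection in the final hunk above. FakeMetricEvent
# is a hypothetical stand-in for the real MetricEvent; only the fields the
# comprehension touches are modeled.
from dataclasses import dataclass
from typing import Optional, Union

from pydantic import BaseModel


class MetricInResponse(BaseModel):
    metric: str
    value: Union[int, float]
    unit: Optional[str] = None


@dataclass
class FakeMetricEvent:
    metric: str
    value: Union[int, float]
    unit: str


events = [
    FakeMetricEvent(metric="prompt_tokens", value=12, unit="tokens"),
    FakeMetricEvent(metric="completion_tokens", value=4, unit="tokens"),
]

# Mirrors the router's return statement: unit is not forwarded, so it
# defaults to None on the slim model.
slim = [MetricInResponse(metric=e.metric, value=e.value) for e in events]
```
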