diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 151ac1451..75e0c4dfa 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -3106,6 +3106,12 @@
"ChatCompletionResponse": {
"type": "object",
"properties": {
+ "metrics": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/MetricEvent"
+ }
+ },
"completion_message": {
"$ref": "#/components/schemas/CompletionMessage",
"description": "The complete response message"
@@ -3124,6 +3130,77 @@
],
"description": "Response from a chat completion request."
},
+ "MetricEvent": {
+ "type": "object",
+ "properties": {
+ "trace_id": {
+ "type": "string"
+ },
+ "span_id": {
+ "type": "string"
+ },
+ "timestamp": {
+ "type": "string",
+ "format": "date-time"
+ },
+ "attributes": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ },
+ "type": {
+ "type": "string",
+ "const": "metric",
+ "default": "metric"
+ },
+ "metric": {
+ "type": "string"
+ },
+ "value": {
+ "oneOf": [
+ {
+ "type": "integer"
+ },
+ {
+ "type": "number"
+ }
+ ]
+ },
+ "unit": {
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "trace_id",
+ "span_id",
+ "timestamp",
+ "type",
+ "metric",
+ "value",
+ "unit"
+ ]
+ },
"TokenLogProbs": {
"type": "object",
"properties": {
@@ -3388,6 +3465,12 @@
"ChatCompletionResponseStreamChunk": {
"type": "object",
"properties": {
+ "metrics": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/MetricEvent"
+ }
+ },
"event": {
"$ref": "#/components/schemas/ChatCompletionResponseEvent",
"description": "The event containing the new content"
@@ -6374,77 +6457,6 @@
"critical"
]
},
- "MetricEvent": {
- "type": "object",
- "properties": {
- "trace_id": {
- "type": "string"
- },
- "span_id": {
- "type": "string"
- },
- "timestamp": {
- "type": "string",
- "format": "date-time"
- },
- "attributes": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- },
- "type": {
- "type": "string",
- "const": "metric",
- "default": "metric"
- },
- "metric": {
- "type": "string"
- },
- "value": {
- "oneOf": [
- {
- "type": "integer"
- },
- {
- "type": "number"
- }
- ]
- },
- "unit": {
- "type": "string"
- }
- },
- "additionalProperties": false,
- "required": [
- "trace_id",
- "span_id",
- "timestamp",
- "type",
- "metric",
- "value",
- "unit"
- ]
- },
"SpanEndPayload": {
"type": "object",
"properties": {
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 37fba4541..c60a002e2 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -1925,6 +1925,10 @@ components:
ChatCompletionResponse:
type: object
properties:
+ metrics:
+ type: array
+ items:
+ $ref: '#/components/schemas/MetricEvent'
completion_message:
$ref: '#/components/schemas/CompletionMessage'
description: The complete response message
@@ -1938,6 +1942,47 @@ components:
required:
- completion_message
description: Response from a chat completion request.
+ MetricEvent:
+ type: object
+ properties:
+ trace_id:
+ type: string
+ span_id:
+ type: string
+ timestamp:
+ type: string
+ format: date-time
+ attributes:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ type:
+ type: string
+ const: metric
+ default: metric
+ metric:
+ type: string
+ value:
+ oneOf:
+ - type: integer
+ - type: number
+ unit:
+ type: string
+ additionalProperties: false
+ required:
+ - trace_id
+ - span_id
+ - timestamp
+ - type
+ - metric
+ - value
+ - unit
TokenLogProbs:
type: object
properties:
@@ -2173,6 +2218,10 @@ components:
ChatCompletionResponseStreamChunk:
type: object
properties:
+ metrics:
+ type: array
+ items:
+ $ref: '#/components/schemas/MetricEvent'
event:
$ref: '#/components/schemas/ChatCompletionResponseEvent'
description: The event containing the new content
@@ -4070,47 +4119,6 @@ components:
- warn
- error
- critical
- MetricEvent:
- type: object
- properties:
- trace_id:
- type: string
- span_id:
- type: string
- timestamp:
- type: string
- format: date-time
- attributes:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- type:
- type: string
- const: metric
- default: metric
- metric:
- type: string
- value:
- oneOf:
- - type: integer
- - type: number
- unit:
- type: string
- additionalProperties: false
- required:
- - trace_id
- - span_id
- - timestamp
- - type
- - metric
- - value
- - unit
SpanEndPayload:
type: object
properties:
diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index 4e095e831..9fccd3911 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -13,8 +13,8 @@ from typing import (
Literal,
Optional,
Protocol,
- runtime_checkable,
Union,
+ runtime_checkable,
)
from llama_models.llama3.api.datatypes import (
@@ -31,6 +31,7 @@ from typing_extensions import Annotated
from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent
from llama_stack.apis.models import Model
+from llama_stack.apis.telemetry.telemetry import MetricResponseMixin
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
@@ -357,7 +358,7 @@ class ChatCompletionRequest(BaseModel):
@json_schema_type
-class ChatCompletionResponseStreamChunk(BaseModel):
+class ChatCompletionResponseStreamChunk(MetricResponseMixin, BaseModel):
"""A chunk of a streamed chat completion response.
:param event: The event containing the new content
@@ -367,7 +368,7 @@ class ChatCompletionResponseStreamChunk(BaseModel):
@json_schema_type
-class ChatCompletionResponse(BaseModel):
+class ChatCompletionResponse(MetricResponseMixin, BaseModel):
"""Response from a chat completion request.
:param completion_message: The complete response message
diff --git a/llama_stack/apis/telemetry/telemetry.py b/llama_stack/apis/telemetry/telemetry.py
index 324064007..6a62e274d 100644
--- a/llama_stack/apis/telemetry/telemetry.py
+++ b/llama_stack/apis/telemetry/telemetry.py
@@ -13,8 +13,8 @@ from typing import (
Literal,
Optional,
Protocol,
- runtime_checkable,
Union,
+ runtime_checkable,
)
from llama_models.schema_utils import json_schema_type, register_schema, webmethod
@@ -94,6 +94,30 @@ class MetricEvent(EventCommon):
unit: str
+# This is a short-term solution to allow the inference API to return metrics.
+# Ideally, every response type would include a metrics field, and all metric
+# events logged to the telemetry API would be included with the response.
+# To do this, we would need to augment all response types with a metrics field.
+# We have hit a blocker in the Stainless SDK that prevents us from doing this.
+# The blocker is that if we were to augment response types that have a data
+# field in them, like so:
+#
+#     class ListModelsResponse(BaseModel):
+#         metrics: Optional[List[MetricEvent]] = None
+#         data: List[Models]
+#         ...
+#
+# the client SDK would need to access the data via a .data field, which is not
+# ergonomic. The Stainless SDK does support unwrapping the response type, but
+# it requires that the response type have only a single field.
+
+# We will need a way in the client SDK to signal that metrics are needed and,
+# if they are, to return the full response type without unwrapping it.
+
+
+class MetricResponseMixin(BaseModel):
+ metrics: Optional[List[MetricEvent]] = None
+
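+# Illustrative usage (hypothetical client code): ChatCompletionResponse in
+# inference.py mixes this in, so a caller can read metrics directly off the
+# response alongside its payload, e.g.
+#
+#     response = await client.inference.chat_completion(...)
+#     for metric in response.metrics or []:
+#         print(metric.metric, metric.value, metric.unit)
+#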
+
@json_schema_type
class StructuredLogType(Enum):
SPAN_START = "span_start"