diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index a036e5dc0..a495aa55a 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -6304,6 +6304,9 @@
               "$ref": "#/components/schemas/TokenLogProbs"
             },
             "description": "Optional log probabilities for generated tokens"
+          },
+          "usage": {
+            "$ref": "#/components/schemas/UsageInfo"
           }
         },
         "additionalProperties": false,
@@ -6362,6 +6365,31 @@
         "title": "TokenLogProbs",
         "description": "Log probabilities for generated tokens."
       },
+      "UsageInfo": {
+        "type": "object",
+        "properties": {
+          "completion_tokens": {
+            "type": "integer",
+            "description": "Number of tokens generated"
+          },
+          "prompt_tokens": {
+            "type": "integer",
+            "description": "Number of tokens in the prompt"
+          },
+          "total_tokens": {
+            "type": "integer",
+            "description": "Total number of tokens processed"
+          }
+        },
+        "additionalProperties": false,
+        "required": [
+          "completion_tokens",
+          "prompt_tokens",
+          "total_tokens"
+        ],
+        "title": "UsageInfo",
+        "description": "Usage information for a model."
+      },
       "BatchCompletionRequest": {
         "type": "object",
         "properties": {
@@ -10871,6 +10899,31 @@
         "title": "OpenAIChatCompletionToolCallFunction",
         "description": "Function call details for OpenAI-compatible tool calls."
       },
+      "OpenAIChatCompletionUsage": {
+        "type": "object",
+        "properties": {
+          "prompt_tokens": {
+            "type": "integer",
+            "description": "The number of tokens in the prompt"
+          },
+          "completion_tokens": {
+            "type": "integer",
+            "description": "The number of tokens in the completion"
+          },
+          "total_tokens": {
+            "type": "integer",
+            "description": "The total number of tokens used"
+          }
+        },
+        "additionalProperties": false,
+        "required": [
+          "prompt_tokens",
+          "completion_tokens",
+          "total_tokens"
+        ],
+        "title": "OpenAIChatCompletionUsage",
+        "description": "Usage information for an OpenAI-compatible chat completion response."
+      },
       "OpenAIChoice": {
         "type": "object",
         "properties": {
@@ -11208,6 +11261,13 @@
       "OpenAICompletionWithInputMessages": {
         "type": "object",
         "properties": {
+          "metrics": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/MetricInResponse"
+            },
+            "description": "(Optional) List of metrics associated with the API response"
+          },
           "id": {
             "type": "string",
             "description": "The ID of the chat completion"
@@ -11233,6 +11293,9 @@
             "type": "string",
             "description": "The model that was used to generate the chat completion"
           },
+          "usage": {
+            "$ref": "#/components/schemas/OpenAIChatCompletionUsage"
+          },
           "input_messages": {
             "type": "array",
             "items": {
@@ -12994,6 +13057,13 @@
           "items": {
             "type": "object",
             "properties": {
+              "metrics": {
+                "type": "array",
+                "items": {
+                  "$ref": "#/components/schemas/MetricInResponse"
+                },
+                "description": "(Optional) List of metrics associated with the API response"
+              },
               "id": {
                 "type": "string",
                 "description": "The ID of the chat completion"
@@ -13019,6 +13089,9 @@
                 "type": "string",
                 "description": "The model that was used to generate the chat completion"
               },
+              "usage": {
+                "$ref": "#/components/schemas/OpenAIChatCompletionUsage"
+              },
               "input_messages": {
                 "type": "array",
                 "items": {
@@ -14410,6 +14483,13 @@
       "OpenAIChatCompletion": {
         "type": "object",
         "properties": {
+          "metrics": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/MetricInResponse"
+            },
+            "description": "(Optional) List of metrics associated with the API response"
+          },
           "id": {
             "type": "string",
             "description": "The ID of the chat completion"
@@ -14434,6 +14514,9 @@
           "model": {
             "type": "string",
             "description": "The model that was used to generate the chat completion"
+          },
+          "usage": {
+            "$ref": "#/components/schemas/OpenAIChatCompletionUsage"
           }
         },
         "additionalProperties": false,
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 8ed04c1f8..a2ce9a052 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -4499,6 +4499,8 @@ components:
             $ref: '#/components/schemas/TokenLogProbs'
           description: >-
             Optional log probabilities for generated tokens
+        usage:
+          $ref: '#/components/schemas/UsageInfo'
       additionalProperties: false
       required:
         - completion_message
@@ -4540,6 +4542,25 @@
         - logprobs_by_token
       title: TokenLogProbs
       description: Log probabilities for generated tokens.
+    UsageInfo:
+      type: object
+      properties:
+        completion_tokens:
+          type: integer
+          description: Number of tokens generated
+        prompt_tokens:
+          type: integer
+          description: Number of tokens in the prompt
+        total_tokens:
+          type: integer
+          description: Total number of tokens processed
+      additionalProperties: false
+      required:
+        - completion_tokens
+        - prompt_tokens
+        - total_tokens
+      title: UsageInfo
+      description: Usage information for a model.
     BatchCompletionRequest:
       type: object
       properties:
@@ -8054,6 +8075,26 @@
       title: OpenAIChatCompletionToolCallFunction
       description: >-
         Function call details for OpenAI-compatible tool calls.
+    OpenAIChatCompletionUsage:
+      type: object
+      properties:
+        prompt_tokens:
+          type: integer
+          description: The number of tokens in the prompt
+        completion_tokens:
+          type: integer
+          description: The number of tokens in the completion
+        total_tokens:
+          type: integer
+          description: The total number of tokens used
+      additionalProperties: false
+      required:
+        - prompt_tokens
+        - completion_tokens
+        - total_tokens
+      title: OpenAIChatCompletionUsage
+      description: >-
+        Usage information for an OpenAI-compatible chat completion response.
     OpenAIChoice:
       type: object
       properties:
@@ -8316,6 +8357,12 @@
     OpenAICompletionWithInputMessages:
       type: object
       properties:
+        metrics:
+          type: array
+          items:
+            $ref: '#/components/schemas/MetricInResponse'
+          description: >-
+            (Optional) List of metrics associated with the API response
         id:
           type: string
           description: The ID of the chat completion
@@ -8338,6 +8385,8 @@
           type: string
           description: >-
             The model that was used to generate the chat completion
+        usage:
+          $ref: '#/components/schemas/OpenAIChatCompletionUsage'
         input_messages:
           type: array
           items:
@@ -9633,6 +9682,12 @@
           items:
             type: object
             properties:
+              metrics:
+                type: array
+                items:
+                  $ref: '#/components/schemas/MetricInResponse'
+                description: >-
+                  (Optional) List of metrics associated with the API response
               id:
                 type: string
                 description: The ID of the chat completion
@@ -9655,6 +9710,8 @@
                 type: string
                 description: >-
                   The model that was used to generate the chat completion
+              usage:
+                $ref: '#/components/schemas/OpenAIChatCompletionUsage'
               input_messages:
                 type: array
                 items:
@@ -10670,6 +10727,12 @@
     OpenAIChatCompletion:
       type: object
       properties:
+        metrics:
+          type: array
+          items:
+            $ref: '#/components/schemas/MetricInResponse'
+          description: >-
+            (Optional) List of metrics associated with the API response
         id:
           type: string
           description: The ID of the chat completion
@@ -10692,6 +10755,8 @@
           type: string
           description: >-
             The model that was used to generate the chat completion
+        usage:
+          $ref: '#/components/schemas/OpenAIChatCompletionUsage'
       additionalProperties: false
       required:
         - id
diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index bd4737ca7..1b7869a30 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -451,6 +451,20 @@ class ChatCompletionResponseStreamChunk(MetricResponseMixin):
     event: ChatCompletionResponseEvent
 
 
+@json_schema_type
+class UsageInfo(BaseModel):
+    """Usage information for a model.
+
+    :param completion_tokens: Number of tokens generated
+    :param prompt_tokens: Number of tokens in the prompt
+    :param total_tokens: Total number of tokens processed
+    """
+
+    completion_tokens: int
+    prompt_tokens: int
+    total_tokens: int
+
+
 @json_schema_type
 class ChatCompletionResponse(MetricResponseMixin):
     """Response from a chat completion request.
@@ -461,6 +475,7 @@ class ChatCompletionResponse(MetricResponseMixin):
 
     completion_message: CompletionMessage
     logprobs: list[TokenLogProbs] | None = None
+    usage: UsageInfo | None = None
 
 
 @json_schema_type
@@ -818,7 +833,21 @@ class OpenAIChoice(BaseModel):
 
 
 @json_schema_type
-class OpenAIChatCompletion(BaseModel):
+class OpenAIChatCompletionUsage(BaseModel):
+    """Usage information for an OpenAI-compatible chat completion response.
+
+    :param prompt_tokens: The number of tokens in the prompt
+    :param completion_tokens: The number of tokens in the completion
+    :param total_tokens: The total number of tokens used
+    """
+
+    prompt_tokens: int
+    completion_tokens: int
+    total_tokens: int
+
+
+@json_schema_type
+class OpenAIChatCompletion(MetricResponseMixin):
     """Response from an OpenAI-compatible chat completion request.
 
     :param id: The ID of the chat completion
@@ -833,6 +862,7 @@
     object: Literal["chat.completion"] = "chat.completion"
     created: int
     model: str
+    usage: OpenAIChatCompletionUsage | None = None
 
 
 @json_schema_type
diff --git a/llama_stack/providers/utils/inference/openai_compat.py b/llama_stack/providers/utils/inference/openai_compat.py
index 55c2ac0ad..e5f01d601 100644
--- a/llama_stack/providers/utils/inference/openai_compat.py
+++ b/llama_stack/providers/utils/inference/openai_compat.py
@@ -31,6 +31,8 @@ from openai.types.chat import (
     ChatCompletionContentPartTextParam as OpenAIChatCompletionContentPartTextParam,
 )
+from llama_stack.apis.inference.inference import UsageInfo
+
 try:
     from openai.types.chat import (
         ChatCompletionMessageFunctionToolCall as OpenAIChatCompletionMessageFunctionToolCall,
     )
@@ -103,6 +105,7 @@ from llama_stack.apis.inference import (
     JsonSchemaResponseFormat,
     Message,
     OpenAIChatCompletion,
+    OpenAIChatCompletionUsage,
     OpenAICompletion,
     OpenAICompletionChoice,
     OpenAIEmbeddingData,
@@ -277,6 +280,11 @@ def process_chat_completion_response(
     request: ChatCompletionRequest,
 ) -> ChatCompletionResponse:
     choice = response.choices[0]
+    usage = UsageInfo(
+        prompt_tokens=response.usage.prompt_tokens,
+        completion_tokens=response.usage.completion_tokens,
+        total_tokens=response.usage.total_tokens,
+    )
     if choice.finish_reason == "tool_calls":
         if not choice.message or not choice.message.tool_calls:
             raise ValueError("Tool calls are not present in the response")
@@ -290,6 +298,7 @@
                     content=json.dumps(tool_calls, default=lambda x: x.model_dump()),
                 ),
                 logprobs=None,
+                usage=usage,
             )
         else:
             # Otherwise, return tool calls as normal
@@ -301,6 +310,7 @@
                     content="",
                 ),
                 logprobs=None,
+                usage=usage,
             )
 
     # TODO: This does not work well with tool calls for vLLM remote provider
@@ -335,6 +345,7 @@
             tool_calls=raw_message.tool_calls,
         ),
         logprobs=None,
+        usage=usage,
     )
 
 
@@ -1375,6 +1386,7 @@ class OpenAIChatCompletionToLlamaStackMixin:
         user: str | None = None,
     ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
         messages = openai_messages_to_messages(messages)
+
         response_format = _convert_openai_request_response_format(response_format)
         sampling_params = _convert_openai_sampling_params(
             max_tokens=max_tokens,
@@ -1405,9 +1417,10 @@
 
         if stream:
             return OpenAIChatCompletionToLlamaStackMixin._process_stream_response(self, model, outstanding_responses)
-        return await OpenAIChatCompletionToLlamaStackMixin._process_non_stream_response(
+        response = await OpenAIChatCompletionToLlamaStackMixin._process_non_stream_response(
             self, model, outstanding_responses
         )
+        return response
 
     async def _process_stream_response(
         self,
@@ -1476,12 +1489,22 @@
         self, model: str, outstanding_responses: list[Awaitable[ChatCompletionResponse]]
     ) -> OpenAIChatCompletion:
         choices = []
+        total_prompt_tokens = 0
+        total_completion_tokens = 0
+        total_tokens = 0
+
         for outstanding_response in outstanding_responses:
            response = await outstanding_response
            completion_message = response.completion_message
            message = await convert_message_to_openai_dict_new(completion_message)
            finish_reason = _convert_stop_reason_to_openai_finish_reason(completion_message.stop_reason)
 
+            # Aggregate usage data
+            if response.usage:
+                total_prompt_tokens += response.usage.prompt_tokens
+                total_completion_tokens += response.usage.completion_tokens
+                total_tokens += response.usage.total_tokens
+
            choice = OpenAIChatCompletionChoice(
                index=len(choices),
                message=message,
@@ -1489,12 +1512,19 @@
             )
             choices.append(choice)
 
+        usage = None
+        if total_tokens > 0:
+            usage = OpenAIChatCompletionUsage(
+                prompt_tokens=total_prompt_tokens, completion_tokens=total_completion_tokens, total_tokens=total_tokens
+            )
+
         return OpenAIChatCompletion(
             id=f"chatcmpl-{uuid.uuid4()}",
             choices=choices,
             created=int(time.time()),
             model=model,
             object="chat.completion",
+            usage=usage,
         )
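For reference, here is a standalone sketch of the aggregation behavior this patch introduces in `_process_non_stream_response`: per-response token counts are summed, and a usage object is only attached when at least one underlying response actually reported usage. The dataclasses below are illustrative stand-ins that mirror the new `UsageInfo` and `OpenAIChatCompletionUsage` Pydantic models; they are not the library code itself.

```python
from dataclasses import dataclass


@dataclass
class UsageInfo:
    # Stand-in mirroring llama_stack.apis.inference.inference.UsageInfo
    completion_tokens: int
    prompt_tokens: int
    total_tokens: int


@dataclass
class OpenAIChatCompletionUsage:
    # Stand-in mirroring the new OpenAIChatCompletionUsage schema
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int


def aggregate_usage(usages: list[UsageInfo | None]) -> OpenAIChatCompletionUsage | None:
    """Sum per-response usage the way _process_non_stream_response does."""
    total_prompt_tokens = 0
    total_completion_tokens = 0
    total_tokens = 0
    for usage in usages:
        if usage:  # responses without usage info are simply skipped
            total_prompt_tokens += usage.prompt_tokens
            total_completion_tokens += usage.completion_tokens
            total_tokens += usage.total_tokens
    if total_tokens == 0:  # nothing reported -> usage stays None
        return None
    return OpenAIChatCompletionUsage(
        prompt_tokens=total_prompt_tokens,
        completion_tokens=total_completion_tokens,
        total_tokens=total_tokens,
    )


print(aggregate_usage([UsageInfo(12, 30, 42), None, UsageInfo(5, 8, 13)]))
# OpenAIChatCompletionUsage(prompt_tokens=38, completion_tokens=17, total_tokens=55)
```

Since `ChatCompletionResponse.usage` remains optional, providers that do not return usage data leave the field as `None`, and the OpenAI-compatible response then simply omits the aggregated `usage` block.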