diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 9ddb070d7..fd782f6c9 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -6372,6 +6372,9 @@
                             "$ref": "#/components/schemas/TokenLogProbs"
                         },
                         "description": "Optional log probabilities for generated tokens"
+                    },
+                    "usage": {
+                        "$ref": "#/components/schemas/UsageInfo"
                     }
                 },
                 "additionalProperties": false,
@@ -6430,6 +6433,31 @@
                 "title": "TokenLogProbs",
                 "description": "Log probabilities for generated tokens."
             },
+            "UsageInfo": {
+                "type": "object",
+                "properties": {
+                    "completion_tokens": {
+                        "type": "integer",
+                        "description": "Number of tokens generated"
+                    },
+                    "prompt_tokens": {
+                        "type": "integer",
+                        "description": "Number of tokens in the prompt"
+                    },
+                    "total_tokens": {
+                        "type": "integer",
+                        "description": "Total number of tokens processed"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "completion_tokens",
+                    "prompt_tokens",
+                    "total_tokens"
+                ],
+                "title": "UsageInfo",
+                "description": "Usage information for a model."
+            },
             "BatchCompletionRequest": {
                 "type": "object",
                 "properties": {
@@ -10939,6 +10967,31 @@
                 "title": "OpenAIChatCompletionToolCallFunction",
                 "description": "Function call details for OpenAI-compatible tool calls."
             },
+            "OpenAIChatCompletionUsage": {
+                "type": "object",
+                "properties": {
+                    "prompt_tokens": {
+                        "type": "integer",
+                        "description": "The number of tokens in the prompt"
+                    },
+                    "completion_tokens": {
+                        "type": "integer",
+                        "description": "The number of tokens in the completion"
+                    },
+                    "total_tokens": {
+                        "type": "integer",
+                        "description": "The total number of tokens used"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "prompt_tokens",
+                    "completion_tokens",
+                    "total_tokens"
+                ],
+                "title": "OpenAIChatCompletionUsage",
+                "description": "Usage information for an OpenAI-compatible chat completion response."
+            },
             "OpenAIChoice": {
                 "type": "object",
                 "properties": {
@@ -11276,6 +11329,13 @@
             "OpenAICompletionWithInputMessages": {
                 "type": "object",
                 "properties": {
+                    "metrics": {
+                        "type": "array",
+                        "items": {
+                            "$ref": "#/components/schemas/MetricInResponse"
+                        },
+                        "description": "(Optional) List of metrics associated with the API response"
+                    },
                     "id": {
                         "type": "string",
                         "description": "The ID of the chat completion"
@@ -11301,6 +11361,9 @@
                         "type": "string",
                         "description": "The model that was used to generate the chat completion"
                     },
+                    "usage": {
+                        "$ref": "#/components/schemas/OpenAIChatCompletionUsage"
+                    },
                     "input_messages": {
                         "type": "array",
                         "items": {
@@ -13062,6 +13125,13 @@
                         "items": {
                             "type": "object",
                             "properties": {
+                                "metrics": {
+                                    "type": "array",
+                                    "items": {
+                                        "$ref": "#/components/schemas/MetricInResponse"
+                                    },
+                                    "description": "(Optional) List of metrics associated with the API response"
+                                },
                                 "id": {
                                     "type": "string",
                                     "description": "The ID of the chat completion"
@@ -13087,6 +13157,9 @@
                                     "type": "string",
                                     "description": "The model that was used to generate the chat completion"
                                 },
+                                "usage": {
+                                    "$ref": "#/components/schemas/OpenAIChatCompletionUsage"
+                                },
                                 "input_messages": {
                                     "type": "array",
                                     "items": {
@@ -14478,6 +14551,13 @@
             "OpenAIChatCompletion": {
                 "type": "object",
                 "properties": {
+                    "metrics": {
+                        "type": "array",
+                        "items": {
+                            "$ref": "#/components/schemas/MetricInResponse"
+                        },
+                        "description": "(Optional) List of metrics associated with the API response"
+                    },
                     "id": {
                         "type": "string",
                         "description": "The ID of the chat completion"
@@ -14502,6 +14582,9 @@
                     "model": {
                         "type": "string",
                         "description": "The model that was used to generate the chat completion"
+                    },
+                    "usage": {
+                        "$ref": "#/components/schemas/OpenAIChatCompletionUsage"
                     }
                 },
                 "additionalProperties": false,
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 94dc5c0f9..d0096e268 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -4548,6 +4548,8 @@ components:
            $ref: '#/components/schemas/TokenLogProbs'
          description: >-
            Optional log probabilities for generated tokens
+      usage:
+        $ref: '#/components/schemas/UsageInfo'
     additionalProperties: false
     required:
       - completion_message
@@ -4589,6 +4591,25 @@ components:
       - logprobs_by_token
     title: TokenLogProbs
     description: Log probabilities for generated tokens.
+  UsageInfo:
+    type: object
+    properties:
+      completion_tokens:
+        type: integer
+        description: Number of tokens generated
+      prompt_tokens:
+        type: integer
+        description: Number of tokens in the prompt
+      total_tokens:
+        type: integer
+        description: Total number of tokens processed
+    additionalProperties: false
+    required:
+      - completion_tokens
+      - prompt_tokens
+      - total_tokens
+    title: UsageInfo
+    description: Usage information for a model.
   BatchCompletionRequest:
     type: object
     properties:
@@ -8103,6 +8124,26 @@ components:
     title: OpenAIChatCompletionToolCallFunction
     description: >-
       Function call details for OpenAI-compatible tool calls.
+  OpenAIChatCompletionUsage:
+    type: object
+    properties:
+      prompt_tokens:
+        type: integer
+        description: The number of tokens in the prompt
+      completion_tokens:
+        type: integer
+        description: The number of tokens in the completion
+      total_tokens:
+        type: integer
+        description: The total number of tokens used
+    additionalProperties: false
+    required:
+      - prompt_tokens
+      - completion_tokens
+      - total_tokens
+    title: OpenAIChatCompletionUsage
+    description: >-
+      Usage information for an OpenAI-compatible chat completion response.
   OpenAIChoice:
     type: object
     properties:
@@ -8365,6 +8406,12 @@ components:
   OpenAICompletionWithInputMessages:
     type: object
     properties:
+      metrics:
+        type: array
+        items:
+          $ref: '#/components/schemas/MetricInResponse'
+        description: >-
+          (Optional) List of metrics associated with the API response
       id:
         type: string
         description: The ID of the chat completion
@@ -8387,6 +8434,8 @@ components:
         type: string
         description: >-
           The model that was used to generate the chat completion
+      usage:
+        $ref: '#/components/schemas/OpenAIChatCompletionUsage'
       input_messages:
         type: array
         items:
@@ -9682,6 +9731,12 @@ components:
         items:
           type: object
           properties:
+            metrics:
+              type: array
+              items:
+                $ref: '#/components/schemas/MetricInResponse'
+              description: >-
+                (Optional) List of metrics associated with the API response
             id:
               type: string
               description: The ID of the chat completion
@@ -9704,6 +9759,8 @@ components:
               type: string
               description: >-
                 The model that was used to generate the chat completion
+            usage:
+              $ref: '#/components/schemas/OpenAIChatCompletionUsage'
             input_messages:
               type: array
              items:
@@ -10719,6 +10776,12 @@ components:
   OpenAIChatCompletion:
     type: object
     properties:
+      metrics:
+        type: array
+        items:
+          $ref: '#/components/schemas/MetricInResponse'
+        description: >-
+          (Optional) List of metrics associated with the API response
       id:
         type: string
         description: The ID of the chat completion
@@ -10741,6 +10804,8 @@ components:
         type: string
        description: >-
           The model that was used to generate the chat completion
+      usage:
+        $ref: '#/components/schemas/OpenAIChatCompletionUsage'
     additionalProperties: false
     required:
       - id
diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index bd4737ca7..1b7869a30 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -451,6 +451,20 @@ class ChatCompletionResponseStreamChunk(MetricResponseMixin):
     event: ChatCompletionResponseEvent
 
 
+@json_schema_type
+class UsageInfo(BaseModel):
+    """Usage information for a model.
+
+    :param completion_tokens: Number of tokens generated
+    :param prompt_tokens: Number of tokens in the prompt
+    :param total_tokens: Total number of tokens processed
+    """
+
+    completion_tokens: int
+    prompt_tokens: int
+    total_tokens: int
+
+
 @json_schema_type
 class ChatCompletionResponse(MetricResponseMixin):
     """Response from a chat completion request.
@@ -461,6 +475,7 @@ class ChatCompletionResponse(MetricResponseMixin):
 
     completion_message: CompletionMessage
     logprobs: list[TokenLogProbs] | None = None
+    usage: UsageInfo | None = None
 
 
 @json_schema_type
@@ -818,7 +833,21 @@ class OpenAIChoice(BaseModel):
 
 
 @json_schema_type
-class OpenAIChatCompletion(BaseModel):
+class OpenAIChatCompletionUsage(BaseModel):
+    """Usage information for an OpenAI-compatible chat completion response.
+
+    :param prompt_tokens: The number of tokens in the prompt
+    :param completion_tokens: The number of tokens in the completion
+    :param total_tokens: The total number of tokens used
+    """
+
+    prompt_tokens: int
+    completion_tokens: int
+    total_tokens: int
+
+
+@json_schema_type
+class OpenAIChatCompletion(MetricResponseMixin):
     """Response from an OpenAI-compatible chat completion request.
 
     :param id: The ID of the chat completion
@@ -833,6 +862,7 @@ class OpenAIChatCompletion(BaseModel):
     object: Literal["chat.completion"] = "chat.completion"
     created: int
     model: str
+    usage: OpenAIChatCompletionUsage | None = None
 
 
 @json_schema_type
diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py
index 762d7073e..1c356d1f1 100644
--- a/llama_stack/core/routers/inference.py
+++ b/llama_stack/core/routers/inference.py
@@ -590,6 +590,7 @@ class InferenceRouter(Inference):
 
     async def _nonstream_openai_chat_completion(self, provider: Inference, params: dict) -> OpenAIChatCompletion:
         response = await provider.openai_chat_completion(**params)
+
         for choice in response.choices:
             # some providers return an empty list for no tool calls in non-streaming responses
             # but the OpenAI API returns None. So, set tool_calls to None if it's empty
@@ -739,7 +740,6 @@ class InferenceRouter(Inference):
         id = None
         created = None
         choices_data: dict[int, dict[str, Any]] = {}
-
         try:
             async for chunk in response:
                 # Skip None chunks
diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py
index 2c01d192c..fc77a7214 100644
--- a/llama_stack/providers/remote/inference/fireworks/fireworks.py
+++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py
@@ -130,7 +130,7 @@ class FireworksInferenceAdapter(OpenAIMixin, ModelRegistryHelper, Inference, Nee
 
     async def _stream_completion(self, request: CompletionRequest) -> AsyncGenerator:
         params = await self._get_params(request)
-        stream = self.client.completions.create(**params)
+        stream = await self.client.completions.create(**params)
 
         async for chunk in process_completion_stream_response(stream):
             yield chunk
@@ -208,9 +208,9 @@ class FireworksInferenceAdapter(OpenAIMixin, ModelRegistryHelper, Inference, Nee
         params = await self._get_params(request)
 
         if "messages" in params:
-            stream = self.client.chat.completions.create(**params)
+            stream = await self.client.chat.completions.create(**params)
         else:
-            stream = self.client.completions.create(**params)
+            stream = await self.client.completions.create(**params)
 
         async for chunk in process_chat_completion_stream_response(stream, request):
             yield chunk
diff --git a/llama_stack/providers/utils/inference/openai_compat.py b/llama_stack/providers/utils/inference/openai_compat.py
index 55c2ac0ad..3ef4fb134 100644
--- a/llama_stack/providers/utils/inference/openai_compat.py
+++ b/llama_stack/providers/utils/inference/openai_compat.py
@@ -31,6 +31,8 @@ from openai.types.chat import (
     ChatCompletionContentPartTextParam as OpenAIChatCompletionContentPartTextParam,
 )
 
+from llama_stack.apis.inference.inference import UsageInfo
+
 try:
     from openai.types.chat import (
         ChatCompletionMessageFunctionToolCall as OpenAIChatCompletionMessageFunctionToolCall,
@@ -103,6 +105,7 @@ from llama_stack.apis.inference import (
     JsonSchemaResponseFormat,
     Message,
     OpenAIChatCompletion,
+    OpenAIChatCompletionUsage,
     OpenAICompletion,
     OpenAICompletionChoice,
     OpenAIEmbeddingData,
@@ -277,6 +280,11 @@ def process_chat_completion_response(
     request: ChatCompletionRequest,
 ) -> ChatCompletionResponse:
     choice = response.choices[0]
+    usage = UsageInfo(
+        prompt_tokens=response.usage.prompt_tokens,
+        completion_tokens=response.usage.completion_tokens,
+        total_tokens=response.usage.total_tokens,
+    )
     if choice.finish_reason == "tool_calls":
         if not choice.message or not choice.message.tool_calls:
             raise ValueError("Tool calls are not present in the response")
@@ -290,6 +298,7 @@ def process_chat_completion_response(
                     content=json.dumps(tool_calls, default=lambda x: x.model_dump()),
                 ),
                 logprobs=None,
+                usage=usage,
             )
         else:
             # Otherwise, return tool calls as normal
@@ -301,6 +310,7 @@ def process_chat_completion_response(
                     content="",
                 ),
                 logprobs=None,
+                usage=usage,
             )
 
     # TODO: This does not work well with tool calls for vLLM remote provider
@@ -335,6 +345,7 @@ def process_chat_completion_response(
             tool_calls=raw_message.tool_calls,
         ),
         logprobs=None,
+        usage=usage,
     )
 
 
@@ -646,7 +657,7 @@ async def convert_message_to_openai_dict_new(
                     arguments=json.dumps(tool.arguments),
                 ),
                 type="function",
-            )
+            ).model_dump()
             for tool in message.tool_calls
         ]
         params = {}
@@ -657,6 +668,7 @@ async def convert_message_to_openai_dict_new(
             content=await _convert_message_content(message.content),
             **params,
         )
+
     elif isinstance(message, ToolResponseMessage):
         out = OpenAIChatCompletionToolMessage(
             role="tool",
@@ -1375,6 +1387,7 @@ class OpenAIChatCompletionToLlamaStackMixin:
         user: str | None = None,
     ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
         messages = openai_messages_to_messages(messages)
+
         response_format = _convert_openai_request_response_format(response_format)
         sampling_params = _convert_openai_sampling_params(
             max_tokens=max_tokens,
@@ -1401,7 +1414,6 @@ class OpenAIChatCompletionToLlamaStackMixin:
                 tools=tools,
             )
             outstanding_responses.append(response)
-
         if stream:
             return OpenAIChatCompletionToLlamaStackMixin._process_stream_response(self, model, outstanding_responses)
 
@@ -1476,12 +1488,22 @@ class OpenAIChatCompletionToLlamaStackMixin:
         self, model: str, outstanding_responses: list[Awaitable[ChatCompletionResponse]]
     ) -> OpenAIChatCompletion:
         choices = []
+        total_prompt_tokens = 0
+        total_completion_tokens = 0
+        total_tokens = 0
+
         for outstanding_response in outstanding_responses:
             response = await outstanding_response
             completion_message = response.completion_message
             message = await convert_message_to_openai_dict_new(completion_message)
             finish_reason = _convert_stop_reason_to_openai_finish_reason(completion_message.stop_reason)
 
+            # Aggregate usage data
+            if response.usage:
+                total_prompt_tokens += response.usage.prompt_tokens
+                total_completion_tokens += response.usage.completion_tokens
+                total_tokens += response.usage.total_tokens
+
             choice = OpenAIChatCompletionChoice(
                 index=len(choices),
                 message=message,
@@ -1489,12 +1511,17 @@ class OpenAIChatCompletionToLlamaStackMixin:
             )
             choices.append(choice)
 
+        usage = OpenAIChatCompletionUsage(
+            prompt_tokens=total_prompt_tokens, completion_tokens=total_completion_tokens, total_tokens=total_tokens
+        )
+
         return OpenAIChatCompletion(
             id=f"chatcmpl-{uuid.uuid4()}",
             choices=choices,
             created=int(time.time()),
             model=model,
             object="chat.completion",
+            usage=usage,
         )
 
 
diff --git a/tests/integration/suites.py b/tests/integration/suites.py
index 231480447..e8b1b6973 100644
--- a/tests/integration/suites.py
+++ b/tests/integration/suites.py
@@ -108,6 +108,15 @@ SETUP_DEFINITIONS: dict[str, Setup] = {
             "embedding_model": "together/togethercomputer/m2-bert-80M-32k-retrieval",
         },
     ),
+    "fireworks": Setup(
name="fireworks", + description="Fireworks provider with a text model", + defaults={ + "text_model": "fireworks/accounts/fireworks/models/llama-v3p1-8b-instruct", + "vision_model": "fireworks/accounts/fireworks/models/llama-v3p2-90b-vision-instruct", + "embedding_model": "nomic-ai/nomic-embed-text-v1.5", + }, + ), }