mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-04 04:04:14 +00:00)

test-fireworks-fix

This commit is contained in:
  parent 69a52213a1
  commit f9348a6bdf

4 changed files with 210 additions and 2 deletions
docs/_static/llama-stack-spec.html (vendored), 83 changes
@@ -6304,6 +6304,9 @@
             "$ref": "#/components/schemas/TokenLogProbs"
           },
           "description": "Optional log probabilities for generated tokens"
         },
+        "usage": {
+          "$ref": "#/components/schemas/UsageInfo"
+        }
       },
       "additionalProperties": false,
@@ -6362,6 +6365,31 @@
         "title": "TokenLogProbs",
         "description": "Log probabilities for generated tokens."
       },
+      "UsageInfo": {
+        "type": "object",
+        "properties": {
+          "completion_tokens": {
+            "type": "integer",
+            "description": "Number of tokens generated"
+          },
+          "prompt_tokens": {
+            "type": "integer",
+            "description": "Number of tokens in the prompt"
+          },
+          "total_tokens": {
+            "type": "integer",
+            "description": "Total number of tokens processed"
+          }
+        },
+        "additionalProperties": false,
+        "required": [
+          "completion_tokens",
+          "prompt_tokens",
+          "total_tokens"
+        ],
+        "title": "UsageInfo",
+        "description": "Usage information for a model."
+      },
       "BatchCompletionRequest": {
         "type": "object",
         "properties": {
@@ -10871,6 +10899,31 @@
         "title": "OpenAIChatCompletionToolCallFunction",
         "description": "Function call details for OpenAI-compatible tool calls."
       },
+      "OpenAIChatCompletionUsage": {
+        "type": "object",
+        "properties": {
+          "prompt_tokens": {
+            "type": "integer",
+            "description": "The number of tokens in the prompt"
+          },
+          "completion_tokens": {
+            "type": "integer",
+            "description": "The number of tokens in the completion"
+          },
+          "total_tokens": {
+            "type": "integer",
+            "description": "The total number of tokens used"
+          }
+        },
+        "additionalProperties": false,
+        "required": [
+          "prompt_tokens",
+          "completion_tokens",
+          "total_tokens"
+        ],
+        "title": "OpenAIChatCompletionUsage",
+        "description": "Usage information for an OpenAI-compatible chat completion response."
+      },
       "OpenAIChoice": {
         "type": "object",
         "properties": {
@@ -11208,6 +11261,13 @@
       "OpenAICompletionWithInputMessages": {
         "type": "object",
         "properties": {
+          "metrics": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/MetricInResponse"
+            },
+            "description": "(Optional) List of metrics associated with the API response"
+          },
           "id": {
             "type": "string",
             "description": "The ID of the chat completion"
@@ -11233,6 +11293,9 @@
             "type": "string",
             "description": "The model that was used to generate the chat completion"
           },
+          "usage": {
+            "$ref": "#/components/schemas/OpenAIChatCompletionUsage"
+          },
           "input_messages": {
             "type": "array",
             "items": {
@@ -12994,6 +13057,13 @@
           "items": {
             "type": "object",
             "properties": {
+              "metrics": {
+                "type": "array",
+                "items": {
+                  "$ref": "#/components/schemas/MetricInResponse"
+                },
+                "description": "(Optional) List of metrics associated with the API response"
+              },
               "id": {
                 "type": "string",
                 "description": "The ID of the chat completion"
@@ -13019,6 +13089,9 @@
                 "type": "string",
                 "description": "The model that was used to generate the chat completion"
               },
+              "usage": {
+                "$ref": "#/components/schemas/OpenAIChatCompletionUsage"
+              },
               "input_messages": {
                 "type": "array",
                 "items": {
@@ -14410,6 +14483,13 @@
       "OpenAIChatCompletion": {
         "type": "object",
         "properties": {
+          "metrics": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/MetricInResponse"
+            },
+            "description": "(Optional) List of metrics associated with the API response"
+          },
           "id": {
             "type": "string",
             "description": "The ID of the chat completion"
@@ -14434,6 +14514,9 @@
           "model": {
             "type": "string",
             "description": "The model that was used to generate the chat completion"
           },
+          "usage": {
+            "$ref": "#/components/schemas/OpenAIChatCompletionUsage"
+          }
         },
         "additionalProperties": false,
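For orientation, the schema change above adds "usage" as an optional property of the chat completion response, carrying three integer token counts. A minimal sketch of reading it from a decoded response body (plain Python; the example values are made up, not captured from a real server):

    # Sketch: reading the new optional "usage" block from a decoded response dict.
    # Field names follow the UsageInfo schema above; the counts are hypothetical.
    body = {
        "completion_message": {"role": "assistant", "content": "..."},
        "usage": {"prompt_tokens": 12, "completion_tokens": 34, "total_tokens": 46},
    }

    usage = body.get("usage")  # optional: "usage" is not in the schema's required list
    if usage is not None:
        print(usage["prompt_tokens"], usage["completion_tokens"], usage["total_tokens"])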
docs/_static/llama-stack-spec.yaml (vendored), 65 changes
@@ -4499,6 +4499,8 @@ components:
             $ref: '#/components/schemas/TokenLogProbs'
           description: >-
             Optional log probabilities for generated tokens
+        usage:
+          $ref: '#/components/schemas/UsageInfo'
       additionalProperties: false
       required:
         - completion_message
@@ -4540,6 +4542,25 @@ components:
         - logprobs_by_token
       title: TokenLogProbs
       description: Log probabilities for generated tokens.
+    UsageInfo:
+      type: object
+      properties:
+        completion_tokens:
+          type: integer
+          description: Number of tokens generated
+        prompt_tokens:
+          type: integer
+          description: Number of tokens in the prompt
+        total_tokens:
+          type: integer
+          description: Total number of tokens processed
+      additionalProperties: false
+      required:
+        - completion_tokens
+        - prompt_tokens
+        - total_tokens
+      title: UsageInfo
+      description: Usage information for a model.
     BatchCompletionRequest:
       type: object
       properties:
@@ -8054,6 +8075,26 @@ components:
       title: OpenAIChatCompletionToolCallFunction
       description: >-
         Function call details for OpenAI-compatible tool calls.
+    OpenAIChatCompletionUsage:
+      type: object
+      properties:
+        prompt_tokens:
+          type: integer
+          description: The number of tokens in the prompt
+        completion_tokens:
+          type: integer
+          description: The number of tokens in the completion
+        total_tokens:
+          type: integer
+          description: The total number of tokens used
+      additionalProperties: false
+      required:
+        - prompt_tokens
+        - completion_tokens
+        - total_tokens
+      title: OpenAIChatCompletionUsage
+      description: >-
+        Usage information for an OpenAI-compatible chat completion response.
     OpenAIChoice:
       type: object
       properties:
@@ -8316,6 +8357,12 @@ components:
     OpenAICompletionWithInputMessages:
       type: object
       properties:
+        metrics:
+          type: array
+          items:
+            $ref: '#/components/schemas/MetricInResponse'
+          description: >-
+            (Optional) List of metrics associated with the API response
         id:
           type: string
           description: The ID of the chat completion
@@ -8338,6 +8385,8 @@ components:
           type: string
           description: >-
             The model that was used to generate the chat completion
+        usage:
+          $ref: '#/components/schemas/OpenAIChatCompletionUsage'
         input_messages:
           type: array
           items:
@@ -9633,6 +9682,12 @@ components:
         items:
           type: object
           properties:
+            metrics:
+              type: array
+              items:
+                $ref: '#/components/schemas/MetricInResponse'
+              description: >-
+                (Optional) List of metrics associated with the API response
             id:
               type: string
               description: The ID of the chat completion
@@ -9655,6 +9710,8 @@ components:
               type: string
               description: >-
                 The model that was used to generate the chat completion
+            usage:
+              $ref: '#/components/schemas/OpenAIChatCompletionUsage'
             input_messages:
               type: array
               items:
@@ -10670,6 +10727,12 @@ components:
     OpenAIChatCompletion:
       type: object
      properties:
+        metrics:
+          type: array
+          items:
+            $ref: '#/components/schemas/MetricInResponse'
+          description: >-
+            (Optional) List of metrics associated with the API response
         id:
           type: string
           description: The ID of the chat completion
@@ -10692,6 +10755,8 @@ components:
           type: string
           description: >-
             The model that was used to generate the chat completion
+        usage:
+          $ref: '#/components/schemas/OpenAIChatCompletionUsage'
       additionalProperties: false
       required:
         - id
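On the OpenAI-compatible surface the same counts land in the response model's optional usage attribute. A sketch constructing the model by hand (in practice it comes back from the server; the values here are made up, and the remaining optional fields take their defaults):

    # Sketch: the OpenAI-compatible response model carries an optional usage block.
    from llama_stack.apis.inference.inference import (
        OpenAIChatCompletion,
        OpenAIChatCompletionUsage,
    )

    completion = OpenAIChatCompletion(
        id="chatcmpl-example",
        choices=[],
        created=0,
        model="example-model",
        usage=OpenAIChatCompletionUsage(prompt_tokens=12, completion_tokens=34, total_tokens=46),
    )
    if completion.usage is not None:
        print(completion.usage.total_tokens)  # 46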
llama_stack/apis/inference/inference.py
@@ -451,6 +451,20 @@ class ChatCompletionResponseStreamChunk(MetricResponseMixin):
     event: ChatCompletionResponseEvent
 
 
+@json_schema_type
+class UsageInfo(BaseModel):
+    """Usage information for a model.
+
+    :param completion_tokens: Number of tokens generated
+    :param prompt_tokens: Number of tokens in the prompt
+    :param total_tokens: Total number of tokens processed
+    """
+
+    completion_tokens: int
+    prompt_tokens: int
+    total_tokens: int
+
+
 @json_schema_type
 class ChatCompletionResponse(MetricResponseMixin):
     """Response from a chat completion request.
@@ -461,6 +475,7 @@ class ChatCompletionResponse(MetricResponseMixin):
 
     completion_message: CompletionMessage
     logprobs: list[TokenLogProbs] | None = None
+    usage: UsageInfo | None = None
 
 
 @json_schema_type
@@ -818,7 +833,21 @@ class OpenAIChoice(BaseModel):
 
 
 @json_schema_type
-class OpenAIChatCompletion(BaseModel):
+class OpenAIChatCompletionUsage(BaseModel):
+    """Usage information for an OpenAI-compatible chat completion response.
+
+    :param prompt_tokens: The number of tokens in the prompt
+    :param completion_tokens: The number of tokens in the completion
+    :param total_tokens: The total number of tokens used
+    """
+
+    prompt_tokens: int
+    completion_tokens: int
+    total_tokens: int
+
+
+@json_schema_type
+class OpenAIChatCompletion(MetricResponseMixin):
     """Response from an OpenAI-compatible chat completion request.
 
     :param id: The ID of the chat completion
@@ -833,6 +862,7 @@ class OpenAIChatCompletion(BaseModel):
     object: Literal["chat.completion"] = "chat.completion"
     created: int
     model: str
+    usage: OpenAIChatCompletionUsage | None = None
 
 
 @json_schema_type
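As a quick illustration of the two Pydantic models added above (a sketch; the token counts are made-up values):

    # Sketch: instantiating the new usage model. Counts are hypothetical.
    from llama_stack.apis.inference.inference import UsageInfo

    usage = UsageInfo(prompt_tokens=12, completion_tokens=34, total_tokens=46)
    print(usage.model_dump())
    # -> {'completion_tokens': 34, 'prompt_tokens': 12, 'total_tokens': 46}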
llama_stack/providers/utils/inference/openai_compat.py
@@ -31,6 +31,8 @@ from openai.types.chat import (
     ChatCompletionContentPartTextParam as OpenAIChatCompletionContentPartTextParam,
 )
 
+from llama_stack.apis.inference.inference import UsageInfo
+
 try:
     from openai.types.chat import (
         ChatCompletionMessageFunctionToolCall as OpenAIChatCompletionMessageFunctionToolCall,
@@ -103,6 +105,7 @@ from llama_stack.apis.inference import (
     JsonSchemaResponseFormat,
     Message,
     OpenAIChatCompletion,
+    OpenAIChatCompletionUsage,
     OpenAICompletion,
     OpenAICompletionChoice,
     OpenAIEmbeddingData,
@@ -277,6 +280,11 @@ def process_chat_completion_response(
     request: ChatCompletionRequest,
 ) -> ChatCompletionResponse:
     choice = response.choices[0]
+    usage = UsageInfo(
+        prompt_tokens=response.usage.prompt_tokens,
+        completion_tokens=response.usage.completion_tokens,
+        total_tokens=response.usage.total_tokens,
+    )
     if choice.finish_reason == "tool_calls":
         if not choice.message or not choice.message.tool_calls:
             raise ValueError("Tool calls are not present in the response")
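Note that the hunk above reads response.usage unconditionally, so it assumes the provider's completion object always carries a usage block; a provider that omits it would raise an AttributeError here. A defensive variant (a sketch, not part of this commit) would fall back to the field's None default:

    # Sketch: tolerate providers that return no usage block (hypothetical variant).
    usage = None
    if getattr(response, "usage", None) is not None:
        usage = UsageInfo(
            prompt_tokens=response.usage.prompt_tokens,
            completion_tokens=response.usage.completion_tokens,
            total_tokens=response.usage.total_tokens,
        )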
@@ -290,6 +298,7 @@
                 content=json.dumps(tool_calls, default=lambda x: x.model_dump()),
             ),
             logprobs=None,
+            usage=usage,
         )
     else:
         # Otherwise, return tool calls as normal
@@ -301,6 +310,7 @@
                 content="",
             ),
             logprobs=None,
+            usage=usage,
         )
 
     # TODO: This does not work well with tool calls for vLLM remote provider
@@ -335,6 +345,7 @@
             tool_calls=raw_message.tool_calls,
         ),
         logprobs=None,
+        usage=usage,
     )
 
 
@@ -1375,6 +1386,7 @@ class OpenAIChatCompletionToLlamaStackMixin:
         user: str | None = None,
     ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
         messages = openai_messages_to_messages(messages)
+
         response_format = _convert_openai_request_response_format(response_format)
         sampling_params = _convert_openai_sampling_params(
             max_tokens=max_tokens,
@@ -1405,9 +1417,10 @@
         if stream:
             return OpenAIChatCompletionToLlamaStackMixin._process_stream_response(self, model, outstanding_responses)
 
-        return await OpenAIChatCompletionToLlamaStackMixin._process_non_stream_response(
+        response = await OpenAIChatCompletionToLlamaStackMixin._process_non_stream_response(
             self, model, outstanding_responses
         )
+        return response
 
     async def _process_stream_response(
         self,
@@ -1476,12 +1489,22 @@
         self, model: str, outstanding_responses: list[Awaitable[ChatCompletionResponse]]
     ) -> OpenAIChatCompletion:
         choices = []
+        total_prompt_tokens = 0
+        total_completion_tokens = 0
+        total_tokens = 0
+
         for outstanding_response in outstanding_responses:
             response = await outstanding_response
             completion_message = response.completion_message
             message = await convert_message_to_openai_dict_new(completion_message)
             finish_reason = _convert_stop_reason_to_openai_finish_reason(completion_message.stop_reason)
 
+            # Aggregate usage data
+            if response.usage:
+                total_prompt_tokens += response.usage.prompt_tokens
+                total_completion_tokens += response.usage.completion_tokens
+                total_tokens += response.usage.total_tokens
+
             choice = OpenAIChatCompletionChoice(
                 index=len(choices),
                 message=message,
@@ -1489,12 +1512,19 @@
             )
         choices.append(choice)
 
+        usage = None
+        if total_tokens > 0:
+            usage = OpenAIChatCompletionUsage(
+                prompt_tokens=total_prompt_tokens, completion_tokens=total_completion_tokens, total_tokens=total_tokens
+            )
+
         return OpenAIChatCompletion(
             id=f"chatcmpl-{uuid.uuid4()}",
             choices=choices,
             created=int(time.time()),
             model=model,
             object="chat.completion",
+            usage=usage,
         )
 
 
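The aggregation in _process_non_stream_response is plain summation of the per-response counts, and the usage block is emitted only when something was actually counted. The same arithmetic in isolation (a sketch; the counts are hypothetical):

    # Sketch of the aggregation added above. Counts are hypothetical.
    per_response = [(12, 34, 46), (5, 7, 12)]  # (prompt, completion, total) per response

    total_prompt = sum(p for p, _, _ in per_response)
    total_completion = sum(c for _, c, _ in per_response)
    total = sum(t for _, _, t in per_response)

    usage = None
    if total > 0:  # skip the block entirely when no backend reported usage
        usage = {
            "prompt_tokens": total_prompt,          # 17
            "completion_tokens": total_completion,  # 41
            "total_tokens": total,                  # 58
        }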