fix: OpenAI API - together.ai extra usage chunks

This fixes an issue where, with some models (e.g., the Llama 4 models),
together.ai sends a final usage chunk in streaming responses even when
the user did not ask for usage to be included.
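One way to handle this is to filter the stream before forwarding it. The
sketch below is only illustrative of that approach, under assumed helper
and attribute names; it is not necessarily the exact shape of this change:

    # Hedged sketch: forward together.ai's trailing usage-only chunk only
    # if the caller opted in via stream_options={"include_usage": True}.
    # `filter_spurious_usage_chunks` is an illustrative name, not an
    # actual helper in the provider.
    async def filter_spurious_usage_chunks(stream, stream_options):
        include_usage = bool(stream_options and stream_options.get("include_usage"))
        async for chunk in stream:
            # The extra chunk has an empty choices list and carries only
            # usage stats; drop it unless usage was requested.
            if not include_usage and not chunk.choices and getattr(chunk, "usage", None):
                continue
            yield chunk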

With this change, the OpenAI API verification tests now pass 100% when
using Llama Stack as your API server and together.ai as the backend
provider.

As part of this, I also cleaned up the streaming/non-streaming return
types of the `openai_chat_completion` method to keep type checking happy.
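In simplified form, the union the type checker now sees looks roughly like
this (a sketch only; the parameter list is elided and the return annotation
is the point):

    from typing import AsyncIterator, Union

    # Sketch: non-streaming calls return a full completion object, while
    # streaming calls return an async iterator of chunks.
    async def openai_chat_completion(
        self, **params
    ) -> Union["OpenAIChatCompletion", AsyncIterator["OpenAIChatCompletionChunk"]]:
        ...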

Signed-off-by: Ben Browning <bbrownin@redhat.com>
commit c014571258 (parent a4b573d750)
Ben Browning, 2025-04-12 17:27:43 -04:00
12 changed files with 153 additions and 20 deletions


@@ -3096,11 +3096,18 @@
       "post": {
         "responses": {
           "200": {
-            "description": "OK",
+            "description": "Response from an OpenAI-compatible chat completion request. **OR** Chunk from a streaming response to an OpenAI-compatible chat completion request.",
             "content": {
               "application/json": {
                 "schema": {
-                  "$ref": "#/components/schemas/OpenAIChatCompletion"
+                  "oneOf": [
+                    {
+                      "$ref": "#/components/schemas/OpenAIChatCompletion"
+                    },
+                    {
+                      "$ref": "#/components/schemas/OpenAIChatCompletionChunk"
+                    }
+                  ]
                 }
               }
             }
@@ -9506,6 +9513,46 @@
       "title": "OpenAIChatCompletion",
       "description": "Response from an OpenAI-compatible chat completion request."
     },
+    "OpenAIChatCompletionChunk": {
+      "type": "object",
+      "properties": {
+        "id": {
+          "type": "string",
+          "description": "The ID of the chat completion"
+        },
+        "choices": {
+          "type": "array",
+          "items": {
+            "$ref": "#/components/schemas/OpenAIChoice"
+          },
+          "description": "List of choices"
+        },
+        "object": {
+          "type": "string",
+          "const": "chat.completion.chunk",
+          "default": "chat.completion.chunk",
+          "description": "The object type, which will be \"chat.completion.chunk\""
+        },
+        "created": {
+          "type": "integer",
+          "description": "The Unix timestamp in seconds when the chat completion was created"
+        },
+        "model": {
+          "type": "string",
+          "description": "The model that was used to generate the chat completion"
+        }
+      },
+      "additionalProperties": false,
+      "required": [
+        "id",
+        "choices",
+        "object",
+        "created",
+        "model"
+      ],
+      "title": "OpenAIChatCompletionChunk",
+      "description": "Chunk from a streaming response to an OpenAI-compatible chat completion request."
+    },
     "OpenAIChoice": {
       "type": "object",
       "properties": {


@@ -2135,11 +2135,15 @@ paths:
     post:
       responses:
         '200':
-          description: OK
+          description: >-
+            Response from an OpenAI-compatible chat completion request. **OR** Chunk
+            from a streaming response to an OpenAI-compatible chat completion request.
           content:
             application/json:
               schema:
-                $ref: '#/components/schemas/OpenAIChatCompletion'
+                oneOf:
+                  - $ref: '#/components/schemas/OpenAIChatCompletion'
+                  - $ref: '#/components/schemas/OpenAIChatCompletionChunk'
         '400':
           $ref: '#/components/responses/BadRequest400'
         '429':
@@ -6507,6 +6511,41 @@ components:
       title: OpenAIChatCompletion
       description: >-
         Response from an OpenAI-compatible chat completion request.
+    OpenAIChatCompletionChunk:
+      type: object
+      properties:
+        id:
+          type: string
+          description: The ID of the chat completion
+        choices:
+          type: array
+          items:
+            $ref: '#/components/schemas/OpenAIChoice'
+          description: List of choices
+        object:
+          type: string
+          const: chat.completion.chunk
+          default: chat.completion.chunk
+          description: >-
+            The object type, which will be "chat.completion.chunk"
+        created:
+          type: integer
+          description: >-
+            The Unix timestamp in seconds when the chat completion was created
+        model:
+          type: string
+          description: >-
+            The model that was used to generate the chat completion
+      additionalProperties: false
+      required:
+        - id
+        - choices
+        - object
+        - created
+        - model
+      title: OpenAIChatCompletionChunk
+      description: >-
+        Chunk from a streaming response to an OpenAI-compatible chat completion request.
     OpenAIChoice:
       type: object
       properties: