fix: OpenAI API - together.ai extra usage chunks

This fixes an issue where, with some models (i.e., the Llama 4 models), together.ai sends a final usage chunk for streaming responses even if the user didn't ask to include usage.

With this change, the OpenAI API verification tests now pass 100% when using Llama Stack as your API server and together.ai as the backend provider.

As part of this, I also cleaned up the streaming/non-streaming return types of the `openai_chat_completion` method to keep type checking happy.

Signed-off-by: Ben Browning <bbrownin@redhat.com>
2026-01-01 20:50:00 +00:00 · 2025-04-12 17:27:43 -04:00 · 2025-04-12 17:27:43 -04:00 · c014571258
commit c014571258
parent a4b573d750
12 changed files with 153 additions and 20 deletions
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -2135,11 +2135,15 @@ paths:
    post:
      responses:
        '200':
-          description: OK
+          description: >-
+            Response from an OpenAI-compatible chat completion request. **OR** Chunk
+            from a streaming response to an OpenAI-compatible chat completion request.
          content:
            application/json:
              schema:
-                $ref: '#/components/schemas/OpenAIChatCompletion'
+                oneOf:
+                  - $ref: '#/components/schemas/OpenAIChatCompletion'
+                  - $ref: '#/components/schemas/OpenAIChatCompletionChunk'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
@@ -6507,6 +6511,41 @@ components:
      title: OpenAIChatCompletion
      description: >-
        Response from an OpenAI-compatible chat completion request.
+    OpenAIChatCompletionChunk:
+      type: object
+      properties:
+        id:
+          type: string
+          description: The ID of the chat completion
+        choices:
+          type: array
+          items:
+            $ref: '#/components/schemas/OpenAIChoice'
+          description: List of choices
+        object:
+          type: string
+          const: chat.completion.chunk
+          default: chat.completion.chunk
+          description: >-
+            The object type, which will be "chat.completion.chunk"
+        created:
+          type: integer
+          description: >-
+            The Unix timestamp in seconds when the chat completion was created
+        model:
+          type: string
+          description: >-
+            The model that was used to generate the chat completion
+      additionalProperties: false
+      required:
+        - id
+        - choices
+        - object
+        - created
+        - model
+      title: OpenAIChatCompletionChunk
+      description: >-
+        Chunk from a streaming response to an OpenAI-compatible chat completion request.
    OpenAIChoice:
      type: object
      properties: