fix: OpenAI API - together.ai extra usage chunks

This fixes an issue where, with some models (i.e., the Llama 4 models), together.ai sends a final usage chunk for streaming responses even if the user didn't ask to include usage. With this change, the OpenAI API verification tests now pass 100% when using Llama Stack as your API server and together.ai as the backend provider. As part of this, I also cleaned up the streaming/non-streaming return types of the `openai_chat_completion` method to keep type checking happy. Signed-off-by: Ben Browning <bbrownin@redhat.com>
2025-12-31 03:29:59 +00:00 · 2025-04-12 17:27:43 -04:00 · 2025-04-12 17:27:43 -04:00 · c014571258
commit c014571258
parent a4b573d750
12 changed files with 153 additions and 20 deletions
--- a/llama_stack/providers/remote/inference/together/together.py
+++ b/llama_stack/providers/remote/inference/together/together.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union

 from openai import AsyncOpenAI
 from together import AsyncTogether
@@ -33,6 +33,7 @@ from llama_stack.apis.inference import (
 )
 from llama_stack.apis.inference.inference import (
    OpenAIChatCompletion,
+    OpenAIChatCompletionChunk,
    OpenAICompletion,
    OpenAIMessageParam,
    OpenAIResponseFormatParam,
@@ -331,7 +332,7 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
        top_logprobs: Optional[int] = None,
        top_p: Optional[float] = None,
        user: Optional[str] = None,
-    ) -> OpenAIChatCompletion:
+    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
        model_obj = await self.model_store.get_model(model)
        params = await prepare_openai_completion_params(
            model=model_obj.provider_resource_id,
@@ -358,4 +359,26 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
            top_p=top_p,
            user=user,
        )
+        if params.get("stream", True):
+            return self._stream_openai_chat_completion(params)
        return await self._get_openai_client().chat.completions.create(**params)  # type: ignore
+
+    async def _stream_openai_chat_completion(self, params: dict) -> AsyncGenerator:
+        """Stream chat completion chunks, dropping an unrequested trailing usage chunk.
+
+        together.ai sometimes adds usage data to the stream even if
+        ``include_usage`` is False. This shows up as an unexpected final chunk
+        with an empty ``choices`` array, which clients may not handle gracefully.
+
+        :param params: keyword arguments forwarded unchanged to
+            ``chat.completions.create``; ``stream_options.include_usage`` is read
+            to decide whether a usage-only chunk was requested.
+        :returns: an async generator yielding the chunks received from the client.
+        """
+        # ``stream_options`` may be absent or falsy; treat both as "no options".
+        stream_options = params.get("stream_options") or {}
+        include_usage = stream_options.get("include_usage", False)
+        stream = await self._get_openai_client().chat.completions.create(**params)
+
+        seen_finish_reason = False
+        async for chunk in stream:
+            # Final usage chunk with no choices that the user didn't request, so discard.
+            # ``not chunk.choices`` covers both an empty list and a None value.
+            if not include_usage and seen_finish_reason and not chunk.choices:
+                break
+            yield chunk
+            if not seen_finish_reason:
+                seen_finish_reason = any(choice.finish_reason for choice in chunk.choices or [])