fix: OpenAI API - together.ai extra usage chunks

This fixes an issue where, with some models (e.g., the Llama 4 models),
together.ai sends a final usage chunk for streaming responses
even if the user didn't ask to include usage.

With this change, the OpenAI API verification tests now pass 100% when
using Llama Stack as your API server and together.ai as the backend
provider.

As part of this, I also cleaned up the streaming/non-streaming return
types of the `openai_chat_completion` method to keep type checking happy.

Signed-off-by: Ben Browning <bbrownin@redhat.com>
This commit is contained in:
Ben Browning 2025-04-12 17:27:43 -04:00
parent a4b573d750
commit c014571258
12 changed files with 153 additions and 20 deletions

View file

@ -32,6 +32,7 @@ from llama_stack.apis.inference import (
)
from llama_stack.apis.inference.inference import (
OpenAIChatCompletion,
OpenAIChatCompletionChunk,
OpenAICompletion,
OpenAIMessageParam,
OpenAIResponseFormatParam,
@ -324,7 +325,7 @@ class LiteLLMOpenAIMixin(
top_logprobs: Optional[int] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
) -> OpenAIChatCompletion:
) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
model_obj = await self.model_store.get_model(model)
params = await prepare_openai_completion_params(
model=model_obj.provider_resource_id,

View file

@ -8,7 +8,7 @@ import logging
import time
import uuid
import warnings
from typing import Any, AsyncGenerator, Dict, Iterable, List, Optional, Union
from typing import Any, AsyncGenerator, AsyncIterator, Dict, Iterable, List, Optional, Union
from openai import AsyncStream
from openai.types.chat import (
@ -1196,5 +1196,5 @@ class OpenAIChatCompletionUnsupportedMixin:
top_logprobs: Optional[int] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
) -> OpenAIChatCompletion:
) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
raise ValueError(f"{self.__class__.__name__} doesn't support openai chat completion")