Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-12-31 06:43:52 +00:00)
fix: OpenAI API - together.ai extra usage chunks
This fixes an issue where, with some models (i.e., the Llama 4 models), together.ai sends a final usage chunk for streaming responses even if the user didn't ask to include usage. With this change, the OpenAI API verification tests now pass 100% when using Llama Stack as your API server and together.ai as the backend provider. As part of this, I also cleaned up the streaming/non-streaming return types of the `openai_chat_completion` method to keep type checking happy.

Signed-off-by: Ben Browning <bbrownin@redhat.com>
parent a4b573d750
commit c014571258
12 changed files with 153 additions and 20 deletions
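To illustrate the behavior described in the commit message, here is a minimal, hypothetical sketch (the helper name and the chunk shape are assumptions, not the actual code in this commit): when the client did not request usage via `stream_options={"include_usage": True}`, a trailing usage-only chunk from the provider is dropped before it reaches the caller.

```python
import asyncio
from dataclasses import dataclass, field
from typing import Any, AsyncIterator, List, Optional


@dataclass
class FakeChunk:
    """Stand-in for an OpenAI chat completion chunk (hypothetical shape)."""
    choices: List[Any] = field(default_factory=list)
    usage: Optional[dict] = None


async def drop_unrequested_usage_chunk(
    chunks: AsyncIterator[FakeChunk],
    include_usage: bool,
) -> AsyncIterator[FakeChunk]:
    async for chunk in chunks:
        # Some providers (together.ai with certain models) append a final
        # usage-only chunk even when the client never asked for usage; skip
        # it so the stream matches the OpenAI API contract.
        if not include_usage and not chunk.choices and chunk.usage is not None:
            continue
        yield chunk


async def main() -> None:
    async def provider_stream() -> AsyncIterator[FakeChunk]:
        yield FakeChunk(choices=["delta"])
        yield FakeChunk(choices=[], usage={"total_tokens": 42})  # unrequested trailing chunk

    kept = [c async for c in drop_unrequested_usage_chunk(provider_stream(), include_usage=False)]
    print(len(kept))  # -> 1; the trailing usage-only chunk was dropped


asyncio.run(main())
```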
@@ -39,6 +39,7 @@ from llama_stack.apis.inference import (
 )
 from llama_stack.apis.inference.inference import (
     OpenAIChatCompletion,
+    OpenAIChatCompletionChunk,
     OpenAICompletion,
     OpenAIMessageParam,
     OpenAIResponseFormatParam,
@@ -546,7 +547,7 @@ class InferenceRouter(Inference):
         top_logprobs: Optional[int] = None,
         top_p: Optional[float] = None,
         user: Optional[str] = None,
-    ) -> OpenAIChatCompletion:
+    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
         logger.debug(
             f"InferenceRouter.openai_chat_completion: {model=}, {stream=}, {messages=}",
         )
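The return-annotation change above reflects that the same method hands back either a finished response object or an async stream of chunks, depending on `stream`. A minimal sketch with placeholder types (not the actual Llama Stack classes) of why the `Union` is needed to keep type checking happy:

```python
import asyncio
from typing import AsyncIterator, Union


class Completion:
    """Placeholder standing in for OpenAIChatCompletion."""


class CompletionChunk:
    """Placeholder standing in for OpenAIChatCompletionChunk."""


async def openai_chat_completion(
    stream: bool = False,
) -> Union[Completion, AsyncIterator[CompletionChunk]]:
    # One entry point serves both modes, so the annotation must cover both
    # the non-streaming object and the streaming iterator of chunks.
    if stream:
        async def _chunks() -> AsyncIterator[CompletionChunk]:
            yield CompletionChunk()

        return _chunks()
    return Completion()


async def main() -> None:
    # Streaming: callers narrow the Union and iterate the chunks.
    async for chunk in await openai_chat_completion(stream=True):  # type: ignore[union-attr]
        print(type(chunk).__name__)
    # Non-streaming: a single completion object comes back.
    print(type(await openai_chat_completion(stream=False)).__name__)


asyncio.run(main())
```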