mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-12-31 06:23:52 +00:00
fix: OpenAI API - together.ai extra usage chunks
This fixes an issue where, with some models (ie the Llama 4 models), together.ai is sending a final usage chunk for streaming responses even if the user didn't ask to include usage. With this change, the OpenAI API verification tests now pass 100% when using Llama Stack as your API server and together.ai as the backend provider. As part of this, I also cleaned up the streaming/non-streaming return types of the `openai_chat_completion` method to keep type checking happy. Signed-off-by: Ben Browning <bbrownin@redhat.com>
This commit is contained in:
parent
a4b573d750
commit
c014571258
12 changed files with 153 additions and 20 deletions
51
docs/_static/llama-stack-spec.html
vendored
51
docs/_static/llama-stack-spec.html
vendored
|
|
@ -3096,11 +3096,18 @@
|
|||
"post": {
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "OK",
|
||||
"description": "Response from an OpenAI-compatible chat completion request. **OR** Chunk from a streaming response to an OpenAI-compatible chat completion request.",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/OpenAIChatCompletion"
|
||||
"oneOf": [
|
||||
{
|
||||
"$ref": "#/components/schemas/OpenAIChatCompletion"
|
||||
},
|
||||
{
|
||||
"$ref": "#/components/schemas/OpenAIChatCompletionChunk"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -9506,6 +9513,46 @@
|
|||
"title": "OpenAIChatCompletion",
|
||||
"description": "Response from an OpenAI-compatible chat completion request."
|
||||
},
|
||||
"OpenAIChatCompletionChunk": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"id": {
|
||||
"type": "string",
|
||||
"description": "The ID of the chat completion"
|
||||
},
|
||||
"choices": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/OpenAIChoice"
|
||||
},
|
||||
"description": "List of choices"
|
||||
},
|
||||
"object": {
|
||||
"type": "string",
|
||||
"const": "chat.completion.chunk",
|
||||
"default": "chat.completion.chunk",
|
||||
"description": "The object type, which will be \"chat.completion.chunk\""
|
||||
},
|
||||
"created": {
|
||||
"type": "integer",
|
||||
"description": "The Unix timestamp in seconds when the chat completion was created"
|
||||
},
|
||||
"model": {
|
||||
"type": "string",
|
||||
"description": "The model that was used to generate the chat completion"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"id",
|
||||
"choices",
|
||||
"object",
|
||||
"created",
|
||||
"model"
|
||||
],
|
||||
"title": "OpenAIChatCompletionChunk",
|
||||
"description": "Chunk from a streaming response to an OpenAI-compatible chat completion request."
|
||||
},
|
||||
"OpenAIChoice": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue