diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index c85eb549f..54d888441 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -3096,11 +3096,18 @@
"post": {
"responses": {
"200": {
- "description": "OK",
+ "description": "Response from an OpenAI-compatible chat completion request. **OR** Chunk from a streaming response to an OpenAI-compatible chat completion request.",
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/OpenAIChatCompletion"
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/OpenAIChatCompletion"
+ },
+ {
+ "$ref": "#/components/schemas/OpenAIChatCompletionChunk"
+ }
+ ]
}
}
}
@@ -8857,7 +8864,17 @@
"description": "Must be \"assistant\" to identify this as the model's response"
},
"content": {
- "$ref": "#/components/schemas/InterleavedContent",
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam"
+ }
+ }
+ ],
"description": "The content of the model's response"
},
"name": {
@@ -8867,9 +8884,9 @@
"tool_calls": {
"type": "array",
"items": {
- "$ref": "#/components/schemas/ToolCall"
+ "$ref": "#/components/schemas/OpenAIChatCompletionToolCall"
},
- "description": "List of tool calls. Each tool call is a ToolCall object."
+ "description": "List of tool calls. Each tool call is an OpenAIChatCompletionToolCall object."
}
},
"additionalProperties": false,
@@ -8880,6 +8897,98 @@
"title": "OpenAIAssistantMessageParam",
"description": "A message containing the model's (assistant) response in an OpenAI-compatible chat completion request."
},
+ "OpenAIChatCompletionContentPartImageParam": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "image_url",
+ "default": "image_url"
+ },
+ "image_url": {
+ "$ref": "#/components/schemas/OpenAIImageURL"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "image_url"
+ ],
+ "title": "OpenAIChatCompletionContentPartImageParam"
+ },
+ "OpenAIChatCompletionContentPartParam": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/OpenAIChatCompletionContentPartTextParam"
+ },
+ {
+ "$ref": "#/components/schemas/OpenAIChatCompletionContentPartImageParam"
+ }
+ ],
+ "discriminator": {
+ "propertyName": "type",
+ "mapping": {
+ "text": "#/components/schemas/OpenAIChatCompletionContentPartTextParam",
+ "image_url": "#/components/schemas/OpenAIChatCompletionContentPartImageParam"
+ }
+ }
+ },
+ "OpenAIChatCompletionContentPartTextParam": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "text",
+ "default": "text"
+ },
+ "text": {
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "text"
+ ],
+ "title": "OpenAIChatCompletionContentPartTextParam"
+ },
+ "OpenAIChatCompletionToolCall": {
+ "type": "object",
+ "properties": {
+ "index": {
+ "type": "integer"
+ },
+ "id": {
+ "type": "string"
+ },
+ "type": {
+ "type": "string",
+ "const": "function",
+ "default": "function"
+ },
+ "function": {
+ "$ref": "#/components/schemas/OpenAIChatCompletionToolCallFunction"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type"
+ ],
+ "title": "OpenAIChatCompletionToolCall"
+ },
+ "OpenAIChatCompletionToolCallFunction": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "type": "string"
+ },
+ "arguments": {
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "title": "OpenAIChatCompletionToolCallFunction"
+ },
"OpenAIDeveloperMessageParam": {
"type": "object",
"properties": {
@@ -8890,7 +8999,17 @@
"description": "Must be \"developer\" to identify this as a developer message"
},
"content": {
- "$ref": "#/components/schemas/InterleavedContent",
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam"
+ }
+ }
+ ],
"description": "The content of the developer message"
},
"name": {
@@ -8906,6 +9025,66 @@
"title": "OpenAIDeveloperMessageParam",
"description": "A message from the developer in an OpenAI-compatible chat completion request."
},
+ "OpenAIImageURL": {
+ "type": "object",
+ "properties": {
+ "url": {
+ "type": "string"
+ },
+ "detail": {
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "url"
+ ],
+ "title": "OpenAIImageURL"
+ },
+ "OpenAIJSONSchema": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "type": "string"
+ },
+ "description": {
+ "type": "string"
+ },
+ "strict": {
+ "type": "boolean"
+ },
+ "schema": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "name"
+ ],
+ "title": "OpenAIJSONSchema"
+ },
"OpenAIMessageParam": {
"oneOf": [
{
@@ -8935,6 +9114,76 @@
}
}
},
+ "OpenAIResponseFormatJSONObject": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "json_object",
+ "default": "json_object"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type"
+ ],
+ "title": "OpenAIResponseFormatJSONObject"
+ },
+ "OpenAIResponseFormatJSONSchema": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "json_schema",
+ "default": "json_schema"
+ },
+ "json_schema": {
+ "$ref": "#/components/schemas/OpenAIJSONSchema"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "json_schema"
+ ],
+ "title": "OpenAIResponseFormatJSONSchema"
+ },
+ "OpenAIResponseFormatParam": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/OpenAIResponseFormatText"
+ },
+ {
+ "$ref": "#/components/schemas/OpenAIResponseFormatJSONSchema"
+ },
+ {
+ "$ref": "#/components/schemas/OpenAIResponseFormatJSONObject"
+ }
+ ],
+ "discriminator": {
+ "propertyName": "type",
+ "mapping": {
+ "text": "#/components/schemas/OpenAIResponseFormatText",
+ "json_schema": "#/components/schemas/OpenAIResponseFormatJSONSchema",
+ "json_object": "#/components/schemas/OpenAIResponseFormatJSONObject"
+ }
+ }
+ },
+ "OpenAIResponseFormatText": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "text",
+ "default": "text"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type"
+ ],
+ "title": "OpenAIResponseFormatText"
+ },
"OpenAISystemMessageParam": {
"type": "object",
"properties": {
@@ -8945,7 +9194,17 @@
"description": "Must be \"system\" to identify this as a system message"
},
"content": {
- "$ref": "#/components/schemas/InterleavedContent",
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam"
+ }
+ }
+ ],
"description": "The content of the \"system prompt\". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions)."
},
"name": {
@@ -8975,7 +9234,17 @@
"description": "Unique identifier for the tool call this response is for"
},
"content": {
- "$ref": "#/components/schemas/InterleavedContent",
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam"
+ }
+ }
+ ],
"description": "The response content from the tool"
}
},
@@ -8998,7 +9267,17 @@
"description": "Must be \"user\" to identify this as a user message"
},
"content": {
- "$ref": "#/components/schemas/InterleavedContent",
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam"
+ }
+ }
+ ],
"description": "The content of the message, which can include text and other media"
},
"name": {
@@ -9126,10 +9405,7 @@
"description": "(Optional) The penalty for repeated tokens"
},
"response_format": {
- "type": "object",
- "additionalProperties": {
- "type": "string"
- },
+ "$ref": "#/components/schemas/OpenAIResponseFormatParam",
"description": "(Optional) The response format to use"
},
"seed": {
@@ -9306,6 +9582,46 @@
"title": "OpenAIChatCompletion",
"description": "Response from an OpenAI-compatible chat completion request."
},
+ "OpenAIChatCompletionChunk": {
+ "type": "object",
+ "properties": {
+ "id": {
+ "type": "string",
+ "description": "The ID of the chat completion"
+ },
+ "choices": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/OpenAIChunkChoice"
+ },
+ "description": "List of choices"
+ },
+ "object": {
+ "type": "string",
+ "const": "chat.completion.chunk",
+ "default": "chat.completion.chunk",
+ "description": "The object type, which will be \"chat.completion.chunk\""
+ },
+ "created": {
+ "type": "integer",
+ "description": "The Unix timestamp in seconds when the chat completion was created"
+ },
+ "model": {
+ "type": "string",
+ "description": "The model that was used to generate the chat completion"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "id",
+ "choices",
+ "object",
+ "created",
+ "model"
+ ],
+ "title": "OpenAIChatCompletionChunk",
+ "description": "Chunk from a streaming response to an OpenAI-compatible chat completion request."
+ },
"OpenAIChoice": {
"type": "object",
"properties": {
@@ -9318,10 +9634,12 @@
"description": "The reason the model stopped generating"
},
"index": {
- "type": "integer"
+ "type": "integer",
+ "description": "The index of the choice"
},
"logprobs": {
- "$ref": "#/components/schemas/OpenAIChoiceLogprobs"
+ "$ref": "#/components/schemas/OpenAIChoiceLogprobs",
+ "description": "(Optional) The log probabilities for the tokens in the message"
}
},
"additionalProperties": false,
@@ -9333,6 +9651,33 @@
"title": "OpenAIChoice",
"description": "A choice from an OpenAI-compatible chat completion response."
},
+ "OpenAIChoiceDelta": {
+ "type": "object",
+ "properties": {
+ "content": {
+ "type": "string",
+ "description": "(Optional) The content of the delta"
+ },
+ "refusal": {
+ "type": "string",
+ "description": "(Optional) The refusal of the delta"
+ },
+ "role": {
+ "type": "string",
+ "description": "(Optional) The role of the delta"
+ },
+ "tool_calls": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/OpenAIChatCompletionToolCall"
+ },
+ "description": "(Optional) The tool calls of the delta"
+ }
+ },
+ "additionalProperties": false,
+ "title": "OpenAIChoiceDelta",
+ "description": "A delta from an OpenAI-compatible chat completion streaming response."
+ },
"OpenAIChoiceLogprobs": {
"type": "object",
"properties": {
@@ -9340,19 +9685,50 @@
"type": "array",
"items": {
"$ref": "#/components/schemas/OpenAITokenLogProb"
- }
+ },
+ "description": "(Optional) The log probabilities for the tokens in the message"
},
"refusal": {
"type": "array",
"items": {
"$ref": "#/components/schemas/OpenAITokenLogProb"
- }
+ },
+ "description": "(Optional) The log probabilities for the tokens in the message"
}
},
"additionalProperties": false,
"title": "OpenAIChoiceLogprobs",
"description": "The log probabilities for the tokens in the message from an OpenAI-compatible chat completion response."
},
+ "OpenAIChunkChoice": {
+ "type": "object",
+ "properties": {
+ "delta": {
+ "$ref": "#/components/schemas/OpenAIChoiceDelta",
+ "description": "The delta from the chunk"
+ },
+ "finish_reason": {
+ "type": "string",
+ "description": "The reason the model stopped generating"
+ },
+ "index": {
+ "type": "integer",
+ "description": "The index of the choice"
+ },
+ "logprobs": {
+ "$ref": "#/components/schemas/OpenAIChoiceLogprobs",
+ "description": "(Optional) The log probabilities for the tokens in the message"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "delta",
+ "finish_reason",
+ "index"
+ ],
+ "title": "OpenAIChunkChoice",
+ "description": "A chunk choice from an OpenAI-compatible chat completion streaming response."
+ },
"OpenAITokenLogProb": {
"type": "object",
"properties": {
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 6c99c9155..cf657bff9 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -2135,11 +2135,15 @@ paths:
post:
responses:
'200':
- description: OK
+ description: >-
+ Response from an OpenAI-compatible chat completion request. **OR** Chunk
+ from a streaming response to an OpenAI-compatible chat completion request.
content:
application/json:
schema:
- $ref: '#/components/schemas/OpenAIChatCompletion'
+ oneOf:
+ - $ref: '#/components/schemas/OpenAIChatCompletion'
+ - $ref: '#/components/schemas/OpenAIChatCompletionChunk'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
@@ -6073,7 +6077,11 @@ components:
description: >-
Must be "assistant" to identify this as the model's response
content:
- $ref: '#/components/schemas/InterleavedContent'
+ oneOf:
+ - type: string
+ - type: array
+ items:
+ $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
description: The content of the model's response
name:
type: string
@@ -6082,9 +6090,10 @@ components:
tool_calls:
type: array
items:
- $ref: '#/components/schemas/ToolCall'
+ $ref: '#/components/schemas/OpenAIChatCompletionToolCall'
description: >-
- List of tool calls. Each tool call is a ToolCall object.
+ List of tool calls. Each tool call is an OpenAIChatCompletionToolCall
+ object.
additionalProperties: false
required:
- role
@@ -6093,6 +6102,70 @@ components:
description: >-
A message containing the model's (assistant) response in an OpenAI-compatible
chat completion request.
+ "OpenAIChatCompletionContentPartImageParam":
+ type: object
+ properties:
+ type:
+ type: string
+ const: image_url
+ default: image_url
+ image_url:
+ $ref: '#/components/schemas/OpenAIImageURL'
+ additionalProperties: false
+ required:
+ - type
+ - image_url
+ title: >-
+ OpenAIChatCompletionContentPartImageParam
+ OpenAIChatCompletionContentPartParam:
+ oneOf:
+ - $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
+ - $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
+ discriminator:
+ propertyName: type
+ mapping:
+ text: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
+ image_url: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
+ OpenAIChatCompletionContentPartTextParam:
+ type: object
+ properties:
+ type:
+ type: string
+ const: text
+ default: text
+ text:
+ type: string
+ additionalProperties: false
+ required:
+ - type
+ - text
+ title: OpenAIChatCompletionContentPartTextParam
+ OpenAIChatCompletionToolCall:
+ type: object
+ properties:
+ index:
+ type: integer
+ id:
+ type: string
+ type:
+ type: string
+ const: function
+ default: function
+ function:
+ $ref: '#/components/schemas/OpenAIChatCompletionToolCallFunction'
+ additionalProperties: false
+ required:
+ - type
+ title: OpenAIChatCompletionToolCall
+ OpenAIChatCompletionToolCallFunction:
+ type: object
+ properties:
+ name:
+ type: string
+ arguments:
+ type: string
+ additionalProperties: false
+ title: OpenAIChatCompletionToolCallFunction
OpenAIDeveloperMessageParam:
type: object
properties:
@@ -6103,7 +6176,11 @@ components:
description: >-
Must be "developer" to identify this as a developer message
content:
- $ref: '#/components/schemas/InterleavedContent'
+ oneOf:
+ - type: string
+ - type: array
+ items:
+ $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
description: The content of the developer message
name:
type: string
@@ -6116,6 +6193,40 @@ components:
title: OpenAIDeveloperMessageParam
description: >-
A message from the developer in an OpenAI-compatible chat completion request.
+ OpenAIImageURL:
+ type: object
+ properties:
+ url:
+ type: string
+ detail:
+ type: string
+ additionalProperties: false
+ required:
+ - url
+ title: OpenAIImageURL
+ OpenAIJSONSchema:
+ type: object
+ properties:
+ name:
+ type: string
+ description:
+ type: string
+ strict:
+ type: boolean
+ schema:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ additionalProperties: false
+ required:
+ - name
+ title: OpenAIJSONSchema
OpenAIMessageParam:
oneOf:
- $ref: '#/components/schemas/OpenAIUserMessageParam'
@@ -6131,6 +6242,53 @@ components:
assistant: '#/components/schemas/OpenAIAssistantMessageParam'
tool: '#/components/schemas/OpenAIToolMessageParam'
developer: '#/components/schemas/OpenAIDeveloperMessageParam'
+ OpenAIResponseFormatJSONObject:
+ type: object
+ properties:
+ type:
+ type: string
+ const: json_object
+ default: json_object
+ additionalProperties: false
+ required:
+ - type
+ title: OpenAIResponseFormatJSONObject
+ OpenAIResponseFormatJSONSchema:
+ type: object
+ properties:
+ type:
+ type: string
+ const: json_schema
+ default: json_schema
+ json_schema:
+ $ref: '#/components/schemas/OpenAIJSONSchema'
+ additionalProperties: false
+ required:
+ - type
+ - json_schema
+ title: OpenAIResponseFormatJSONSchema
+ OpenAIResponseFormatParam:
+ oneOf:
+ - $ref: '#/components/schemas/OpenAIResponseFormatText'
+ - $ref: '#/components/schemas/OpenAIResponseFormatJSONSchema'
+ - $ref: '#/components/schemas/OpenAIResponseFormatJSONObject'
+ discriminator:
+ propertyName: type
+ mapping:
+ text: '#/components/schemas/OpenAIResponseFormatText'
+ json_schema: '#/components/schemas/OpenAIResponseFormatJSONSchema'
+ json_object: '#/components/schemas/OpenAIResponseFormatJSONObject'
+ OpenAIResponseFormatText:
+ type: object
+ properties:
+ type:
+ type: string
+ const: text
+ default: text
+ additionalProperties: false
+ required:
+ - type
+ title: OpenAIResponseFormatText
OpenAISystemMessageParam:
type: object
properties:
@@ -6141,7 +6299,11 @@ components:
description: >-
Must be "system" to identify this as a system message
content:
- $ref: '#/components/schemas/InterleavedContent'
+ oneOf:
+ - type: string
+ - type: array
+ items:
+ $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
description: >-
The content of the "system prompt". If multiple system messages are provided,
they are concatenated. The underlying Llama Stack code may also add other
@@ -6171,7 +6333,11 @@ components:
description: >-
Unique identifier for the tool call this response is for
content:
- $ref: '#/components/schemas/InterleavedContent'
+ oneOf:
+ - type: string
+ - type: array
+ items:
+ $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
description: The response content from the tool
additionalProperties: false
required:
@@ -6192,7 +6358,11 @@ components:
description: >-
Must be "user" to identify this as a user message
content:
- $ref: '#/components/schemas/InterleavedContent'
+ oneOf:
+ - type: string
+ - type: array
+ items:
+ $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
description: >-
The content of the message, which can include text and other media
name:
@@ -6278,9 +6448,7 @@ components:
description: >-
(Optional) The penalty for repeated tokens
response_format:
- type: object
- additionalProperties:
- type: string
+ $ref: '#/components/schemas/OpenAIResponseFormatParam'
description: (Optional) The response format to use
seed:
type: integer
@@ -6386,6 +6554,41 @@ components:
title: OpenAIChatCompletion
description: >-
Response from an OpenAI-compatible chat completion request.
+ OpenAIChatCompletionChunk:
+ type: object
+ properties:
+ id:
+ type: string
+ description: The ID of the chat completion
+ choices:
+ type: array
+ items:
+ $ref: '#/components/schemas/OpenAIChunkChoice'
+ description: List of choices
+ object:
+ type: string
+ const: chat.completion.chunk
+ default: chat.completion.chunk
+ description: >-
+ The object type, which will be "chat.completion.chunk"
+ created:
+ type: integer
+ description: >-
+ The Unix timestamp in seconds when the chat completion was created
+ model:
+ type: string
+ description: >-
+ The model that was used to generate the chat completion
+ additionalProperties: false
+ required:
+ - id
+ - choices
+ - object
+ - created
+ - model
+ title: OpenAIChatCompletionChunk
+ description: >-
+ Chunk from a streaming response to an OpenAI-compatible chat completion request.
OpenAIChoice:
type: object
properties:
@@ -6397,8 +6600,11 @@ components:
description: The reason the model stopped generating
index:
type: integer
+ description: The index of the choice
logprobs:
$ref: '#/components/schemas/OpenAIChoiceLogprobs'
+ description: >-
+ (Optional) The log probabilities for the tokens in the message
additionalProperties: false
required:
- message
@@ -6407,6 +6613,27 @@ components:
title: OpenAIChoice
description: >-
A choice from an OpenAI-compatible chat completion response.
+ OpenAIChoiceDelta:
+ type: object
+ properties:
+ content:
+ type: string
+ description: (Optional) The content of the delta
+ refusal:
+ type: string
+ description: (Optional) The refusal of the delta
+ role:
+ type: string
+ description: (Optional) The role of the delta
+ tool_calls:
+ type: array
+ items:
+ $ref: '#/components/schemas/OpenAIChatCompletionToolCall'
+ description: (Optional) The tool calls of the delta
+ additionalProperties: false
+ title: OpenAIChoiceDelta
+ description: >-
+ A delta from an OpenAI-compatible chat completion streaming response.
OpenAIChoiceLogprobs:
type: object
properties:
@@ -6414,15 +6641,43 @@ components:
type: array
items:
$ref: '#/components/schemas/OpenAITokenLogProb'
+ description: >-
+ (Optional) The log probabilities for the tokens in the message
refusal:
type: array
items:
$ref: '#/components/schemas/OpenAITokenLogProb'
+ description: >-
+ (Optional) The log probabilities for the tokens in the message
additionalProperties: false
title: OpenAIChoiceLogprobs
description: >-
The log probabilities for the tokens in the message from an OpenAI-compatible
chat completion response.
+ OpenAIChunkChoice:
+ type: object
+ properties:
+ delta:
+ $ref: '#/components/schemas/OpenAIChoiceDelta'
+ description: The delta from the chunk
+ finish_reason:
+ type: string
+ description: The reason the model stopped generating
+ index:
+ type: integer
+ description: The index of the choice
+ logprobs:
+ $ref: '#/components/schemas/OpenAIChoiceLogprobs'
+ description: >-
+ (Optional) The log probabilities for the tokens in the message
+ additionalProperties: false
+ required:
+ - delta
+ - finish_reason
+ - index
+ title: OpenAIChunkChoice
+ description: >-
+ A chunk choice from an OpenAI-compatible chat completion streaming response.
OpenAITokenLogProb:
type: object
properties:
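
Illustrative sketch, not part of the patch: response_format is now one of the typed OpenAIResponseFormat* variants above rather than a plain Dict[str, str]. OpenAIJSONSchema is a TypedDict, so it is constructed like a dict; the schema name and fields below are made-up examples.

from llama_stack.apis.inference.inference import (
    OpenAIJSONSchema,
    OpenAIResponseFormatJSONSchema,
)

# Request structured output that matches a (hypothetical) JSON schema.
response_format = OpenAIResponseFormatJSONSchema(
    json_schema=OpenAIJSONSchema(
        name="weather_report",
        schema={
            "type": "object",
            "properties": {
                "city": {"type": "string"},
                "temp_c": {"type": "number"},
            },
            "required": ["city", "temp_c"],
        },
    ),
)
# Passed through as openai_chat_completion(..., response_format=response_format)
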
diff --git a/docs/source/distributions/self_hosted_distro/groq.md b/docs/source/distributions/self_hosted_distro/groq.md
index 4f5a8a859..b18be1b2f 100644
--- a/docs/source/distributions/self_hosted_distro/groq.md
+++ b/docs/source/distributions/self_hosted_distro/groq.md
@@ -43,7 +43,9 @@ The following models are available by default:
- `groq/llama-3.3-70b-versatile (aliases: meta-llama/Llama-3.3-70B-Instruct)`
- `groq/llama-3.2-3b-preview (aliases: meta-llama/Llama-3.2-3B-Instruct)`
- `groq/llama-4-scout-17b-16e-instruct (aliases: meta-llama/Llama-4-Scout-17B-16E-Instruct)`
+- `groq/meta-llama/llama-4-scout-17b-16e-instruct (aliases: meta-llama/Llama-4-Scout-17B-16E-Instruct)`
- `groq/llama-4-maverick-17b-128e-instruct (aliases: meta-llama/Llama-4-Maverick-17B-128E-Instruct)`
+- `groq/meta-llama/llama-4-maverick-17b-128e-instruct (aliases: meta-llama/Llama-4-Maverick-17B-128E-Instruct)`
### Prerequisite: API Keys
diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index 21753ca23..596efb136 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -18,7 +18,7 @@ from typing import (
)
from pydantic import BaseModel, Field, field_validator
-from typing_extensions import Annotated
+from typing_extensions import Annotated, TypedDict
from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent, InterleavedContentItem
from llama_stack.apis.models import Model
@@ -442,6 +442,37 @@ class EmbeddingsResponse(BaseModel):
embeddings: List[List[float]]
+@json_schema_type
+class OpenAIChatCompletionContentPartTextParam(BaseModel):
+ type: Literal["text"] = "text"
+ text: str
+
+
+@json_schema_type
+class OpenAIImageURL(BaseModel):
+ url: str
+ detail: Optional[str] = None
+
+
+@json_schema_type
+class OpenAIChatCompletionContentPartImageParam(BaseModel):
+ type: Literal["image_url"] = "image_url"
+ image_url: OpenAIImageURL
+
+
+OpenAIChatCompletionContentPartParam = Annotated[
+ Union[
+ OpenAIChatCompletionContentPartTextParam,
+ OpenAIChatCompletionContentPartImageParam,
+ ],
+ Field(discriminator="type"),
+]
+register_schema(OpenAIChatCompletionContentPartParam, name="OpenAIChatCompletionContentPartParam")
+
+
+OpenAIChatCompletionMessageContent = Union[str, List[OpenAIChatCompletionContentPartParam]]
+
+
@json_schema_type
class OpenAIUserMessageParam(BaseModel):
"""A message from the user in an OpenAI-compatible chat completion request.
@@ -452,7 +483,7 @@ class OpenAIUserMessageParam(BaseModel):
"""
role: Literal["user"] = "user"
- content: InterleavedContent
+ content: OpenAIChatCompletionMessageContent
name: Optional[str] = None
@@ -466,10 +497,24 @@ class OpenAISystemMessageParam(BaseModel):
"""
role: Literal["system"] = "system"
- content: InterleavedContent
+ content: OpenAIChatCompletionMessageContent
name: Optional[str] = None
+@json_schema_type
+class OpenAIChatCompletionToolCallFunction(BaseModel):
+ name: Optional[str] = None
+ arguments: Optional[str] = None
+
+
+@json_schema_type
+class OpenAIChatCompletionToolCall(BaseModel):
+ index: Optional[int] = None
+ id: Optional[str] = None
+ type: Literal["function"] = "function"
+ function: Optional[OpenAIChatCompletionToolCallFunction] = None
+
+
@json_schema_type
class OpenAIAssistantMessageParam(BaseModel):
"""A message containing the model's (assistant) response in an OpenAI-compatible chat completion request.
@@ -477,13 +522,13 @@ class OpenAIAssistantMessageParam(BaseModel):
:param role: Must be "assistant" to identify this as the model's response
:param content: The content of the model's response
:param name: (Optional) The name of the assistant message participant.
- :param tool_calls: List of tool calls. Each tool call is a ToolCall object.
+ :param tool_calls: List of tool calls. Each tool call is an OpenAIChatCompletionToolCall object.
"""
role: Literal["assistant"] = "assistant"
- content: InterleavedContent
+ content: OpenAIChatCompletionMessageContent
name: Optional[str] = None
- tool_calls: Optional[List[ToolCall]] = Field(default_factory=list)
+ tool_calls: Optional[List[OpenAIChatCompletionToolCall]] = Field(default_factory=list)
@json_schema_type
@@ -497,7 +542,7 @@ class OpenAIToolMessageParam(BaseModel):
role: Literal["tool"] = "tool"
tool_call_id: str
- content: InterleavedContent
+ content: OpenAIChatCompletionMessageContent
@json_schema_type
@@ -510,7 +555,7 @@ class OpenAIDeveloperMessageParam(BaseModel):
"""
role: Literal["developer"] = "developer"
- content: InterleavedContent
+ content: OpenAIChatCompletionMessageContent
name: Optional[str] = None
@@ -527,6 +572,46 @@ OpenAIMessageParam = Annotated[
register_schema(OpenAIMessageParam, name="OpenAIMessageParam")
+@json_schema_type
+class OpenAIResponseFormatText(BaseModel):
+ type: Literal["text"] = "text"
+
+
+@json_schema_type
+class OpenAIJSONSchema(TypedDict, total=False):
+ name: str
+ description: Optional[str] = None
+ strict: Optional[bool] = None
+
+ # Pydantic BaseModel cannot be used with a schema param, since it already
+    # has one. And we don't want to alias here because then we'd have to handle
+ # that alias when converting to OpenAI params. So, to support schema,
+ # we use a TypedDict.
+ schema: Optional[Dict[str, Any]] = None
+
+
+@json_schema_type
+class OpenAIResponseFormatJSONSchema(BaseModel):
+ type: Literal["json_schema"] = "json_schema"
+ json_schema: OpenAIJSONSchema
+
+
+@json_schema_type
+class OpenAIResponseFormatJSONObject(BaseModel):
+ type: Literal["json_object"] = "json_object"
+
+
+OpenAIResponseFormatParam = Annotated[
+ Union[
+ OpenAIResponseFormatText,
+ OpenAIResponseFormatJSONSchema,
+ OpenAIResponseFormatJSONObject,
+ ],
+ Field(discriminator="type"),
+]
+register_schema(OpenAIResponseFormatParam, name="OpenAIResponseFormatParam")
+
+
@json_schema_type
class OpenAITopLogProb(BaseModel):
"""The top log probability for a token from an OpenAI-compatible chat completion response.
@@ -561,22 +646,54 @@ class OpenAITokenLogProb(BaseModel):
class OpenAIChoiceLogprobs(BaseModel):
"""The log probabilities for the tokens in the message from an OpenAI-compatible chat completion response.
- :content: (Optional) The log probabilities for the tokens in the message
- :refusal: (Optional) The log probabilities for the tokens in the message
+ :param content: (Optional) The log probabilities for the tokens in the message
+ :param refusal: (Optional) The log probabilities for the tokens in the message
"""
content: Optional[List[OpenAITokenLogProb]] = None
refusal: Optional[List[OpenAITokenLogProb]] = None
+@json_schema_type
+class OpenAIChoiceDelta(BaseModel):
+ """A delta from an OpenAI-compatible chat completion streaming response.
+
+ :param content: (Optional) The content of the delta
+ :param refusal: (Optional) The refusal of the delta
+ :param role: (Optional) The role of the delta
+ :param tool_calls: (Optional) The tool calls of the delta
+ """
+
+ content: Optional[str] = None
+ refusal: Optional[str] = None
+ role: Optional[str] = None
+ tool_calls: Optional[List[OpenAIChatCompletionToolCall]] = None
+
+
+@json_schema_type
+class OpenAIChunkChoice(BaseModel):
+ """A chunk choice from an OpenAI-compatible chat completion streaming response.
+
+ :param delta: The delta from the chunk
+ :param finish_reason: The reason the model stopped generating
+ :param index: The index of the choice
+ :param logprobs: (Optional) The log probabilities for the tokens in the message
+ """
+
+ delta: OpenAIChoiceDelta
+ finish_reason: str
+ index: int
+ logprobs: Optional[OpenAIChoiceLogprobs] = None
+
+
@json_schema_type
class OpenAIChoice(BaseModel):
"""A choice from an OpenAI-compatible chat completion response.
:param message: The message from the model
:param finish_reason: The reason the model stopped generating
- :index: The index of the choice
- :logprobs: (Optional) The log probabilities for the tokens in the message
+ :param index: The index of the choice
+ :param logprobs: (Optional) The log probabilities for the tokens in the message
"""
message: OpenAIMessageParam
@@ -603,6 +720,24 @@ class OpenAIChatCompletion(BaseModel):
model: str
+@json_schema_type
+class OpenAIChatCompletionChunk(BaseModel):
+ """Chunk from a streaming response to an OpenAI-compatible chat completion request.
+
+ :param id: The ID of the chat completion
+ :param choices: List of choices
+ :param object: The object type, which will be "chat.completion.chunk"
+ :param created: The Unix timestamp in seconds when the chat completion was created
+ :param model: The model that was used to generate the chat completion
+ """
+
+ id: str
+ choices: List[OpenAIChunkChoice]
+ object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
+ created: int
+ model: str
+
+
@json_schema_type
class OpenAICompletionLogprobs(BaseModel):
"""The log probabilities for the tokens in the message from an OpenAI-compatible completion response.
@@ -872,7 +1007,7 @@ class Inference(Protocol):
n: Optional[int] = None,
parallel_tool_calls: Optional[bool] = None,
presence_penalty: Optional[float] = None,
- response_format: Optional[Dict[str, str]] = None,
+ response_format: Optional[OpenAIResponseFormatParam] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
@@ -883,7 +1018,7 @@ class Inference(Protocol):
top_logprobs: Optional[int] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
- ) -> OpenAIChatCompletion:
+ ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
"""Generate an OpenAI-compatible chat completion for the given messages using the specified model.
:param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
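
Illustrative sketch, not part of the patch: message content can now be a plain string or a list of the new content-part types. The image URL and detail value below are placeholders.

from llama_stack.apis.inference.inference import (
    OpenAIChatCompletionContentPartImageParam,
    OpenAIChatCompletionContentPartTextParam,
    OpenAIImageURL,
    OpenAIUserMessageParam,
)

# Plain-string content keeps working as before.
simple = OpenAIUserMessageParam(content="Describe the Llama Stack project.")

# Multi-part content mixes text and image parts.
multimodal = OpenAIUserMessageParam(
    content=[
        OpenAIChatCompletionContentPartTextParam(text="What is in this image?"),
        OpenAIChatCompletionContentPartImageParam(
            image_url=OpenAIImageURL(url="https://example.com/cat.png", detail="low"),
        ),
    ],
)
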
diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py
index cdf91e052..17aecdaf8 100644
--- a/llama_stack/distribution/routers/routers.py
+++ b/llama_stack/distribution/routers/routers.py
@@ -38,7 +38,13 @@ from llama_stack.apis.inference import (
ToolDefinition,
ToolPromptFormat,
)
-from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
+from llama_stack.apis.inference.inference import (
+ OpenAIChatCompletion,
+ OpenAIChatCompletionChunk,
+ OpenAICompletion,
+ OpenAIMessageParam,
+ OpenAIResponseFormatParam,
+)
from llama_stack.apis.models import Model, ModelType
from llama_stack.apis.safety import RunShieldResponse, Safety
from llama_stack.apis.scoring import (
@@ -531,7 +537,7 @@ class InferenceRouter(Inference):
n: Optional[int] = None,
parallel_tool_calls: Optional[bool] = None,
presence_penalty: Optional[float] = None,
- response_format: Optional[Dict[str, str]] = None,
+ response_format: Optional[OpenAIResponseFormatParam] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
@@ -542,7 +548,7 @@ class InferenceRouter(Inference):
top_logprobs: Optional[int] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
- ) -> OpenAIChatCompletion:
+ ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
logger.debug(
f"InferenceRouter.openai_chat_completion: {model=}, {stream=}, {messages=}",
)
diff --git a/llama_stack/models/llama/llama3/tool_utils.py b/llama_stack/models/llama/llama3/tool_utils.py
index ef39ba0a5..91b46ec98 100644
--- a/llama_stack/models/llama/llama3/tool_utils.py
+++ b/llama_stack/models/llama/llama3/tool_utils.py
@@ -204,7 +204,9 @@ class ToolUtils:
return None
elif is_json(message_body):
response = json.loads(message_body)
- if ("type" in response and response["type"] == "function") or ("name" in response):
+ if ("type" in response and response["type"] == "function") or (
+ "name" in response and "parameters" in response
+ ):
function_name = response["name"]
args = response["parameters"]
return function_name, args
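
Illustrative sketch, not part of the patch: the stricter check above keeps ordinary JSON answers that merely contain a "name" key from being misread as tool calls; only objects with an explicit "type": "function" or with both "name" and "parameters" are parsed. A standalone mirror of the condition:

import json

def looks_like_tool_call(message_body: str) -> bool:
    # Mirrors the condition in ToolUtils above, for illustration only.
    response = json.loads(message_body)
    return ("type" in response and response["type"] == "function") or (
        "name" in response and "parameters" in response
    )

assert looks_like_tool_call('{"name": "get_weather", "parameters": {"city": "Paris"}}')
assert not looks_like_tool_call('{"name": "Alice", "age": 30}')  # plain JSON answer
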
diff --git a/llama_stack/providers/inline/inference/meta_reference/inference.py b/llama_stack/providers/inline/inference/meta_reference/inference.py
index 0b56ba1f7..2b9a27982 100644
--- a/llama_stack/providers/inline/inference/meta_reference/inference.py
+++ b/llama_stack/providers/inline/inference/meta_reference/inference.py
@@ -59,8 +59,8 @@ from llama_stack.providers.utils.inference.model_registry import (
build_hf_repo_model_entry,
)
from llama_stack.providers.utils.inference.openai_compat import (
- OpenAIChatCompletionUnsupportedMixin,
- OpenAICompletionUnsupportedMixin,
+ OpenAIChatCompletionToLlamaStackMixin,
+ OpenAICompletionToLlamaStackMixin,
)
from llama_stack.providers.utils.inference.prompt_adapter import (
augment_content_with_response_format_prompt,
@@ -83,8 +83,8 @@ def llama_builder_fn(config: MetaReferenceInferenceConfig, model_id: str, llama_
class MetaReferenceInferenceImpl(
- OpenAICompletionUnsupportedMixin,
- OpenAIChatCompletionUnsupportedMixin,
+ OpenAICompletionToLlamaStackMixin,
+ OpenAIChatCompletionToLlamaStackMixin,
SentenceTransformerEmbeddingMixin,
Inference,
ModelsProtocolPrivate,
diff --git a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
index 5bc20e3c2..d717d055f 100644
--- a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
+++ b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
@@ -25,8 +25,8 @@ from llama_stack.providers.utils.inference.embedding_mixin import (
SentenceTransformerEmbeddingMixin,
)
from llama_stack.providers.utils.inference.openai_compat import (
- OpenAIChatCompletionUnsupportedMixin,
- OpenAICompletionUnsupportedMixin,
+ OpenAIChatCompletionToLlamaStackMixin,
+ OpenAICompletionToLlamaStackMixin,
)
from .config import SentenceTransformersInferenceConfig
@@ -35,8 +35,8 @@ log = logging.getLogger(__name__)
class SentenceTransformersInferenceImpl(
- OpenAIChatCompletionUnsupportedMixin,
- OpenAICompletionUnsupportedMixin,
+ OpenAIChatCompletionToLlamaStackMixin,
+ OpenAICompletionToLlamaStackMixin,
SentenceTransformerEmbeddingMixin,
Inference,
ModelsProtocolPrivate,
diff --git a/llama_stack/providers/inline/inference/vllm/vllm.py b/llama_stack/providers/inline/inference/vllm/vllm.py
index 085c79d6b..9d742c39c 100644
--- a/llama_stack/providers/inline/inference/vllm/vllm.py
+++ b/llama_stack/providers/inline/inference/vllm/vllm.py
@@ -66,10 +66,10 @@ from llama_stack.providers.utils.inference.model_registry import (
ModelsProtocolPrivate,
)
from llama_stack.providers.utils.inference.openai_compat import (
- OpenAIChatCompletionUnsupportedMixin,
+ OpenAIChatCompletionToLlamaStackMixin,
OpenAICompatCompletionChoice,
OpenAICompatCompletionResponse,
- OpenAICompletionUnsupportedMixin,
+ OpenAICompletionToLlamaStackMixin,
get_stop_reason,
process_chat_completion_stream_response,
)
@@ -176,8 +176,8 @@ def _convert_sampling_params(
class VLLMInferenceImpl(
Inference,
- OpenAIChatCompletionUnsupportedMixin,
- OpenAICompletionUnsupportedMixin,
+ OpenAIChatCompletionToLlamaStackMixin,
+ OpenAICompletionToLlamaStackMixin,
ModelsProtocolPrivate,
):
"""
diff --git a/llama_stack/providers/remote/inference/bedrock/bedrock.py b/llama_stack/providers/remote/inference/bedrock/bedrock.py
index 0a485da8f..f8dbcf31a 100644
--- a/llama_stack/providers/remote/inference/bedrock/bedrock.py
+++ b/llama_stack/providers/remote/inference/bedrock/bedrock.py
@@ -36,10 +36,10 @@ from llama_stack.providers.utils.inference.model_registry import (
ModelRegistryHelper,
)
from llama_stack.providers.utils.inference.openai_compat import (
- OpenAIChatCompletionUnsupportedMixin,
+ OpenAIChatCompletionToLlamaStackMixin,
OpenAICompatCompletionChoice,
OpenAICompatCompletionResponse,
- OpenAICompletionUnsupportedMixin,
+ OpenAICompletionToLlamaStackMixin,
get_sampling_strategy_options,
process_chat_completion_response,
process_chat_completion_stream_response,
@@ -56,8 +56,8 @@ from .models import MODEL_ENTRIES
class BedrockInferenceAdapter(
ModelRegistryHelper,
Inference,
- OpenAIChatCompletionUnsupportedMixin,
- OpenAICompletionUnsupportedMixin,
+ OpenAIChatCompletionToLlamaStackMixin,
+ OpenAICompletionToLlamaStackMixin,
):
def __init__(self, config: BedrockConfig) -> None:
ModelRegistryHelper.__init__(self, MODEL_ENTRIES)
diff --git a/llama_stack/providers/remote/inference/cerebras/cerebras.py b/llama_stack/providers/remote/inference/cerebras/cerebras.py
index 5e0a5b484..3156601be 100644
--- a/llama_stack/providers/remote/inference/cerebras/cerebras.py
+++ b/llama_stack/providers/remote/inference/cerebras/cerebras.py
@@ -34,8 +34,8 @@ from llama_stack.providers.utils.inference.model_registry import (
ModelRegistryHelper,
)
from llama_stack.providers.utils.inference.openai_compat import (
- OpenAIChatCompletionUnsupportedMixin,
- OpenAICompletionUnsupportedMixin,
+ OpenAIChatCompletionToLlamaStackMixin,
+ OpenAICompletionToLlamaStackMixin,
get_sampling_options,
process_chat_completion_response,
process_chat_completion_stream_response,
@@ -54,8 +54,8 @@ from .models import MODEL_ENTRIES
class CerebrasInferenceAdapter(
ModelRegistryHelper,
Inference,
- OpenAIChatCompletionUnsupportedMixin,
- OpenAICompletionUnsupportedMixin,
+ OpenAIChatCompletionToLlamaStackMixin,
+ OpenAICompletionToLlamaStackMixin,
):
def __init__(self, config: CerebrasImplConfig) -> None:
ModelRegistryHelper.__init__(
diff --git a/llama_stack/providers/remote/inference/databricks/databricks.py b/llama_stack/providers/remote/inference/databricks/databricks.py
index a10878b27..27d96eb7d 100644
--- a/llama_stack/providers/remote/inference/databricks/databricks.py
+++ b/llama_stack/providers/remote/inference/databricks/databricks.py
@@ -34,8 +34,8 @@ from llama_stack.providers.utils.inference.model_registry import (
build_hf_repo_model_entry,
)
from llama_stack.providers.utils.inference.openai_compat import (
- OpenAIChatCompletionUnsupportedMixin,
- OpenAICompletionUnsupportedMixin,
+ OpenAIChatCompletionToLlamaStackMixin,
+ OpenAICompletionToLlamaStackMixin,
get_sampling_options,
process_chat_completion_response,
process_chat_completion_stream_response,
@@ -61,8 +61,8 @@ model_entries = [
class DatabricksInferenceAdapter(
ModelRegistryHelper,
Inference,
- OpenAIChatCompletionUnsupportedMixin,
- OpenAICompletionUnsupportedMixin,
+ OpenAIChatCompletionToLlamaStackMixin,
+ OpenAICompletionToLlamaStackMixin,
):
def __init__(self, config: DatabricksImplConfig) -> None:
ModelRegistryHelper.__init__(self, model_entries=model_entries)
diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py
index b59e9f2cb..48c163c87 100644
--- a/llama_stack/providers/remote/inference/fireworks/fireworks.py
+++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py
@@ -4,7 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
-from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union
from fireworks.client import Fireworks
from openai import AsyncOpenAI
@@ -32,13 +32,20 @@ from llama_stack.apis.inference import (
ToolDefinition,
ToolPromptFormat,
)
-from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
+from llama_stack.apis.inference.inference import (
+ OpenAIChatCompletion,
+ OpenAIChatCompletionChunk,
+ OpenAICompletion,
+ OpenAIMessageParam,
+ OpenAIResponseFormatParam,
+)
from llama_stack.distribution.request_headers import NeedsRequestProviderData
from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.model_registry import (
ModelRegistryHelper,
)
from llama_stack.providers.utils.inference.openai_compat import (
+ OpenAIChatCompletionToLlamaStackMixin,
convert_message_to_openai_dict,
get_sampling_options,
prepare_openai_completion_params,
@@ -301,6 +308,11 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
prompt_logprobs: Optional[int] = None,
) -> OpenAICompletion:
model_obj = await self.model_store.get_model(model)
+
+    # Fireworks always prepends a BOS token, so strip it from the prompt
+ if isinstance(prompt, str) and prompt.startswith("<|begin_of_text|>"):
+ prompt = prompt[len("<|begin_of_text|>") :]
+
params = await prepare_openai_completion_params(
model=model_obj.provider_resource_id,
prompt=prompt,
@@ -320,6 +332,7 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
top_p=top_p,
user=user,
)
+
return await self._get_openai_client().completions.create(**params)
async def openai_chat_completion(
@@ -336,7 +349,7 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
n: Optional[int] = None,
parallel_tool_calls: Optional[bool] = None,
presence_penalty: Optional[float] = None,
- response_format: Optional[Dict[str, str]] = None,
+ response_format: Optional[OpenAIResponseFormatParam] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
@@ -347,10 +360,9 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
top_logprobs: Optional[int] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
- ) -> OpenAIChatCompletion:
+ ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
model_obj = await self.model_store.get_model(model)
params = await prepare_openai_completion_params(
- model=model_obj.provider_resource_id,
messages=messages,
frequency_penalty=frequency_penalty,
function_call=function_call,
@@ -374,4 +386,12 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
top_p=top_p,
user=user,
)
- return await self._get_openai_client().chat.completions.create(**params)
+
+    # Divert Llama models through the Llama Stack inference APIs because
+    # the Fireworks OpenAI-compatible chat completions API does not support
+    # tool calls properly.
+ llama_model = self.get_llama_model(model_obj.provider_resource_id)
+ if llama_model:
+ return await OpenAIChatCompletionToLlamaStackMixin.openai_chat_completion(self, model=model, **params)
+
+ return await self._get_openai_client().chat.completions.create(model=model_obj.provider_resource_id, **params)
diff --git a/llama_stack/providers/remote/inference/groq/groq.py b/llama_stack/providers/remote/inference/groq/groq.py
index c8789434f..f3f14e9af 100644
--- a/llama_stack/providers/remote/inference/groq/groq.py
+++ b/llama_stack/providers/remote/inference/groq/groq.py
@@ -4,8 +4,24 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
+from typing import Any, AsyncIterator, Dict, List, Optional, Union
+
+from openai import AsyncOpenAI
+
+from llama_stack.apis.inference.inference import (
+ OpenAIChatCompletion,
+ OpenAIChatCompletionChunk,
+ OpenAIChoiceDelta,
+ OpenAIChunkChoice,
+ OpenAIMessageParam,
+ OpenAIResponseFormatParam,
+ OpenAISystemMessageParam,
+)
from llama_stack.providers.remote.inference.groq.config import GroqConfig
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
+from llama_stack.providers.utils.inference.openai_compat import (
+ prepare_openai_completion_params,
+)
from .models import MODEL_ENTRIES
@@ -21,9 +37,129 @@ class GroqInferenceAdapter(LiteLLMOpenAIMixin):
provider_data_api_key_field="groq_api_key",
)
self.config = config
+ self._openai_client = None
async def initialize(self):
await super().initialize()
async def shutdown(self):
await super().shutdown()
+ if self._openai_client:
+ await self._openai_client.close()
+ self._openai_client = None
+
+ def _get_openai_client(self) -> AsyncOpenAI:
+ if not self._openai_client:
+ self._openai_client = AsyncOpenAI(
+ base_url=f"{self.config.url}/openai/v1",
+ api_key=self.config.api_key,
+ )
+ return self._openai_client
+
+ async def openai_chat_completion(
+ self,
+ model: str,
+ messages: List[OpenAIMessageParam],
+ frequency_penalty: Optional[float] = None,
+ function_call: Optional[Union[str, Dict[str, Any]]] = None,
+ functions: Optional[List[Dict[str, Any]]] = None,
+ logit_bias: Optional[Dict[str, float]] = None,
+ logprobs: Optional[bool] = None,
+ max_completion_tokens: Optional[int] = None,
+ max_tokens: Optional[int] = None,
+ n: Optional[int] = None,
+ parallel_tool_calls: Optional[bool] = None,
+ presence_penalty: Optional[float] = None,
+ response_format: Optional[OpenAIResponseFormatParam] = None,
+ seed: Optional[int] = None,
+ stop: Optional[Union[str, List[str]]] = None,
+ stream: Optional[bool] = None,
+ stream_options: Optional[Dict[str, Any]] = None,
+ temperature: Optional[float] = None,
+ tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
+ tools: Optional[List[Dict[str, Any]]] = None,
+ top_logprobs: Optional[int] = None,
+ top_p: Optional[float] = None,
+ user: Optional[str] = None,
+ ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+ model_obj = await self.model_store.get_model(model)
+
+ # Groq does not support json_schema response format, so we need to convert it to json_object
+ if response_format and response_format.type == "json_schema":
+ response_format.type = "json_object"
+ schema = response_format.json_schema.get("schema", {})
+ response_format.json_schema = None
+ json_instructions = f"\nYour response should be a JSON object that matches the following schema: {schema}"
+ if messages and messages[0].role == "system":
+ messages[0].content = messages[0].content + json_instructions
+ else:
+ messages.insert(0, OpenAISystemMessageParam(content=json_instructions))
+
+ # Groq returns a 400 error if tools are provided but none are called
+ # So, set tool_choice to "required" to attempt to force a call
+ if tools and (not tool_choice or tool_choice == "auto"):
+ tool_choice = "required"
+
+ params = await prepare_openai_completion_params(
+ model=model_obj.provider_resource_id.replace("groq/", ""),
+ messages=messages,
+ frequency_penalty=frequency_penalty,
+ function_call=function_call,
+ functions=functions,
+ logit_bias=logit_bias,
+ logprobs=logprobs,
+ max_completion_tokens=max_completion_tokens,
+ max_tokens=max_tokens,
+ n=n,
+ parallel_tool_calls=parallel_tool_calls,
+ presence_penalty=presence_penalty,
+ response_format=response_format,
+ seed=seed,
+ stop=stop,
+ stream=stream,
+ stream_options=stream_options,
+ temperature=temperature,
+ tool_choice=tool_choice,
+ tools=tools,
+ top_logprobs=top_logprobs,
+ top_p=top_p,
+ user=user,
+ )
+
+ # Groq does not support streaming requests that set response_format
+ fake_stream = False
+ if stream and response_format:
+ params["stream"] = False
+ fake_stream = True
+
+ response = await self._get_openai_client().chat.completions.create(**params)
+
+ if fake_stream:
+ chunk_choices = []
+ for choice in response.choices:
+ delta = OpenAIChoiceDelta(
+ content=choice.message.content,
+ role=choice.message.role,
+ tool_calls=choice.message.tool_calls,
+ )
+ chunk_choice = OpenAIChunkChoice(
+ delta=delta,
+ finish_reason=choice.finish_reason,
+ index=choice.index,
+ logprobs=None,
+ )
+ chunk_choices.append(chunk_choice)
+ chunk = OpenAIChatCompletionChunk(
+ id=response.id,
+ choices=chunk_choices,
+ object="chat.completion.chunk",
+ created=response.created,
+ model=response.model,
+ )
+
+ async def _fake_stream_generator():
+ yield chunk
+
+ return _fake_stream_generator()
+ else:
+ return response
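
Illustrative sketch, not part of the patch: because Groq cannot stream when response_format is set, a streaming request that asks for JSON comes back as a single synthesized chunk. The model id is one of the Groq distro entries; the schema and helper name are placeholders, and `adapter` is assumed to be a configured GroqInferenceAdapter.

from llama_stack.apis.inference.inference import (
    OpenAIJSONSchema,
    OpenAIResponseFormatJSONSchema,
    OpenAIUserMessageParam,
)

async def collect_json(adapter) -> str:
    stream = await adapter.openai_chat_completion(
        model="groq/llama-3.3-70b-versatile",
        messages=[OpenAIUserMessageParam(content="Report the weather in Paris as JSON.")],
        response_format=OpenAIResponseFormatJSONSchema(
            json_schema=OpenAIJSONSchema(name="weather", schema={"type": "object"}),
        ),
        stream=True,
    )
    parts = []
    # The fake-stream path yields exactly one OpenAIChatCompletionChunk.
    async for chunk in stream:
        if chunk.choices[0].delta.content:
            parts.append(chunk.choices[0].delta.content)
    return "".join(parts)
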
diff --git a/llama_stack/providers/remote/inference/groq/models.py b/llama_stack/providers/remote/inference/groq/models.py
index d0c10ca62..0b4b81cfe 100644
--- a/llama_stack/providers/remote/inference/groq/models.py
+++ b/llama_stack/providers/remote/inference/groq/models.py
@@ -39,8 +39,16 @@ MODEL_ENTRIES = [
"groq/llama-4-scout-17b-16e-instruct",
CoreModelId.llama4_scout_17b_16e_instruct.value,
),
+ build_hf_repo_model_entry(
+ "groq/meta-llama/llama-4-scout-17b-16e-instruct",
+ CoreModelId.llama4_scout_17b_16e_instruct.value,
+ ),
build_hf_repo_model_entry(
"groq/llama-4-maverick-17b-128e-instruct",
CoreModelId.llama4_maverick_17b_128e_instruct.value,
),
+ build_hf_repo_model_entry(
+ "groq/meta-llama/llama-4-maverick-17b-128e-instruct",
+ CoreModelId.llama4_maverick_17b_128e_instruct.value,
+ ),
]
diff --git a/llama_stack/providers/remote/inference/nvidia/nvidia.py b/llama_stack/providers/remote/inference/nvidia/nvidia.py
index d6f717719..15f0e72a1 100644
--- a/llama_stack/providers/remote/inference/nvidia/nvidia.py
+++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py
@@ -35,7 +35,13 @@ from llama_stack.apis.inference import (
ToolConfig,
ToolDefinition,
)
-from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
+from llama_stack.apis.inference.inference import (
+ OpenAIChatCompletion,
+ OpenAIChatCompletionChunk,
+ OpenAICompletion,
+ OpenAIMessageParam,
+ OpenAIResponseFormatParam,
+)
from llama_stack.models.llama.datatypes import ToolPromptFormat
from llama_stack.providers.utils.inference.model_registry import (
ModelRegistryHelper,
@@ -329,7 +335,7 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
n: Optional[int] = None,
parallel_tool_calls: Optional[bool] = None,
presence_penalty: Optional[float] = None,
- response_format: Optional[Dict[str, str]] = None,
+ response_format: Optional[OpenAIResponseFormatParam] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
@@ -340,7 +346,7 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
top_logprobs: Optional[int] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
- ) -> OpenAIChatCompletion:
+ ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
provider_model_id = self.get_provider_model_id(model)
params = await prepare_openai_completion_params(
diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py
index f84863385..804d7eab2 100644
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@@ -5,7 +5,7 @@
# the root directory of this source tree.
-from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union
import httpx
from ollama import AsyncClient
@@ -39,7 +39,13 @@ from llama_stack.apis.inference import (
ToolDefinition,
ToolPromptFormat,
)
-from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
+from llama_stack.apis.inference.inference import (
+ OpenAIChatCompletion,
+ OpenAIChatCompletionChunk,
+ OpenAICompletion,
+ OpenAIMessageParam,
+ OpenAIResponseFormatParam,
+)
from llama_stack.apis.models import Model, ModelType
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import (
@@ -408,7 +414,7 @@ class OllamaInferenceAdapter(
n: Optional[int] = None,
parallel_tool_calls: Optional[bool] = None,
presence_penalty: Optional[float] = None,
- response_format: Optional[Dict[str, str]] = None,
+ response_format: Optional[OpenAIResponseFormatParam] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
@@ -419,7 +425,7 @@ class OllamaInferenceAdapter(
top_logprobs: Optional[int] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
- ) -> OpenAIChatCompletion:
+ ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
model_obj = await self._get_model(model)
params = {
k: v
diff --git a/llama_stack/providers/remote/inference/passthrough/passthrough.py b/llama_stack/providers/remote/inference/passthrough/passthrough.py
index 0eb38c395..af05320b0 100644
--- a/llama_stack/providers/remote/inference/passthrough/passthrough.py
+++ b/llama_stack/providers/remote/inference/passthrough/passthrough.py
@@ -4,7 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
-from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union
from llama_stack_client import AsyncLlamaStackClient
@@ -26,7 +26,13 @@ from llama_stack.apis.inference import (
ToolDefinition,
ToolPromptFormat,
)
-from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
+from llama_stack.apis.inference.inference import (
+ OpenAIChatCompletion,
+ OpenAIChatCompletionChunk,
+ OpenAICompletion,
+ OpenAIMessageParam,
+ OpenAIResponseFormatParam,
+)
from llama_stack.apis.models import Model
from llama_stack.distribution.library_client import convert_pydantic_to_json_value, convert_to_pydantic
from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
@@ -266,7 +272,7 @@ class PassthroughInferenceAdapter(Inference):
n: Optional[int] = None,
parallel_tool_calls: Optional[bool] = None,
presence_penalty: Optional[float] = None,
- response_format: Optional[Dict[str, str]] = None,
+ response_format: Optional[OpenAIResponseFormatParam] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
@@ -277,7 +283,7 @@ class PassthroughInferenceAdapter(Inference):
top_logprobs: Optional[int] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
- ) -> OpenAIChatCompletion:
+ ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
client = self._get_client()
model_obj = await self.model_store.get_model(model)
diff --git a/llama_stack/providers/remote/inference/runpod/runpod.py b/llama_stack/providers/remote/inference/runpod/runpod.py
index 878460122..72cbead9b 100644
--- a/llama_stack/providers/remote/inference/runpod/runpod.py
+++ b/llama_stack/providers/remote/inference/runpod/runpod.py
@@ -12,8 +12,8 @@ from llama_stack.apis.inference import * # noqa: F403
# from llama_stack.providers.datatypes import ModelsProtocolPrivate
from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
from llama_stack.providers.utils.inference.openai_compat import (
- OpenAIChatCompletionUnsupportedMixin,
- OpenAICompletionUnsupportedMixin,
+ OpenAIChatCompletionToLlamaStackMixin,
+ OpenAICompletionToLlamaStackMixin,
get_sampling_options,
process_chat_completion_response,
process_chat_completion_stream_response,
@@ -43,8 +43,8 @@ RUNPOD_SUPPORTED_MODELS = {
class RunpodInferenceAdapter(
ModelRegistryHelper,
Inference,
- OpenAIChatCompletionUnsupportedMixin,
- OpenAICompletionUnsupportedMixin,
+ OpenAIChatCompletionToLlamaStackMixin,
+ OpenAICompletionToLlamaStackMixin,
):
def __init__(self, config: RunpodImplConfig) -> None:
ModelRegistryHelper.__init__(self, stack_to_provider_models_map=RUNPOD_SUPPORTED_MODELS)
diff --git a/llama_stack/providers/remote/inference/sambanova/sambanova.py b/llama_stack/providers/remote/inference/sambanova/sambanova.py
index c503657eb..1665e72b8 100644
--- a/llama_stack/providers/remote/inference/sambanova/sambanova.py
+++ b/llama_stack/providers/remote/inference/sambanova/sambanova.py
@@ -42,8 +42,8 @@ from llama_stack.apis.inference import (
)
from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
from llama_stack.providers.utils.inference.openai_compat import (
- OpenAIChatCompletionUnsupportedMixin,
- OpenAICompletionUnsupportedMixin,
+ OpenAIChatCompletionToLlamaStackMixin,
+ OpenAICompletionToLlamaStackMixin,
process_chat_completion_stream_response,
)
from llama_stack.providers.utils.inference.prompt_adapter import (
@@ -57,8 +57,8 @@ from .models import MODEL_ENTRIES
class SambaNovaInferenceAdapter(
ModelRegistryHelper,
Inference,
- OpenAIChatCompletionUnsupportedMixin,
- OpenAICompletionUnsupportedMixin,
+ OpenAIChatCompletionToLlamaStackMixin,
+ OpenAICompletionToLlamaStackMixin,
):
def __init__(self, config: SambaNovaImplConfig) -> None:
ModelRegistryHelper.__init__(self, model_entries=MODEL_ENTRIES)
diff --git a/llama_stack/providers/remote/inference/tgi/tgi.py b/llama_stack/providers/remote/inference/tgi/tgi.py
index 8f5b5e3cc..4ee386a15 100644
--- a/llama_stack/providers/remote/inference/tgi/tgi.py
+++ b/llama_stack/providers/remote/inference/tgi/tgi.py
@@ -40,10 +40,10 @@ from llama_stack.providers.utils.inference.model_registry import (
build_hf_repo_model_entry,
)
from llama_stack.providers.utils.inference.openai_compat import (
- OpenAIChatCompletionUnsupportedMixin,
+ OpenAIChatCompletionToLlamaStackMixin,
OpenAICompatCompletionChoice,
OpenAICompatCompletionResponse,
- OpenAICompletionUnsupportedMixin,
+ OpenAICompletionToLlamaStackMixin,
get_sampling_options,
process_chat_completion_response,
process_chat_completion_stream_response,
@@ -73,8 +73,8 @@ def build_hf_repo_model_entries():
class _HfAdapter(
Inference,
- OpenAIChatCompletionUnsupportedMixin,
- OpenAICompletionUnsupportedMixin,
+ OpenAIChatCompletionToLlamaStackMixin,
+ OpenAICompletionToLlamaStackMixin,
ModelsProtocolPrivate,
):
client: AsyncInferenceClient
diff --git a/llama_stack/providers/remote/inference/together/together.py b/llama_stack/providers/remote/inference/together/together.py
index 1615b8cd1..001e6aac4 100644
--- a/llama_stack/providers/remote/inference/together/together.py
+++ b/llama_stack/providers/remote/inference/together/together.py
@@ -4,7 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
-from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union
from openai import AsyncOpenAI
from together import AsyncTogether
@@ -31,7 +31,13 @@ from llama_stack.apis.inference import (
ToolDefinition,
ToolPromptFormat,
)
-from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
+from llama_stack.apis.inference.inference import (
+ OpenAIChatCompletion,
+ OpenAIChatCompletionChunk,
+ OpenAICompletion,
+ OpenAIMessageParam,
+ OpenAIResponseFormatParam,
+)
from llama_stack.distribution.request_headers import NeedsRequestProviderData
from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
@@ -315,7 +321,7 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
n: Optional[int] = None,
parallel_tool_calls: Optional[bool] = None,
presence_penalty: Optional[float] = None,
- response_format: Optional[Dict[str, str]] = None,
+ response_format: Optional[OpenAIResponseFormatParam] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
@@ -326,7 +332,7 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
top_logprobs: Optional[int] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
- ) -> OpenAIChatCompletion:
+ ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
model_obj = await self.model_store.get_model(model)
params = await prepare_openai_completion_params(
model=model_obj.provider_resource_id,
@@ -353,4 +359,26 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
top_p=top_p,
user=user,
)
+ if params.get("stream", True):
+ return self._stream_openai_chat_completion(params)
return await self._get_openai_client().chat.completions.create(**params) # type: ignore
+
+ async def _stream_openai_chat_completion(self, params: dict) -> AsyncGenerator:
+ # together.ai sometimes adds usage data to the stream even when include_usage is False.
+ # That produces an unexpected final chunk with an empty choices array, which some
+ # clients do not handle gracefully.
+ include_usage = False
+ if params.get("stream_options", None):
+ include_usage = params["stream_options"].get("include_usage", False)
+ stream = await self._get_openai_client().chat.completions.create(**params)
+
+ seen_finish_reason = False
+ async for chunk in stream:
+ # Discard the trailing usage-only chunk (empty choices) since the user did not request usage data
+ if not include_usage and seen_finish_reason and len(chunk.choices) == 0:
+ break
+ yield chunk
+ for choice in chunk.choices:
+ if choice.finish_reason:
+ seen_finish_reason = True
+ break
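
From a client's point of view, the filtering above means a streamed response through this adapter never ends with an empty-choices usage chunk unless stream_options={"include_usage": True} was requested. A rough sketch, assuming a local Llama Stack is serving this adapter (base URL and model id taken from the verification configs below):

    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="empty")
    stream = client.chat.completions.create(
        model="together/meta-llama/Llama-3.3-70B-Instruct-Turbo",
        messages=[{"role": "user", "content": "Say hello."}],
        stream=True,
    )
    for chunk in stream:
        # Without include_usage, the adapter drops the trailing usage-only chunk,
        # so every chunk seen here carries at least one choice.
        assert chunk.choices
        print(chunk.choices[0].delta.content or "", end="")
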
diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py
index 0044d2e75..2b9eae1e9 100644
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -5,7 +5,7 @@
# the root directory of this source tree.
import json
import logging
-from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union
import httpx
from openai import AsyncOpenAI
@@ -45,7 +45,13 @@ from llama_stack.apis.inference import (
ToolDefinition,
ToolPromptFormat,
)
-from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
+from llama_stack.apis.inference.inference import (
+ OpenAIChatCompletion,
+ OpenAIChatCompletionChunk,
+ OpenAICompletion,
+ OpenAIMessageParam,
+ OpenAIResponseFormatParam,
+)
from llama_stack.apis.models import Model, ModelType
from llama_stack.models.llama.datatypes import BuiltinTool, StopReason, ToolCall
from llama_stack.models.llama.sku_list import all_registered_models
@@ -487,7 +492,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
n: Optional[int] = None,
parallel_tool_calls: Optional[bool] = None,
presence_penalty: Optional[float] = None,
- response_format: Optional[Dict[str, str]] = None,
+ response_format: Optional[OpenAIResponseFormatParam] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
@@ -498,7 +503,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
top_logprobs: Optional[int] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
- ) -> OpenAIChatCompletion:
+ ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
model_obj = await self._get_model(model)
params = await prepare_openai_completion_params(
model=model_obj.provider_resource_id,
diff --git a/llama_stack/providers/utils/inference/litellm_openai_mixin.py b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
index cd0f4ec67..efe7031f5 100644
--- a/llama_stack/providers/utils/inference/litellm_openai_mixin.py
+++ b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
@@ -30,7 +30,13 @@ from llama_stack.apis.inference import (
ToolDefinition,
ToolPromptFormat,
)
-from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
+from llama_stack.apis.inference.inference import (
+ OpenAIChatCompletion,
+ OpenAIChatCompletionChunk,
+ OpenAICompletion,
+ OpenAIMessageParam,
+ OpenAIResponseFormatParam,
+)
from llama_stack.apis.models.models import Model
from llama_stack.distribution.request_headers import NeedsRequestProviderData
from llama_stack.log import get_logger
@@ -270,7 +276,7 @@ class LiteLLMOpenAIMixin(
guided_choice: Optional[List[str]] = None,
prompt_logprobs: Optional[int] = None,
) -> OpenAICompletion:
- model_obj = await self._get_model(model)
+ model_obj = await self.model_store.get_model(model)
params = await prepare_openai_completion_params(
model=model_obj.provider_resource_id,
prompt=prompt,
@@ -292,7 +298,7 @@ class LiteLLMOpenAIMixin(
guided_choice=guided_choice,
prompt_logprobs=prompt_logprobs,
)
- return litellm.text_completion(**params)
+ return await litellm.atext_completion(**params)
async def openai_chat_completion(
self,
@@ -308,7 +314,7 @@ class LiteLLMOpenAIMixin(
n: Optional[int] = None,
parallel_tool_calls: Optional[bool] = None,
presence_penalty: Optional[float] = None,
- response_format: Optional[Dict[str, str]] = None,
+ response_format: Optional[OpenAIResponseFormatParam] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
@@ -319,8 +325,8 @@ class LiteLLMOpenAIMixin(
top_logprobs: Optional[int] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
- ) -> OpenAIChatCompletion:
- model_obj = await self._get_model(model)
+ ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+ model_obj = await self.model_store.get_model(model)
params = await prepare_openai_completion_params(
model=model_obj.provider_resource_id,
messages=messages,
@@ -346,7 +352,7 @@ class LiteLLMOpenAIMixin(
top_p=top_p,
user=user,
)
- return litellm.completion(**params)
+ return await litellm.acompletion(**params)
async def batch_completion(
self,
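
The switch from litellm.completion / litellm.text_completion to their awaited async counterparts keeps the event loop free while the provider call is in flight. A minimal sketch of the async call pattern using litellm directly (the model id is illustrative, not part of this change):

    import asyncio

    import litellm

    async def demo() -> None:
        # Any model LiteLLM can route will do; credentials must be in the environment.
        resp = await litellm.acompletion(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": "Hello"}],
        )
        print(resp.choices[0].message.content)

    asyncio.run(demo())
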
diff --git a/llama_stack/providers/utils/inference/openai_compat.py b/llama_stack/providers/utils/inference/openai_compat.py
index f33cb4443..d98261abb 100644
--- a/llama_stack/providers/utils/inference/openai_compat.py
+++ b/llama_stack/providers/utils/inference/openai_compat.py
@@ -8,7 +8,7 @@ import logging
import time
import uuid
import warnings
-from typing import Any, AsyncGenerator, Dict, Iterable, List, Optional, Union
+from typing import Any, AsyncGenerator, AsyncIterator, Awaitable, Dict, Iterable, List, Optional, Union
from openai import AsyncStream
from openai.types.chat import (
@@ -50,6 +50,18 @@ from openai.types.chat.chat_completion import (
from openai.types.chat.chat_completion import (
ChoiceLogprobs as OpenAIChoiceLogprobs, # same as chat_completion_chunk ChoiceLogprobs
)
+from openai.types.chat.chat_completion_chunk import (
+ Choice as OpenAIChatCompletionChunkChoice,
+)
+from openai.types.chat.chat_completion_chunk import (
+ ChoiceDelta as OpenAIChoiceDelta,
+)
+from openai.types.chat.chat_completion_chunk import (
+ ChoiceDeltaToolCall as OpenAIChoiceDeltaToolCall,
+)
+from openai.types.chat.chat_completion_chunk import (
+ ChoiceDeltaToolCallFunction as OpenAIChoiceDeltaToolCallFunction,
+)
from openai.types.chat.chat_completion_content_part_image_param import (
ImageURL as OpenAIImageURL,
)
@@ -59,6 +71,7 @@ from openai.types.chat.chat_completion_message_tool_call_param import (
from pydantic import BaseModel
from llama_stack.apis.common.content_types import (
+ URL,
ImageContentItem,
InterleavedContent,
TextContentItem,
@@ -85,12 +98,24 @@ from llama_stack.apis.inference import (
TopPSamplingStrategy,
UserMessage,
)
-from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAICompletionChoice
+from llama_stack.apis.inference.inference import (
+ JsonSchemaResponseFormat,
+ OpenAIChatCompletion,
+ OpenAICompletion,
+ OpenAICompletionChoice,
+ OpenAIMessageParam,
+ OpenAIResponseFormatParam,
+ ToolConfig,
+)
+from llama_stack.apis.inference.inference import (
+ OpenAIChoice as OpenAIChatCompletionChoice,
+)
from llama_stack.models.llama.datatypes import (
BuiltinTool,
StopReason,
ToolCall,
ToolDefinition,
+ ToolParamDefinition,
)
from llama_stack.providers.utils.inference.prompt_adapter import (
convert_image_content_to_url,
@@ -751,6 +776,17 @@ def convert_tooldef_to_openai_tool(tool: ToolDefinition) -> dict:
return out
+def _convert_stop_reason_to_openai_finish_reason(stop_reason: StopReason) -> str:
+ """
+ Convert a StopReason to an OpenAI chat completion finish_reason.
+ """
+ return {
+ StopReason.end_of_turn: "stop",
+ StopReason.end_of_message: "tool_calls",
+ StopReason.out_of_tokens: "length",
+ }.get(stop_reason, "stop")
+
+
def _convert_openai_finish_reason(finish_reason: str) -> StopReason:
"""
Convert an OpenAI chat completion finish_reason to a StopReason.
@@ -776,6 +812,56 @@ def _convert_openai_finish_reason(finish_reason: str) -> StopReason:
}.get(finish_reason, StopReason.end_of_turn)
+def _convert_openai_request_tool_config(tool_choice: Optional[Union[str, Dict[str, Any]]] = None) -> ToolConfig:
+ tool_config = ToolConfig()
+ if tool_choice:
+ tool_config.tool_choice = tool_choice
+ return tool_config
+
+
+def _convert_openai_request_tools(tools: Optional[List[Dict[str, Any]]] = None) -> List[ToolDefinition]:
+ lls_tools = []
+ if not tools:
+ return lls_tools
+
+ for tool in tools:
+ tool_fn = tool.get("function", {})
+ tool_name = tool_fn.get("name", None)
+ tool_desc = tool_fn.get("description", None)
+
+ tool_params = tool_fn.get("parameters", None)
+ lls_tool_params = {}
+ if tool_params is not None:
+ tool_param_properties = tool_params.get("properties", {})
+ for tool_param_key, tool_param_value in tool_param_properties.items():
+ tool_param_def = ToolParamDefinition(
+ param_type=tool_param_value.get("type", None),
+ description=tool_param_value.get("description", None),
+ )
+ lls_tool_params[tool_param_key] = tool_param_def
+
+ lls_tool = ToolDefinition(
+ tool_name=tool_name,
+ description=tool_desc,
+ parameters=lls_tool_params,
+ )
+ lls_tools.append(lls_tool)
+ return lls_tools
+
+
+def _convert_openai_request_response_format(response_format: Optional[OpenAIResponseFormatParam] = None) -> Optional[JsonSchemaResponseFormat]:
+ if not response_format:
+ return None
+ # response_format can be a dict or a pydantic model
+ response_format = dict(response_format)
+ if response_format.get("type", "") == "json_schema":
+ return JsonSchemaResponseFormat(
+ type="json_schema",
+ json_schema=response_format.get("json_schema", {}).get("schema", ""),
+ )
+ return None
+
+
def _convert_openai_tool_calls(
tool_calls: List[OpenAIChatCompletionMessageToolCall],
) -> List[ToolCall]:
@@ -871,6 +957,40 @@ def _convert_openai_sampling_params(
return sampling_params
+def _convert_openai_request_messages(messages: List[OpenAIMessageParam]) -> List[Dict[str, Any]]:
+ # Llama Stack messages and OpenAI messages are similar, but not identical.
+ lls_messages = []
+ for message in messages:
+ lls_message = dict(message)
+
+ # Llama Stack expects `call_id` but OpenAI uses `tool_call_id`
+ tool_call_id = lls_message.pop("tool_call_id", None)
+ if tool_call_id:
+ lls_message["call_id"] = tool_call_id
+
+ content = lls_message.get("content", None)
+ if isinstance(content, list):
+ lls_content = []
+ for item in content:
+ # items can be either pydantic models or dicts here...
+ item = dict(item)
+ if item.get("type", "") == "image_url":
+ lls_item = ImageContentItem(
+ type="image",
+ image=URL(uri=item.get("image_url", {}).get("url", "")),
+ )
+ elif item.get("type", "") == "text":
+ lls_item = TextContentItem(
+ type="text",
+ text=item.get("text", ""),
+ )
+ lls_content.append(lls_item)
+ lls_message["content"] = lls_content
+ lls_messages.append(lls_message)
+
+ return lls_messages
+
+
def convert_openai_chat_completion_choice(
choice: OpenAIChoice,
) -> ChatCompletionResponse:
@@ -1080,11 +1200,24 @@ async def convert_openai_chat_completion_stream(
async def prepare_openai_completion_params(**params):
- completion_params = {k: v for k, v in params.items() if v is not None}
+ async def _prepare_value(value: Any) -> Any:
+ new_value = value
+ if isinstance(value, list):
+ new_value = [await _prepare_value(v) for v in value]
+ elif isinstance(value, dict):
+ new_value = {k: await _prepare_value(v) for k, v in value.items()}
+ elif isinstance(value, BaseModel):
+ new_value = value.model_dump(exclude_none=True)
+ return new_value
+
+ completion_params = {}
+ for k, v in params.items():
+ if v is not None:
+ completion_params[k] = await _prepare_value(v)
return completion_params
-class OpenAICompletionUnsupportedMixin:
+class OpenAICompletionToLlamaStackMixin:
async def openai_completion(
self,
model: str,
@@ -1122,6 +1255,7 @@ class OpenAICompletionUnsupportedMixin:
choices = []
# "n" is the number of completions to generate per prompt
+ n = n or 1
for _i in range(0, n):
# and we may have multiple prompts, if batching was used
@@ -1134,7 +1268,7 @@ class OpenAICompletionUnsupportedMixin:
index = len(choices)
text = result.content
- finish_reason = _convert_openai_finish_reason(result.stop_reason)
+ finish_reason = _convert_stop_reason_to_openai_finish_reason(result.stop_reason)
choice = OpenAICompletionChoice(
index=index,
@@ -1152,7 +1286,7 @@ class OpenAICompletionUnsupportedMixin:
)
-class OpenAIChatCompletionUnsupportedMixin:
+class OpenAIChatCompletionToLlamaStackMixin:
async def openai_chat_completion(
self,
model: str,
@@ -1167,7 +1301,7 @@ class OpenAIChatCompletionUnsupportedMixin:
n: Optional[int] = None,
parallel_tool_calls: Optional[bool] = None,
presence_penalty: Optional[float] = None,
- response_format: Optional[Dict[str, str]] = None,
+ response_format: Optional[OpenAIResponseFormatParam] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
@@ -1178,5 +1312,103 @@ class OpenAIChatCompletionUnsupportedMixin:
top_logprobs: Optional[int] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
+ ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+ messages = _convert_openai_request_messages(messages)
+ response_format = _convert_openai_request_response_format(response_format)
+ sampling_params = _convert_openai_sampling_params(
+ max_tokens=max_tokens,
+ temperature=temperature,
+ top_p=top_p,
+ )
+ tool_config = _convert_openai_request_tool_config(tool_choice)
+ tools = _convert_openai_request_tools(tools)
+
+ outstanding_responses = []
+ # "n" is the number of completions to generate per prompt
+ n = n or 1
+ for _i in range(0, n):
+ response = self.chat_completion(
+ model_id=model,
+ messages=messages,
+ sampling_params=sampling_params,
+ response_format=response_format,
+ stream=stream,
+ tool_config=tool_config,
+ tools=tools,
+ )
+ outstanding_responses.append(response)
+
+ if stream:
+ return OpenAIChatCompletionToLlamaStackMixin._process_stream_response(self, model, outstanding_responses)
+
+ return await OpenAIChatCompletionToLlamaStackMixin._process_non_stream_response(
+ self, model, outstanding_responses
+ )
+
+ async def _process_stream_response(
+ self, model: str, outstanding_responses: List[Awaitable[AsyncIterator[ChatCompletionResponseStreamChunk]]]
+ ):
+ id = f"chatcmpl-{uuid.uuid4()}"
+ for outstanding_response in outstanding_responses:
+ response = await outstanding_response
+ i = 0
+ async for chunk in response:
+ event = chunk.event
+ finish_reason = _convert_stop_reason_to_openai_finish_reason(event.stop_reason)
+
+ if isinstance(event.delta, TextDelta):
+ text_delta = event.delta.text
+ delta = OpenAIChoiceDelta(content=text_delta)
+ yield OpenAIChatCompletionChunk(
+ id=id,
+ choices=[OpenAIChatCompletionChunkChoice(index=i, finish_reason=finish_reason, delta=delta)],
+ created=int(time.time()),
+ model=model,
+ object="chat.completion.chunk",
+ )
+ elif isinstance(event.delta, ToolCallDelta):
+ if event.delta.parse_status == ToolCallParseStatus.succeeded:
+ tool_call = event.delta.tool_call
+ openai_tool_call = OpenAIChoiceDeltaToolCall(
+ index=0,
+ id=tool_call.call_id,
+ function=OpenAIChoiceDeltaToolCallFunction(
+ name=tool_call.tool_name, arguments=tool_call.arguments_json
+ ),
+ )
+ delta = OpenAIChoiceDelta(tool_calls=[openai_tool_call])
+ yield OpenAIChatCompletionChunk(
+ id=id,
+ choices=[
+ OpenAIChatCompletionChunkChoice(index=i, finish_reason=finish_reason, delta=delta)
+ ],
+ created=int(time.time()),
+ model=model,
+ object="chat.completion.chunk",
+ )
+ i = i + 1
+
+ async def _process_non_stream_response(
+ self, model: str, outstanding_responses: List[Awaitable[ChatCompletionResponse]]
) -> OpenAIChatCompletion:
- raise ValueError(f"{self.__class__.__name__} doesn't support openai chat completion")
+ choices = []
+ for outstanding_response in outstanding_responses:
+ response = await outstanding_response
+ completion_message = response.completion_message
+ message = await convert_message_to_openai_dict_new(completion_message)
+ finish_reason = _convert_stop_reason_to_openai_finish_reason(completion_message.stop_reason)
+
+ choice = OpenAIChatCompletionChoice(
+ index=len(choices),
+ message=message,
+ finish_reason=finish_reason,
+ )
+ choices.append(choice)
+
+ return OpenAIChatCompletion(
+ id=f"chatcmpl-{uuid.uuid4()}",
+ choices=choices,
+ created=int(time.time()),
+ model=model,
+ object="chat.completion",
+ )
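
To make the request-side conversion in the new mixin concrete, here is a rough sketch of what _convert_openai_request_tools produces for a typical OpenAI tool spec; the tool definition itself is made up for illustration:

    # Private helper defined in this patch; imported here purely for illustration.
    from llama_stack.providers.utils.inference.openai_compat import _convert_openai_request_tools

    openai_tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Look up the current weather for a city.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "city": {"type": "string", "description": "City name"},
                    },
                },
            },
        }
    ]
    lls_tools = _convert_openai_request_tools(openai_tools)
    # -> [ToolDefinition(tool_name="get_weather",
    #                    description="Look up the current weather for a city.",
    #                    parameters={"city": ToolParamDefinition(param_type="string",
    #                                                            description="City name")})]
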
diff --git a/llama_stack/templates/dev/run.yaml b/llama_stack/templates/dev/run.yaml
index ea3b7252a..0dd056405 100644
--- a/llama_stack/templates/dev/run.yaml
+++ b/llama_stack/templates/dev/run.yaml
@@ -386,6 +386,16 @@ models:
provider_id: groq
provider_model_id: groq/llama-4-scout-17b-16e-instruct
model_type: llm
+- metadata: {}
+ model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct
+ provider_id: groq
+ provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct
+ model_type: llm
+- metadata: {}
+ model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct
+ provider_id: groq
+ provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct
+ model_type: llm
- metadata: {}
model_id: groq/llama-4-maverick-17b-128e-instruct
provider_id: groq
@@ -396,6 +406,16 @@ models:
provider_id: groq
provider_model_id: groq/llama-4-maverick-17b-128e-instruct
model_type: llm
+- metadata: {}
+ model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct
+ provider_id: groq
+ provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct
+ model_type: llm
+- metadata: {}
+ model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct
+ provider_id: groq
+ provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct
+ model_type: llm
- metadata:
embedding_dimension: 384
model_id: all-MiniLM-L6-v2
diff --git a/llama_stack/templates/groq/run.yaml b/llama_stack/templates/groq/run.yaml
index f557e64fd..444452dcb 100644
--- a/llama_stack/templates/groq/run.yaml
+++ b/llama_stack/templates/groq/run.yaml
@@ -158,6 +158,16 @@ models:
provider_id: groq
provider_model_id: groq/llama-4-scout-17b-16e-instruct
model_type: llm
+- metadata: {}
+ model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct
+ provider_id: groq
+ provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct
+ model_type: llm
+- metadata: {}
+ model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct
+ provider_id: groq
+ provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct
+ model_type: llm
- metadata: {}
model_id: groq/llama-4-maverick-17b-128e-instruct
provider_id: groq
@@ -168,6 +178,16 @@ models:
provider_id: groq
provider_model_id: groq/llama-4-maverick-17b-128e-instruct
model_type: llm
+- metadata: {}
+ model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct
+ provider_id: groq
+ provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct
+ model_type: llm
+- metadata: {}
+ model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct
+ provider_id: groq
+ provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct
+ model_type: llm
- metadata:
embedding_dimension: 384
model_id: all-MiniLM-L6-v2
diff --git a/llama_stack/templates/verification/run.yaml b/llama_stack/templates/verification/run.yaml
index b6c2ca98d..454ecba5b 100644
--- a/llama_stack/templates/verification/run.yaml
+++ b/llama_stack/templates/verification/run.yaml
@@ -474,6 +474,16 @@ models:
provider_id: groq-openai-compat
provider_model_id: groq/llama-4-scout-17b-16e-instruct
model_type: llm
+- metadata: {}
+ model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct
+ provider_id: groq-openai-compat
+ provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct
+ model_type: llm
+- metadata: {}
+ model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct
+ provider_id: groq-openai-compat
+ provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct
+ model_type: llm
- metadata: {}
model_id: groq/llama-4-maverick-17b-128e-instruct
provider_id: groq-openai-compat
@@ -484,6 +494,16 @@ models:
provider_id: groq-openai-compat
provider_model_id: groq/llama-4-maverick-17b-128e-instruct
model_type: llm
+- metadata: {}
+ model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct
+ provider_id: groq-openai-compat
+ provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct
+ model_type: llm
+- metadata: {}
+ model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct
+ provider_id: groq-openai-compat
+ provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct
+ model_type: llm
- metadata: {}
model_id: Meta-Llama-3.1-8B-Instruct
provider_id: sambanova-openai-compat
diff --git a/tests/integration/inference/test_openai_completion.py b/tests/integration/inference/test_openai_completion.py
index 0905d5817..75b53100c 100644
--- a/tests/integration/inference/test_openai_completion.py
+++ b/tests/integration/inference/test_openai_completion.py
@@ -115,7 +115,7 @@ def test_openai_completion_streaming(openai_client, client_with_models, text_mod
stream=True,
max_tokens=50,
)
- streamed_content = [chunk.choices[0].text for chunk in response]
+ streamed_content = [chunk.choices[0].text or "" for chunk in response]
content_str = "".join(streamed_content).lower().strip()
assert len(content_str) > 10
diff --git a/tests/verifications/conf/fireworks-llama-stack.yaml b/tests/verifications/conf/fireworks-llama-stack.yaml
new file mode 100644
index 000000000..d91443dd9
--- /dev/null
+++ b/tests/verifications/conf/fireworks-llama-stack.yaml
@@ -0,0 +1,14 @@
+base_url: http://localhost:8321/v1/openai/v1
+api_key_var: FIREWORKS_API_KEY
+models:
+- fireworks/llama-v3p3-70b-instruct
+- fireworks/llama4-scout-instruct-basic
+- fireworks/llama4-maverick-instruct-basic
+model_display_names:
+ fireworks/llama-v3p3-70b-instruct: Llama-3.3-70B-Instruct
+ fireworks/llama4-scout-instruct-basic: Llama-4-Scout-Instruct
+ fireworks/llama4-maverick-instruct-basic: Llama-4-Maverick-Instruct
+test_exclusions:
+ fireworks/llama-v3p3-70b-instruct:
+ - test_chat_non_streaming_image
+ - test_chat_streaming_image
diff --git a/tests/verifications/conf/groq-llama-stack.yaml b/tests/verifications/conf/groq-llama-stack.yaml
new file mode 100644
index 000000000..fd5e9abec
--- /dev/null
+++ b/tests/verifications/conf/groq-llama-stack.yaml
@@ -0,0 +1,14 @@
+base_url: http://localhost:8321/v1/openai/v1
+api_key_var: GROQ_API_KEY
+models:
+- groq/llama-3.3-70b-versatile
+- groq/llama-4-scout-17b-16e-instruct
+- groq/llama-4-maverick-17b-128e-instruct
+model_display_names:
+ groq/llama-3.3-70b-versatile: Llama-3.3-70B-Instruct
+ groq/llama-4-scout-17b-16e-instruct: Llama-4-Scout-Instruct
+ groq/llama-4-maverick-17b-128e-instruct: Llama-4-Maverick-Instruct
+test_exclusions:
+ groq/llama-3.3-70b-versatile:
+ - test_chat_non_streaming_image
+ - test_chat_streaming_image
diff --git a/tests/verifications/conf/groq.yaml b/tests/verifications/conf/groq.yaml
index 7871036dc..76b1244ae 100644
--- a/tests/verifications/conf/groq.yaml
+++ b/tests/verifications/conf/groq.yaml
@@ -2,12 +2,12 @@ base_url: https://api.groq.com/openai/v1
api_key_var: GROQ_API_KEY
models:
- llama-3.3-70b-versatile
-- llama-4-scout-17b-16e-instruct
-- llama-4-maverick-17b-128e-instruct
+- meta-llama/llama-4-scout-17b-16e-instruct
+- meta-llama/llama-4-maverick-17b-128e-instruct
model_display_names:
llama-3.3-70b-versatile: Llama-3.3-70B-Instruct
- llama-4-scout-17b-16e-instruct: Llama-4-Scout-Instruct
- llama-4-maverick-17b-128e-instruct: Llama-4-Maverick-Instruct
+ meta-llama/llama-4-scout-17b-16e-instruct: Llama-4-Scout-Instruct
+ meta-llama/llama-4-maverick-17b-128e-instruct: Llama-4-Maverick-Instruct
test_exclusions:
llama-3.3-70b-versatile:
- test_chat_non_streaming_image
diff --git a/tests/verifications/conf/openai-llama-stack.yaml b/tests/verifications/conf/openai-llama-stack.yaml
new file mode 100644
index 000000000..de35439ae
--- /dev/null
+++ b/tests/verifications/conf/openai-llama-stack.yaml
@@ -0,0 +1,9 @@
+base_url: http://localhost:8321/v1/openai/v1
+api_key_var: OPENAI_API_KEY
+models:
+- openai/gpt-4o
+- openai/gpt-4o-mini
+model_display_names:
+ openai/gpt-4o: gpt-4o
+ openai/gpt-4o-mini: gpt-4o-mini
+test_exclusions: {}
diff --git a/tests/verifications/conf/together-llama-stack.yaml b/tests/verifications/conf/together-llama-stack.yaml
new file mode 100644
index 000000000..e49d82604
--- /dev/null
+++ b/tests/verifications/conf/together-llama-stack.yaml
@@ -0,0 +1,14 @@
+base_url: http://localhost:8321/v1/openai/v1
+api_key_var: TOGETHER_API_KEY
+models:
+- together/meta-llama/Llama-3.3-70B-Instruct-Turbo
+- together/meta-llama/Llama-4-Scout-17B-16E-Instruct
+- together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
+model_display_names:
+ together/meta-llama/Llama-3.3-70B-Instruct-Turbo: Llama-3.3-70B-Instruct
+ together/meta-llama/Llama-4-Scout-17B-16E-Instruct: Llama-4-Scout-Instruct
+ together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8: Llama-4-Maverick-Instruct
+test_exclusions:
+ together/meta-llama/Llama-3.3-70B-Instruct-Turbo:
+ - test_chat_non_streaming_image
+ - test_chat_streaming_image
diff --git a/tests/verifications/generate_report.py b/tests/verifications/generate_report.py
index 6a7c39ee2..b39c3fd19 100755
--- a/tests/verifications/generate_report.py
+++ b/tests/verifications/generate_report.py
@@ -67,7 +67,17 @@ RESULTS_DIR.mkdir(exist_ok=True)
# Maximum number of test result files to keep per provider
MAX_RESULTS_PER_PROVIDER = 1
-PROVIDER_ORDER = ["together", "fireworks", "groq", "cerebras", "openai"]
+PROVIDER_ORDER = [
+ "together",
+ "fireworks",
+ "groq",
+ "cerebras",
+ "openai",
+ "together-llama-stack",
+ "fireworks-llama-stack",
+ "groq-llama-stack",
+ "openai-llama-stack",
+]
VERIFICATION_CONFIG = _load_all_verification_configs()
diff --git a/tests/verifications/openai-api-verification-run.yaml b/tests/verifications/openai-api-verification-run.yaml
new file mode 100644
index 000000000..71885d058
--- /dev/null
+++ b/tests/verifications/openai-api-verification-run.yaml
@@ -0,0 +1,146 @@
+version: '2'
+image_name: openai-api-verification
+apis:
+- inference
+- telemetry
+- tool_runtime
+- vector_io
+providers:
+ inference:
+ - provider_id: together
+ provider_type: remote::together
+ config:
+ url: https://api.together.xyz/v1
+ api_key: ${env.TOGETHER_API_KEY:}
+ - provider_id: fireworks
+ provider_type: remote::fireworks
+ config:
+ url: https://api.fireworks.ai/inference/v1
+ api_key: ${env.FIREWORKS_API_KEY}
+ - provider_id: groq
+ provider_type: remote::groq
+ config:
+ url: https://api.groq.com
+ api_key: ${env.GROQ_API_KEY}
+ - provider_id: openai
+ provider_type: remote::openai
+ config:
+ url: https://api.openai.com/v1
+ api_key: ${env.OPENAI_API_KEY:}
+ - provider_id: sentence-transformers
+ provider_type: inline::sentence-transformers
+ config: {}
+ vector_io:
+ - provider_id: faiss
+ provider_type: inline::faiss
+ config:
+ kvstore:
+ type: sqlite
+ namespace: null
+ db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/openai}/faiss_store.db
+ telemetry:
+ - provider_id: meta-reference
+ provider_type: inline::meta-reference
+ config:
+ service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+ sinks: ${env.TELEMETRY_SINKS:console,sqlite}
+ sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/openai/trace_store.db}
+ tool_runtime:
+ - provider_id: brave-search
+ provider_type: remote::brave-search
+ config:
+ api_key: ${env.BRAVE_SEARCH_API_KEY:}
+ max_results: 3
+ - provider_id: tavily-search
+ provider_type: remote::tavily-search
+ config:
+ api_key: ${env.TAVILY_SEARCH_API_KEY:}
+ max_results: 3
+ - provider_id: code-interpreter
+ provider_type: inline::code-interpreter
+ config: {}
+ - provider_id: rag-runtime
+ provider_type: inline::rag-runtime
+ config: {}
+ - provider_id: model-context-protocol
+ provider_type: remote::model-context-protocol
+ config: {}
+ - provider_id: wolfram-alpha
+ provider_type: remote::wolfram-alpha
+ config:
+ api_key: ${env.WOLFRAM_ALPHA_API_KEY:}
+metadata_store:
+ type: sqlite
+ db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/openai}/registry.db
+models:
+- metadata: {}
+ model_id: together/meta-llama/Llama-3.3-70B-Instruct-Turbo
+ provider_id: together
+ provider_model_id: meta-llama/Llama-3.3-70B-Instruct-Turbo
+ model_type: llm
+- metadata: {}
+ model_id: together/meta-llama/Llama-4-Scout-17B-16E-Instruct
+ provider_id: together
+ provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct
+ model_type: llm
+- metadata: {}
+ model_id: together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
+ provider_id: together
+ provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
+ model_type: llm
+- metadata: {}
+ model_id: fireworks/llama-v3p3-70b-instruct
+ provider_id: fireworks
+ provider_model_id: accounts/fireworks/models/llama-v3p3-70b-instruct
+ model_type: llm
+- metadata: {}
+ model_id: fireworks/llama4-scout-instruct-basic
+ provider_id: fireworks
+ provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic
+ model_type: llm
+- metadata: {}
+ model_id: fireworks/llama4-maverick-instruct-basic
+ provider_id: fireworks
+ provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic
+ model_type: llm
+- metadata: {}
+ model_id: groq/llama-3.3-70b-versatile
+ provider_id: groq
+ provider_model_id: groq/llama-3.3-70b-versatile
+ model_type: llm
+- metadata: {}
+ model_id: groq/llama-4-scout-17b-16e-instruct
+ provider_id: groq
+ provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct
+ model_type: llm
+- metadata: {}
+ model_id: groq/llama-4-maverick-17b-128e-instruct
+ provider_id: groq
+ provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct
+ model_type: llm
+- metadata: {}
+ model_id: openai/gpt-4o
+ provider_id: openai
+ provider_model_id: openai/gpt-4o
+ model_type: llm
+- metadata: {}
+ model_id: openai/gpt-4o-mini
+ provider_id: openai
+ provider_model_id: openai/gpt-4o-mini
+ model_type: llm
+shields: []
+vector_dbs: []
+datasets: []
+scoring_fns: []
+benchmarks: []
+tool_groups:
+- toolgroup_id: builtin::websearch
+ provider_id: tavily-search
+- toolgroup_id: builtin::rag
+ provider_id: rag-runtime
+- toolgroup_id: builtin::code_interpreter
+ provider_id: code-interpreter
+- toolgroup_id: builtin::wolfram_alpha
+ provider_id: wolfram-alpha
+server:
+ port: 8321
diff --git a/tests/verifications/openai_api/fixtures/fixtures.py b/tests/verifications/openai_api/fixtures/fixtures.py
index 4f8c2e017..940b99b2a 100644
--- a/tests/verifications/openai_api/fixtures/fixtures.py
+++ b/tests/verifications/openai_api/fixtures/fixtures.py
@@ -99,6 +99,9 @@ def model_mapping(provider, providers_model_mapping):
@pytest.fixture
def openai_client(base_url, api_key):
+ # Simplify running against a local Llama Stack
+ if "localhost" in base_url and not api_key:
+ api_key = "empty"
return OpenAI(
base_url=base_url,
api_key=api_key,