diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index c85eb549f..54d888441 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -3096,11 +3096,18 @@ "post": { "responses": { "200": { - "description": "OK", + "description": "Response from an OpenAI-compatible chat completion request. **OR** Chunk from a streaming response to an OpenAI-compatible chat completion request.", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/OpenAIChatCompletion" + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIChatCompletion" + }, + { + "$ref": "#/components/schemas/OpenAIChatCompletionChunk" + } + ] } } } @@ -8857,7 +8864,17 @@ "description": "Must be \"assistant\" to identify this as the model's response" }, "content": { - "$ref": "#/components/schemas/InterleavedContent", + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam" + } + } + ], "description": "The content of the model's response" }, "name": { @@ -8867,9 +8884,9 @@ "tool_calls": { "type": "array", "items": { - "$ref": "#/components/schemas/ToolCall" + "$ref": "#/components/schemas/OpenAIChatCompletionToolCall" }, - "description": "List of tool calls. Each tool call is a ToolCall object." + "description": "List of tool calls. Each tool call is an OpenAIChatCompletionToolCall object." } }, "additionalProperties": false, @@ -8880,6 +8897,98 @@ "title": "OpenAIAssistantMessageParam", "description": "A message containing the model's (assistant) response in an OpenAI-compatible chat completion request." }, + "OpenAIChatCompletionContentPartImageParam": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "image_url", + "default": "image_url" + }, + "image_url": { + "$ref": "#/components/schemas/OpenAIImageURL" + } + }, + "additionalProperties": false, + "required": [ + "type", + "image_url" + ], + "title": "OpenAIChatCompletionContentPartImageParam" + }, + "OpenAIChatCompletionContentPartParam": { + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIChatCompletionContentPartTextParam" + }, + { + "$ref": "#/components/schemas/OpenAIChatCompletionContentPartImageParam" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "text": "#/components/schemas/OpenAIChatCompletionContentPartTextParam", + "image_url": "#/components/schemas/OpenAIChatCompletionContentPartImageParam" + } + } + }, + "OpenAIChatCompletionContentPartTextParam": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "text", + "default": "text" + }, + "text": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "type", + "text" + ], + "title": "OpenAIChatCompletionContentPartTextParam" + }, + "OpenAIChatCompletionToolCall": { + "type": "object", + "properties": { + "index": { + "type": "integer" + }, + "id": { + "type": "string" + }, + "type": { + "type": "string", + "const": "function", + "default": "function" + }, + "function": { + "$ref": "#/components/schemas/OpenAIChatCompletionToolCallFunction" + } + }, + "additionalProperties": false, + "required": [ + "type" + ], + "title": "OpenAIChatCompletionToolCall" + }, + "OpenAIChatCompletionToolCallFunction": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "additionalProperties": false, + "title": "OpenAIChatCompletionToolCallFunction" + }, 
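The hunk above switches the chat-completions response schema to a oneOf of OpenAIChatCompletion and OpenAIChatCompletionChunk. As a minimal client-side sketch (not part of this diff; the base URL, API key, and model id are assumptions), any OpenAI-compatible client can consume both shapes:

```python
# Minimal sketch: the endpoint returns a single completion object, or a stream
# of chunks when stream=True. The base_url, api_key, and model id are assumptions.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

# Non-streaming: one OpenAIChatCompletion-shaped object
completion = client.chat.completions.create(
    model="meta-llama/Llama-3.3-70B-Instruct",
    messages=[{"role": "user", "content": "Say hello"}],
)
print(completion.choices[0].message.content)

# Streaming: an iterator of OpenAIChatCompletionChunk-shaped objects
stream = client.chat.completions.create(
    model="meta-llama/Llama-3.3-70B-Instruct",
    messages=[{"role": "user", "content": "Say hello"}],
    stream=True,
)
for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="")
```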
"OpenAIDeveloperMessageParam": { "type": "object", "properties": { @@ -8890,7 +8999,17 @@ "description": "Must be \"developer\" to identify this as a developer message" }, "content": { - "$ref": "#/components/schemas/InterleavedContent", + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam" + } + } + ], "description": "The content of the developer message" }, "name": { @@ -8906,6 +9025,66 @@ "title": "OpenAIDeveloperMessageParam", "description": "A message from the developer in an OpenAI-compatible chat completion request." }, + "OpenAIImageURL": { + "type": "object", + "properties": { + "url": { + "type": "string" + }, + "detail": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "url" + ], + "title": "OpenAIImageURL" + }, + "OpenAIJSONSchema": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "description": { + "type": "string" + }, + "strict": { + "type": "boolean" + }, + "schema": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "name" + ], + "title": "OpenAIJSONSchema" + }, "OpenAIMessageParam": { "oneOf": [ { @@ -8935,6 +9114,76 @@ } } }, + "OpenAIResponseFormatJSONObject": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "json_object", + "default": "json_object" + } + }, + "additionalProperties": false, + "required": [ + "type" + ], + "title": "OpenAIResponseFormatJSONObject" + }, + "OpenAIResponseFormatJSONSchema": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "json_schema", + "default": "json_schema" + }, + "json_schema": { + "$ref": "#/components/schemas/OpenAIJSONSchema" + } + }, + "additionalProperties": false, + "required": [ + "type", + "json_schema" + ], + "title": "OpenAIResponseFormatJSONSchema" + }, + "OpenAIResponseFormatParam": { + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIResponseFormatText" + }, + { + "$ref": "#/components/schemas/OpenAIResponseFormatJSONSchema" + }, + { + "$ref": "#/components/schemas/OpenAIResponseFormatJSONObject" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "text": "#/components/schemas/OpenAIResponseFormatText", + "json_schema": "#/components/schemas/OpenAIResponseFormatJSONSchema", + "json_object": "#/components/schemas/OpenAIResponseFormatJSONObject" + } + } + }, + "OpenAIResponseFormatText": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "text", + "default": "text" + } + }, + "additionalProperties": false, + "required": [ + "type" + ], + "title": "OpenAIResponseFormatText" + }, "OpenAISystemMessageParam": { "type": "object", "properties": { @@ -8945,7 +9194,17 @@ "description": "Must be \"system\" to identify this as a system message" }, "content": { - "$ref": "#/components/schemas/InterleavedContent", + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam" + } + } + ], "description": "The content of the \"system prompt\". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions)." 
}, "name": { @@ -8975,7 +9234,17 @@ "description": "Unique identifier for the tool call this response is for" }, "content": { - "$ref": "#/components/schemas/InterleavedContent", + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam" + } + } + ], "description": "The response content from the tool" } }, @@ -8998,7 +9267,17 @@ "description": "Must be \"user\" to identify this as a user message" }, "content": { - "$ref": "#/components/schemas/InterleavedContent", + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam" + } + } + ], "description": "The content of the message, which can include text and other media" }, "name": { @@ -9126,10 +9405,7 @@ "description": "(Optional) The penalty for repeated tokens" }, "response_format": { - "type": "object", - "additionalProperties": { - "type": "string" - }, + "$ref": "#/components/schemas/OpenAIResponseFormatParam", "description": "(Optional) The response format to use" }, "seed": { @@ -9306,6 +9582,46 @@ "title": "OpenAIChatCompletion", "description": "Response from an OpenAI-compatible chat completion request." }, + "OpenAIChatCompletionChunk": { + "type": "object", + "properties": { + "id": { + "type": "string", + "description": "The ID of the chat completion" + }, + "choices": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIChunkChoice" + }, + "description": "List of choices" + }, + "object": { + "type": "string", + "const": "chat.completion.chunk", + "default": "chat.completion.chunk", + "description": "The object type, which will be \"chat.completion.chunk\"" + }, + "created": { + "type": "integer", + "description": "The Unix timestamp in seconds when the chat completion was created" + }, + "model": { + "type": "string", + "description": "The model that was used to generate the chat completion" + } + }, + "additionalProperties": false, + "required": [ + "id", + "choices", + "object", + "created", + "model" + ], + "title": "OpenAIChatCompletionChunk", + "description": "Chunk from a streaming response to an OpenAI-compatible chat completion request." + }, "OpenAIChoice": { "type": "object", "properties": { @@ -9318,10 +9634,12 @@ "description": "The reason the model stopped generating" }, "index": { - "type": "integer" + "type": "integer", + "description": "The index of the choice" }, "logprobs": { - "$ref": "#/components/schemas/OpenAIChoiceLogprobs" + "$ref": "#/components/schemas/OpenAIChoiceLogprobs", + "description": "(Optional) The log probabilities for the tokens in the message" } }, "additionalProperties": false, @@ -9333,6 +9651,33 @@ "title": "OpenAIChoice", "description": "A choice from an OpenAI-compatible chat completion response." }, + "OpenAIChoiceDelta": { + "type": "object", + "properties": { + "content": { + "type": "string", + "description": "(Optional) The content of the delta" + }, + "refusal": { + "type": "string", + "description": "(Optional) The refusal of the delta" + }, + "role": { + "type": "string", + "description": "(Optional) The role of the delta" + }, + "tool_calls": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIChatCompletionToolCall" + }, + "description": "(Optional) The tool calls of the delta" + } + }, + "additionalProperties": false, + "title": "OpenAIChoiceDelta", + "description": "A delta from an OpenAI-compatible chat completion streaming response." 
+ }, "OpenAIChoiceLogprobs": { "type": "object", "properties": { @@ -9340,19 +9685,50 @@ "type": "array", "items": { "$ref": "#/components/schemas/OpenAITokenLogProb" - } + }, + "description": "(Optional) The log probabilities for the tokens in the message" }, "refusal": { "type": "array", "items": { "$ref": "#/components/schemas/OpenAITokenLogProb" - } + }, + "description": "(Optional) The log probabilities for the tokens in the message" } }, "additionalProperties": false, "title": "OpenAIChoiceLogprobs", "description": "The log probabilities for the tokens in the message from an OpenAI-compatible chat completion response." }, + "OpenAIChunkChoice": { + "type": "object", + "properties": { + "delta": { + "$ref": "#/components/schemas/OpenAIChoiceDelta", + "description": "The delta from the chunk" + }, + "finish_reason": { + "type": "string", + "description": "The reason the model stopped generating" + }, + "index": { + "type": "integer", + "description": "The index of the choice" + }, + "logprobs": { + "$ref": "#/components/schemas/OpenAIChoiceLogprobs", + "description": "(Optional) The log probabilities for the tokens in the message" + } + }, + "additionalProperties": false, + "required": [ + "delta", + "finish_reason", + "index" + ], + "title": "OpenAIChunkChoice", + "description": "A chunk choice from an OpenAI-compatible chat completion streaming response." + }, "OpenAITokenLogProb": { "type": "object", "properties": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 6c99c9155..cf657bff9 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -2135,11 +2135,15 @@ paths: post: responses: '200': - description: OK + description: >- + Response from an OpenAI-compatible chat completion request. **OR** Chunk + from a streaming response to an OpenAI-compatible chat completion request. content: application/json: schema: - $ref: '#/components/schemas/OpenAIChatCompletion' + oneOf: + - $ref: '#/components/schemas/OpenAIChatCompletion' + - $ref: '#/components/schemas/OpenAIChatCompletionChunk' '400': $ref: '#/components/responses/BadRequest400' '429': @@ -6073,7 +6077,11 @@ components: description: >- Must be "assistant" to identify this as the model's response content: - $ref: '#/components/schemas/InterleavedContent' + oneOf: + - type: string + - type: array + items: + $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam' description: The content of the model's response name: type: string @@ -6082,9 +6090,10 @@ components: tool_calls: type: array items: - $ref: '#/components/schemas/ToolCall' + $ref: '#/components/schemas/OpenAIChatCompletionToolCall' description: >- - List of tool calls. Each tool call is a ToolCall object. + List of tool calls. Each tool call is an OpenAIChatCompletionToolCall + object. additionalProperties: false required: - role @@ -6093,6 +6102,70 @@ components: description: >- A message containing the model's (assistant) response in an OpenAI-compatible chat completion request. 
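With the assistant message's tool_calls now typed as OpenAIChatCompletionToolCall (arguments carried as a JSON-encoded string) rather than ToolCall, a rough construction sketch using the models added later in this diff's inference.py looks like the following; the tool name and arguments are illustrative assumptions:

```python
# Illustrative sketch only; the tool name and arguments are hypothetical.
from llama_stack.apis.inference.inference import (
    OpenAIAssistantMessageParam,
    OpenAIChatCompletionToolCall,
    OpenAIChatCompletionToolCallFunction,
)

assistant_msg = OpenAIAssistantMessageParam(
    content="",  # content remains required alongside tool_calls
    tool_calls=[
        OpenAIChatCompletionToolCall(
            id="call_0",
            # arguments is a JSON-encoded string, matching the OpenAI wire format
            function=OpenAIChatCompletionToolCallFunction(
                name="get_weather",
                arguments='{"city": "Paris"}',
            ),
        )
    ],
)
```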
+ "OpenAIChatCompletionContentPartImageParam": + type: object + properties: + type: + type: string + const: image_url + default: image_url + image_url: + $ref: '#/components/schemas/OpenAIImageURL' + additionalProperties: false + required: + - type + - image_url + title: >- + OpenAIChatCompletionContentPartImageParam + OpenAIChatCompletionContentPartParam: + oneOf: + - $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam' + discriminator: + propertyName: type + mapping: + text: '#/components/schemas/OpenAIChatCompletionContentPartTextParam' + image_url: '#/components/schemas/OpenAIChatCompletionContentPartImageParam' + OpenAIChatCompletionContentPartTextParam: + type: object + properties: + type: + type: string + const: text + default: text + text: + type: string + additionalProperties: false + required: + - type + - text + title: OpenAIChatCompletionContentPartTextParam + OpenAIChatCompletionToolCall: + type: object + properties: + index: + type: integer + id: + type: string + type: + type: string + const: function + default: function + function: + $ref: '#/components/schemas/OpenAIChatCompletionToolCallFunction' + additionalProperties: false + required: + - type + title: OpenAIChatCompletionToolCall + OpenAIChatCompletionToolCallFunction: + type: object + properties: + name: + type: string + arguments: + type: string + additionalProperties: false + title: OpenAIChatCompletionToolCallFunction OpenAIDeveloperMessageParam: type: object properties: @@ -6103,7 +6176,11 @@ components: description: >- Must be "developer" to identify this as a developer message content: - $ref: '#/components/schemas/InterleavedContent' + oneOf: + - type: string + - type: array + items: + $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam' description: The content of the developer message name: type: string @@ -6116,6 +6193,40 @@ components: title: OpenAIDeveloperMessageParam description: >- A message from the developer in an OpenAI-compatible chat completion request. 
+ OpenAIImageURL: + type: object + properties: + url: + type: string + detail: + type: string + additionalProperties: false + required: + - url + title: OpenAIImageURL + OpenAIJSONSchema: + type: object + properties: + name: + type: string + description: + type: string + strict: + type: boolean + schema: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - name + title: OpenAIJSONSchema OpenAIMessageParam: oneOf: - $ref: '#/components/schemas/OpenAIUserMessageParam' @@ -6131,6 +6242,53 @@ components: assistant: '#/components/schemas/OpenAIAssistantMessageParam' tool: '#/components/schemas/OpenAIToolMessageParam' developer: '#/components/schemas/OpenAIDeveloperMessageParam' + OpenAIResponseFormatJSONObject: + type: object + properties: + type: + type: string + const: json_object + default: json_object + additionalProperties: false + required: + - type + title: OpenAIResponseFormatJSONObject + OpenAIResponseFormatJSONSchema: + type: object + properties: + type: + type: string + const: json_schema + default: json_schema + json_schema: + $ref: '#/components/schemas/OpenAIJSONSchema' + additionalProperties: false + required: + - type + - json_schema + title: OpenAIResponseFormatJSONSchema + OpenAIResponseFormatParam: + oneOf: + - $ref: '#/components/schemas/OpenAIResponseFormatText' + - $ref: '#/components/schemas/OpenAIResponseFormatJSONSchema' + - $ref: '#/components/schemas/OpenAIResponseFormatJSONObject' + discriminator: + propertyName: type + mapping: + text: '#/components/schemas/OpenAIResponseFormatText' + json_schema: '#/components/schemas/OpenAIResponseFormatJSONSchema' + json_object: '#/components/schemas/OpenAIResponseFormatJSONObject' + OpenAIResponseFormatText: + type: object + properties: + type: + type: string + const: text + default: text + additionalProperties: false + required: + - type + title: OpenAIResponseFormatText OpenAISystemMessageParam: type: object properties: @@ -6141,7 +6299,11 @@ components: description: >- Must be "system" to identify this as a system message content: - $ref: '#/components/schemas/InterleavedContent' + oneOf: + - type: string + - type: array + items: + $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam' description: >- The content of the "system prompt". If multiple system messages are provided, they are concatenated. 
The underlying Llama Stack code may also add other @@ -6171,7 +6333,11 @@ components: description: >- Unique identifier for the tool call this response is for content: - $ref: '#/components/schemas/InterleavedContent' + oneOf: + - type: string + - type: array + items: + $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam' description: The response content from the tool additionalProperties: false required: @@ -6192,7 +6358,11 @@ components: description: >- Must be "user" to identify this as a user message content: - $ref: '#/components/schemas/InterleavedContent' + oneOf: + - type: string + - type: array + items: + $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam' description: >- The content of the message, which can include text and other media name: @@ -6278,9 +6448,7 @@ components: description: >- (Optional) The penalty for repeated tokens response_format: - type: object - additionalProperties: - type: string + $ref: '#/components/schemas/OpenAIResponseFormatParam' description: (Optional) The response format to use seed: type: integer @@ -6386,6 +6554,41 @@ components: title: OpenAIChatCompletion description: >- Response from an OpenAI-compatible chat completion request. + OpenAIChatCompletionChunk: + type: object + properties: + id: + type: string + description: The ID of the chat completion + choices: + type: array + items: + $ref: '#/components/schemas/OpenAIChunkChoice' + description: List of choices + object: + type: string + const: chat.completion.chunk + default: chat.completion.chunk + description: >- + The object type, which will be "chat.completion.chunk" + created: + type: integer + description: >- + The Unix timestamp in seconds when the chat completion was created + model: + type: string + description: >- + The model that was used to generate the chat completion + additionalProperties: false + required: + - id + - choices + - object + - created + - model + title: OpenAIChatCompletionChunk + description: >- + Chunk from a streaming response to an OpenAI-compatible chat completion request. OpenAIChoice: type: object properties: @@ -6397,8 +6600,11 @@ components: description: The reason the model stopped generating index: type: integer + description: The index of the choice logprobs: $ref: '#/components/schemas/OpenAIChoiceLogprobs' + description: >- + (Optional) The log probabilities for the tokens in the message additionalProperties: false required: - message @@ -6407,6 +6613,27 @@ components: title: OpenAIChoice description: >- A choice from an OpenAI-compatible chat completion response. + OpenAIChoiceDelta: + type: object + properties: + content: + type: string + description: (Optional) The content of the delta + refusal: + type: string + description: (Optional) The refusal of the delta + role: + type: string + description: (Optional) The role of the delta + tool_calls: + type: array + items: + $ref: '#/components/schemas/OpenAIChatCompletionToolCall' + description: (Optional) The tool calls of the delta + additionalProperties: false + title: OpenAIChoiceDelta + description: >- + A delta from an OpenAI-compatible chat completion streaming response. 
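The response_format parameter shown above changes from a loose Dict[str, str] to the discriminated OpenAIResponseFormatParam union. A hedged sketch of building a json_schema response format with the new models (the schema contents are an illustrative assumption):

```python
# Sketch only; the schema contents are an illustrative assumption. Note that
# OpenAIJSONSchema is a TypedDict, so json_schema is passed as a plain dict.
from llama_stack.apis.inference.inference import OpenAIResponseFormatJSONSchema

response_format = OpenAIResponseFormatJSONSchema(
    json_schema={
        "name": "weather_report",
        "strict": True,
        "schema": {
            "type": "object",
            "properties": {"city": {"type": "string"}, "temp_c": {"type": "number"}},
            "required": ["city", "temp_c"],
        },
    }
)
# This value is what openai_chat_completion(..., response_format=...) now
# accepts in place of the previous Optional[Dict[str, str]].
```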
OpenAIChoiceLogprobs: type: object properties: @@ -6414,15 +6641,43 @@ components: type: array items: $ref: '#/components/schemas/OpenAITokenLogProb' + description: >- + (Optional) The log probabilities for the tokens in the message refusal: type: array items: $ref: '#/components/schemas/OpenAITokenLogProb' + description: >- + (Optional) The log probabilities for the tokens in the message additionalProperties: false title: OpenAIChoiceLogprobs description: >- The log probabilities for the tokens in the message from an OpenAI-compatible chat completion response. + OpenAIChunkChoice: + type: object + properties: + delta: + $ref: '#/components/schemas/OpenAIChoiceDelta' + description: The delta from the chunk + finish_reason: + type: string + description: The reason the model stopped generating + index: + type: integer + description: The index of the choice + logprobs: + $ref: '#/components/schemas/OpenAIChoiceLogprobs' + description: >- + (Optional) The log probabilities for the tokens in the message + additionalProperties: false + required: + - delta + - finish_reason + - index + title: OpenAIChunkChoice + description: >- + A chunk choice from an OpenAI-compatible chat completion streaming response. OpenAITokenLogProb: type: object properties: diff --git a/docs/source/distributions/self_hosted_distro/groq.md b/docs/source/distributions/self_hosted_distro/groq.md index 4f5a8a859..b18be1b2f 100644 --- a/docs/source/distributions/self_hosted_distro/groq.md +++ b/docs/source/distributions/self_hosted_distro/groq.md @@ -43,7 +43,9 @@ The following models are available by default: - `groq/llama-3.3-70b-versatile (aliases: meta-llama/Llama-3.3-70B-Instruct)` - `groq/llama-3.2-3b-preview (aliases: meta-llama/Llama-3.2-3B-Instruct)` - `groq/llama-4-scout-17b-16e-instruct (aliases: meta-llama/Llama-4-Scout-17B-16E-Instruct)` +- `groq/meta-llama/llama-4-scout-17b-16e-instruct (aliases: meta-llama/Llama-4-Scout-17B-16E-Instruct)` - `groq/llama-4-maverick-17b-128e-instruct (aliases: meta-llama/Llama-4-Maverick-17B-128E-Instruct)` +- `groq/meta-llama/llama-4-maverick-17b-128e-instruct (aliases: meta-llama/Llama-4-Maverick-17B-128E-Instruct)` ### Prerequisite: API Keys diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py index 21753ca23..596efb136 100644 --- a/llama_stack/apis/inference/inference.py +++ b/llama_stack/apis/inference/inference.py @@ -18,7 +18,7 @@ from typing import ( ) from pydantic import BaseModel, Field, field_validator -from typing_extensions import Annotated +from typing_extensions import Annotated, TypedDict from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent, InterleavedContentItem from llama_stack.apis.models import Model @@ -442,6 +442,37 @@ class EmbeddingsResponse(BaseModel): embeddings: List[List[float]] +@json_schema_type +class OpenAIChatCompletionContentPartTextParam(BaseModel): + type: Literal["text"] = "text" + text: str + + +@json_schema_type +class OpenAIImageURL(BaseModel): + url: str + detail: Optional[str] = None + + +@json_schema_type +class OpenAIChatCompletionContentPartImageParam(BaseModel): + type: Literal["image_url"] = "image_url" + image_url: OpenAIImageURL + + +OpenAIChatCompletionContentPartParam = Annotated[ + Union[ + OpenAIChatCompletionContentPartTextParam, + OpenAIChatCompletionContentPartImageParam, + ], + Field(discriminator="type"), +] +register_schema(OpenAIChatCompletionContentPartParam, name="OpenAIChatCompletionContentPartParam") + + +OpenAIChatCompletionMessageContent = 
Union[str, List[OpenAIChatCompletionContentPartParam]] + + @json_schema_type class OpenAIUserMessageParam(BaseModel): """A message from the user in an OpenAI-compatible chat completion request. @@ -452,7 +483,7 @@ class OpenAIUserMessageParam(BaseModel): """ role: Literal["user"] = "user" - content: InterleavedContent + content: OpenAIChatCompletionMessageContent name: Optional[str] = None @@ -466,10 +497,24 @@ class OpenAISystemMessageParam(BaseModel): """ role: Literal["system"] = "system" - content: InterleavedContent + content: OpenAIChatCompletionMessageContent name: Optional[str] = None +@json_schema_type +class OpenAIChatCompletionToolCallFunction(BaseModel): + name: Optional[str] = None + arguments: Optional[str] = None + + +@json_schema_type +class OpenAIChatCompletionToolCall(BaseModel): + index: Optional[int] = None + id: Optional[str] = None + type: Literal["function"] = "function" + function: Optional[OpenAIChatCompletionToolCallFunction] = None + + @json_schema_type class OpenAIAssistantMessageParam(BaseModel): """A message containing the model's (assistant) response in an OpenAI-compatible chat completion request. @@ -477,13 +522,13 @@ class OpenAIAssistantMessageParam(BaseModel): :param role: Must be "assistant" to identify this as the model's response :param content: The content of the model's response :param name: (Optional) The name of the assistant message participant. - :param tool_calls: List of tool calls. Each tool call is a ToolCall object. + :param tool_calls: List of tool calls. Each tool call is an OpenAIChatCompletionToolCall object. """ role: Literal["assistant"] = "assistant" - content: InterleavedContent + content: OpenAIChatCompletionMessageContent name: Optional[str] = None - tool_calls: Optional[List[ToolCall]] = Field(default_factory=list) + tool_calls: Optional[List[OpenAIChatCompletionToolCall]] = Field(default_factory=list) @json_schema_type @@ -497,7 +542,7 @@ class OpenAIToolMessageParam(BaseModel): role: Literal["tool"] = "tool" tool_call_id: str - content: InterleavedContent + content: OpenAIChatCompletionMessageContent @json_schema_type @@ -510,7 +555,7 @@ class OpenAIDeveloperMessageParam(BaseModel): """ role: Literal["developer"] = "developer" - content: InterleavedContent + content: OpenAIChatCompletionMessageContent name: Optional[str] = None @@ -527,6 +572,46 @@ OpenAIMessageParam = Annotated[ register_schema(OpenAIMessageParam, name="OpenAIMessageParam") +@json_schema_type +class OpenAIResponseFormatText(BaseModel): + type: Literal["text"] = "text" + + +@json_schema_type +class OpenAIJSONSchema(TypedDict, total=False): + name: str + description: Optional[str] = None + strict: Optional[bool] = None + + # Pydantic BaseModel cannot be used with a schema param, since it already + # has one. And, we don't want to alias here because then have to handle + # that alias when converting to OpenAI params. So, to support schema, + # we use a TypedDict. 
+ schema: Optional[Dict[str, Any]] = None + + +@json_schema_type +class OpenAIResponseFormatJSONSchema(BaseModel): + type: Literal["json_schema"] = "json_schema" + json_schema: OpenAIJSONSchema + + +@json_schema_type +class OpenAIResponseFormatJSONObject(BaseModel): + type: Literal["json_object"] = "json_object" + + +OpenAIResponseFormatParam = Annotated[ + Union[ + OpenAIResponseFormatText, + OpenAIResponseFormatJSONSchema, + OpenAIResponseFormatJSONObject, + ], + Field(discriminator="type"), +] +register_schema(OpenAIResponseFormatParam, name="OpenAIResponseFormatParam") + + @json_schema_type class OpenAITopLogProb(BaseModel): """The top log probability for a token from an OpenAI-compatible chat completion response. @@ -561,22 +646,54 @@ class OpenAITokenLogProb(BaseModel): class OpenAIChoiceLogprobs(BaseModel): """The log probabilities for the tokens in the message from an OpenAI-compatible chat completion response. - :content: (Optional) The log probabilities for the tokens in the message - :refusal: (Optional) The log probabilities for the tokens in the message + :param content: (Optional) The log probabilities for the tokens in the message + :param refusal: (Optional) The log probabilities for the tokens in the message """ content: Optional[List[OpenAITokenLogProb]] = None refusal: Optional[List[OpenAITokenLogProb]] = None +@json_schema_type +class OpenAIChoiceDelta(BaseModel): + """A delta from an OpenAI-compatible chat completion streaming response. + + :param content: (Optional) The content of the delta + :param refusal: (Optional) The refusal of the delta + :param role: (Optional) The role of the delta + :param tool_calls: (Optional) The tool calls of the delta + """ + + content: Optional[str] = None + refusal: Optional[str] = None + role: Optional[str] = None + tool_calls: Optional[List[OpenAIChatCompletionToolCall]] = None + + +@json_schema_type +class OpenAIChunkChoice(BaseModel): + """A chunk choice from an OpenAI-compatible chat completion streaming response. + + :param delta: The delta from the chunk + :param finish_reason: The reason the model stopped generating + :param index: The index of the choice + :param logprobs: (Optional) The log probabilities for the tokens in the message + """ + + delta: OpenAIChoiceDelta + finish_reason: str + index: int + logprobs: Optional[OpenAIChoiceLogprobs] = None + + @json_schema_type class OpenAIChoice(BaseModel): """A choice from an OpenAI-compatible chat completion response. :param message: The message from the model :param finish_reason: The reason the model stopped generating - :index: The index of the choice - :logprobs: (Optional) The log probabilities for the tokens in the message + :param index: The index of the choice + :param logprobs: (Optional) The log probabilities for the tokens in the message """ message: OpenAIMessageParam @@ -603,6 +720,24 @@ class OpenAIChatCompletion(BaseModel): model: str +@json_schema_type +class OpenAIChatCompletionChunk(BaseModel): + """Chunk from a streaming response to an OpenAI-compatible chat completion request. 
+ + :param id: The ID of the chat completion + :param choices: List of choices + :param object: The object type, which will be "chat.completion.chunk" + :param created: The Unix timestamp in seconds when the chat completion was created + :param model: The model that was used to generate the chat completion + """ + + id: str + choices: List[OpenAIChunkChoice] + object: Literal["chat.completion.chunk"] = "chat.completion.chunk" + created: int + model: str + + @json_schema_type class OpenAICompletionLogprobs(BaseModel): """The log probabilities for the tokens in the message from an OpenAI-compatible completion response. @@ -872,7 +1007,7 @@ class Inference(Protocol): n: Optional[int] = None, parallel_tool_calls: Optional[bool] = None, presence_penalty: Optional[float] = None, - response_format: Optional[Dict[str, str]] = None, + response_format: Optional[OpenAIResponseFormatParam] = None, seed: Optional[int] = None, stop: Optional[Union[str, List[str]]] = None, stream: Optional[bool] = None, @@ -883,7 +1018,7 @@ class Inference(Protocol): top_logprobs: Optional[int] = None, top_p: Optional[float] = None, user: Optional[str] = None, - ) -> OpenAIChatCompletion: + ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]: """Generate an OpenAI-compatible chat completion for the given messages using the specified model. :param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint. diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py index cdf91e052..17aecdaf8 100644 --- a/llama_stack/distribution/routers/routers.py +++ b/llama_stack/distribution/routers/routers.py @@ -38,7 +38,13 @@ from llama_stack.apis.inference import ( ToolDefinition, ToolPromptFormat, ) -from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam +from llama_stack.apis.inference.inference import ( + OpenAIChatCompletion, + OpenAIChatCompletionChunk, + OpenAICompletion, + OpenAIMessageParam, + OpenAIResponseFormatParam, +) from llama_stack.apis.models import Model, ModelType from llama_stack.apis.safety import RunShieldResponse, Safety from llama_stack.apis.scoring import ( @@ -531,7 +537,7 @@ class InferenceRouter(Inference): n: Optional[int] = None, parallel_tool_calls: Optional[bool] = None, presence_penalty: Optional[float] = None, - response_format: Optional[Dict[str, str]] = None, + response_format: Optional[OpenAIResponseFormatParam] = None, seed: Optional[int] = None, stop: Optional[Union[str, List[str]]] = None, stream: Optional[bool] = None, @@ -542,7 +548,7 @@ class InferenceRouter(Inference): top_logprobs: Optional[int] = None, top_p: Optional[float] = None, user: Optional[str] = None, - ) -> OpenAIChatCompletion: + ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]: logger.debug( f"InferenceRouter.openai_chat_completion: {model=}, {stream=}, {messages=}", ) diff --git a/llama_stack/models/llama/llama3/tool_utils.py b/llama_stack/models/llama/llama3/tool_utils.py index ef39ba0a5..91b46ec98 100644 --- a/llama_stack/models/llama/llama3/tool_utils.py +++ b/llama_stack/models/llama/llama3/tool_utils.py @@ -204,7 +204,9 @@ class ToolUtils: return None elif is_json(message_body): response = json.loads(message_body) - if ("type" in response and response["type"] == "function") or ("name" in response): + if ("type" in response and response["type"] == "function") or ( + "name" in response and "parameters" in 
response + ): function_name = response["name"] args = response["parameters"] return function_name, args diff --git a/llama_stack/providers/inline/inference/meta_reference/inference.py b/llama_stack/providers/inline/inference/meta_reference/inference.py index 0b56ba1f7..2b9a27982 100644 --- a/llama_stack/providers/inline/inference/meta_reference/inference.py +++ b/llama_stack/providers/inline/inference/meta_reference/inference.py @@ -59,8 +59,8 @@ from llama_stack.providers.utils.inference.model_registry import ( build_hf_repo_model_entry, ) from llama_stack.providers.utils.inference.openai_compat import ( - OpenAIChatCompletionUnsupportedMixin, - OpenAICompletionUnsupportedMixin, + OpenAIChatCompletionToLlamaStackMixin, + OpenAICompletionToLlamaStackMixin, ) from llama_stack.providers.utils.inference.prompt_adapter import ( augment_content_with_response_format_prompt, @@ -83,8 +83,8 @@ def llama_builder_fn(config: MetaReferenceInferenceConfig, model_id: str, llama_ class MetaReferenceInferenceImpl( - OpenAICompletionUnsupportedMixin, - OpenAIChatCompletionUnsupportedMixin, + OpenAICompletionToLlamaStackMixin, + OpenAIChatCompletionToLlamaStackMixin, SentenceTransformerEmbeddingMixin, Inference, ModelsProtocolPrivate, diff --git a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py index 5bc20e3c2..d717d055f 100644 --- a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py +++ b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py @@ -25,8 +25,8 @@ from llama_stack.providers.utils.inference.embedding_mixin import ( SentenceTransformerEmbeddingMixin, ) from llama_stack.providers.utils.inference.openai_compat import ( - OpenAIChatCompletionUnsupportedMixin, - OpenAICompletionUnsupportedMixin, + OpenAIChatCompletionToLlamaStackMixin, + OpenAICompletionToLlamaStackMixin, ) from .config import SentenceTransformersInferenceConfig @@ -35,8 +35,8 @@ log = logging.getLogger(__name__) class SentenceTransformersInferenceImpl( - OpenAIChatCompletionUnsupportedMixin, - OpenAICompletionUnsupportedMixin, + OpenAIChatCompletionToLlamaStackMixin, + OpenAICompletionToLlamaStackMixin, SentenceTransformerEmbeddingMixin, Inference, ModelsProtocolPrivate, diff --git a/llama_stack/providers/inline/inference/vllm/vllm.py b/llama_stack/providers/inline/inference/vllm/vllm.py index 085c79d6b..9d742c39c 100644 --- a/llama_stack/providers/inline/inference/vllm/vllm.py +++ b/llama_stack/providers/inline/inference/vllm/vllm.py @@ -66,10 +66,10 @@ from llama_stack.providers.utils.inference.model_registry import ( ModelsProtocolPrivate, ) from llama_stack.providers.utils.inference.openai_compat import ( - OpenAIChatCompletionUnsupportedMixin, + OpenAIChatCompletionToLlamaStackMixin, OpenAICompatCompletionChoice, OpenAICompatCompletionResponse, - OpenAICompletionUnsupportedMixin, + OpenAICompletionToLlamaStackMixin, get_stop_reason, process_chat_completion_stream_response, ) @@ -176,8 +176,8 @@ def _convert_sampling_params( class VLLMInferenceImpl( Inference, - OpenAIChatCompletionUnsupportedMixin, - OpenAICompletionUnsupportedMixin, + OpenAIChatCompletionToLlamaStackMixin, + OpenAICompletionToLlamaStackMixin, ModelsProtocolPrivate, ): """ diff --git a/llama_stack/providers/remote/inference/bedrock/bedrock.py b/llama_stack/providers/remote/inference/bedrock/bedrock.py index 0a485da8f..f8dbcf31a 100644 --- 
a/llama_stack/providers/remote/inference/bedrock/bedrock.py +++ b/llama_stack/providers/remote/inference/bedrock/bedrock.py @@ -36,10 +36,10 @@ from llama_stack.providers.utils.inference.model_registry import ( ModelRegistryHelper, ) from llama_stack.providers.utils.inference.openai_compat import ( - OpenAIChatCompletionUnsupportedMixin, + OpenAIChatCompletionToLlamaStackMixin, OpenAICompatCompletionChoice, OpenAICompatCompletionResponse, - OpenAICompletionUnsupportedMixin, + OpenAICompletionToLlamaStackMixin, get_sampling_strategy_options, process_chat_completion_response, process_chat_completion_stream_response, @@ -56,8 +56,8 @@ from .models import MODEL_ENTRIES class BedrockInferenceAdapter( ModelRegistryHelper, Inference, - OpenAIChatCompletionUnsupportedMixin, - OpenAICompletionUnsupportedMixin, + OpenAIChatCompletionToLlamaStackMixin, + OpenAICompletionToLlamaStackMixin, ): def __init__(self, config: BedrockConfig) -> None: ModelRegistryHelper.__init__(self, MODEL_ENTRIES) diff --git a/llama_stack/providers/remote/inference/cerebras/cerebras.py b/llama_stack/providers/remote/inference/cerebras/cerebras.py index 5e0a5b484..3156601be 100644 --- a/llama_stack/providers/remote/inference/cerebras/cerebras.py +++ b/llama_stack/providers/remote/inference/cerebras/cerebras.py @@ -34,8 +34,8 @@ from llama_stack.providers.utils.inference.model_registry import ( ModelRegistryHelper, ) from llama_stack.providers.utils.inference.openai_compat import ( - OpenAIChatCompletionUnsupportedMixin, - OpenAICompletionUnsupportedMixin, + OpenAIChatCompletionToLlamaStackMixin, + OpenAICompletionToLlamaStackMixin, get_sampling_options, process_chat_completion_response, process_chat_completion_stream_response, @@ -54,8 +54,8 @@ from .models import MODEL_ENTRIES class CerebrasInferenceAdapter( ModelRegistryHelper, Inference, - OpenAIChatCompletionUnsupportedMixin, - OpenAICompletionUnsupportedMixin, + OpenAIChatCompletionToLlamaStackMixin, + OpenAICompletionToLlamaStackMixin, ): def __init__(self, config: CerebrasImplConfig) -> None: ModelRegistryHelper.__init__( diff --git a/llama_stack/providers/remote/inference/databricks/databricks.py b/llama_stack/providers/remote/inference/databricks/databricks.py index a10878b27..27d96eb7d 100644 --- a/llama_stack/providers/remote/inference/databricks/databricks.py +++ b/llama_stack/providers/remote/inference/databricks/databricks.py @@ -34,8 +34,8 @@ from llama_stack.providers.utils.inference.model_registry import ( build_hf_repo_model_entry, ) from llama_stack.providers.utils.inference.openai_compat import ( - OpenAIChatCompletionUnsupportedMixin, - OpenAICompletionUnsupportedMixin, + OpenAIChatCompletionToLlamaStackMixin, + OpenAICompletionToLlamaStackMixin, get_sampling_options, process_chat_completion_response, process_chat_completion_stream_response, @@ -61,8 +61,8 @@ model_entries = [ class DatabricksInferenceAdapter( ModelRegistryHelper, Inference, - OpenAIChatCompletionUnsupportedMixin, - OpenAICompletionUnsupportedMixin, + OpenAIChatCompletionToLlamaStackMixin, + OpenAICompletionToLlamaStackMixin, ): def __init__(self, config: DatabricksImplConfig) -> None: ModelRegistryHelper.__init__(self, model_entries=model_entries) diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py index b59e9f2cb..48c163c87 100644 --- a/llama_stack/providers/remote/inference/fireworks/fireworks.py +++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py @@ -4,7 +4,7 @@ # This source code 
is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Any, AsyncGenerator, Dict, List, Optional, Union +from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union from fireworks.client import Fireworks from openai import AsyncOpenAI @@ -32,13 +32,20 @@ from llama_stack.apis.inference import ( ToolDefinition, ToolPromptFormat, ) -from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam +from llama_stack.apis.inference.inference import ( + OpenAIChatCompletion, + OpenAIChatCompletionChunk, + OpenAICompletion, + OpenAIMessageParam, + OpenAIResponseFormatParam, +) from llama_stack.distribution.request_headers import NeedsRequestProviderData from llama_stack.log import get_logger from llama_stack.providers.utils.inference.model_registry import ( ModelRegistryHelper, ) from llama_stack.providers.utils.inference.openai_compat import ( + OpenAIChatCompletionToLlamaStackMixin, convert_message_to_openai_dict, get_sampling_options, prepare_openai_completion_params, @@ -301,6 +308,11 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv prompt_logprobs: Optional[int] = None, ) -> OpenAICompletion: model_obj = await self.model_store.get_model(model) + + # Fireworks always prepends with BOS + if isinstance(prompt, str) and prompt.startswith("<|begin_of_text|>"): + prompt = prompt[len("<|begin_of_text|>") :] + params = await prepare_openai_completion_params( model=model_obj.provider_resource_id, prompt=prompt, @@ -320,6 +332,7 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv top_p=top_p, user=user, ) + return await self._get_openai_client().completions.create(**params) async def openai_chat_completion( @@ -336,7 +349,7 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv n: Optional[int] = None, parallel_tool_calls: Optional[bool] = None, presence_penalty: Optional[float] = None, - response_format: Optional[Dict[str, str]] = None, + response_format: Optional[OpenAIResponseFormatParam] = None, seed: Optional[int] = None, stop: Optional[Union[str, List[str]]] = None, stream: Optional[bool] = None, @@ -347,10 +360,9 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv top_logprobs: Optional[int] = None, top_p: Optional[float] = None, user: Optional[str] = None, - ) -> OpenAIChatCompletion: + ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]: model_obj = await self.model_store.get_model(model) params = await prepare_openai_completion_params( - model=model_obj.provider_resource_id, messages=messages, frequency_penalty=frequency_penalty, function_call=function_call, @@ -374,4 +386,12 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv top_p=top_p, user=user, ) - return await self._get_openai_client().chat.completions.create(**params) + + # Divert Llama Models through Llama Stack inference APIs because + # Fireworks chat completions OpenAI-compatible API does not support + # tool calls properly. 
+ llama_model = self.get_llama_model(model_obj.provider_resource_id) + if llama_model: + return await OpenAIChatCompletionToLlamaStackMixin.openai_chat_completion(self, model=model, **params) + + return await self._get_openai_client().chat.completions.create(model=model_obj.provider_resource_id, **params) diff --git a/llama_stack/providers/remote/inference/groq/groq.py b/llama_stack/providers/remote/inference/groq/groq.py index c8789434f..f3f14e9af 100644 --- a/llama_stack/providers/remote/inference/groq/groq.py +++ b/llama_stack/providers/remote/inference/groq/groq.py @@ -4,8 +4,24 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import Any, AsyncIterator, Dict, List, Optional, Union + +from openai import AsyncOpenAI + +from llama_stack.apis.inference.inference import ( + OpenAIChatCompletion, + OpenAIChatCompletionChunk, + OpenAIChoiceDelta, + OpenAIChunkChoice, + OpenAIMessageParam, + OpenAIResponseFormatParam, + OpenAISystemMessageParam, +) from llama_stack.providers.remote.inference.groq.config import GroqConfig from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin +from llama_stack.providers.utils.inference.openai_compat import ( + prepare_openai_completion_params, +) from .models import MODEL_ENTRIES @@ -21,9 +37,129 @@ class GroqInferenceAdapter(LiteLLMOpenAIMixin): provider_data_api_key_field="groq_api_key", ) self.config = config + self._openai_client = None async def initialize(self): await super().initialize() async def shutdown(self): await super().shutdown() + if self._openai_client: + await self._openai_client.close() + self._openai_client = None + + def _get_openai_client(self) -> AsyncOpenAI: + if not self._openai_client: + self._openai_client = AsyncOpenAI( + base_url=f"{self.config.url}/openai/v1", + api_key=self.config.api_key, + ) + return self._openai_client + + async def openai_chat_completion( + self, + model: str, + messages: List[OpenAIMessageParam], + frequency_penalty: Optional[float] = None, + function_call: Optional[Union[str, Dict[str, Any]]] = None, + functions: Optional[List[Dict[str, Any]]] = None, + logit_bias: Optional[Dict[str, float]] = None, + logprobs: Optional[bool] = None, + max_completion_tokens: Optional[int] = None, + max_tokens: Optional[int] = None, + n: Optional[int] = None, + parallel_tool_calls: Optional[bool] = None, + presence_penalty: Optional[float] = None, + response_format: Optional[OpenAIResponseFormatParam] = None, + seed: Optional[int] = None, + stop: Optional[Union[str, List[str]]] = None, + stream: Optional[bool] = None, + stream_options: Optional[Dict[str, Any]] = None, + temperature: Optional[float] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + top_logprobs: Optional[int] = None, + top_p: Optional[float] = None, + user: Optional[str] = None, + ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]: + model_obj = await self.model_store.get_model(model) + + # Groq does not support json_schema response format, so we need to convert it to json_object + if response_format and response_format.type == "json_schema": + response_format.type = "json_object" + schema = response_format.json_schema.get("schema", {}) + response_format.json_schema = None + json_instructions = f"\nYour response should be a JSON object that matches the following schema: {schema}" + if messages and messages[0].role == "system": + messages[0].content 
= messages[0].content + json_instructions + else: + messages.insert(0, OpenAISystemMessageParam(content=json_instructions)) + + # Groq returns a 400 error if tools are provided but none are called + # So, set tool_choice to "required" to attempt to force a call + if tools and (not tool_choice or tool_choice == "auto"): + tool_choice = "required" + + params = await prepare_openai_completion_params( + model=model_obj.provider_resource_id.replace("groq/", ""), + messages=messages, + frequency_penalty=frequency_penalty, + function_call=function_call, + functions=functions, + logit_bias=logit_bias, + logprobs=logprobs, + max_completion_tokens=max_completion_tokens, + max_tokens=max_tokens, + n=n, + parallel_tool_calls=parallel_tool_calls, + presence_penalty=presence_penalty, + response_format=response_format, + seed=seed, + stop=stop, + stream=stream, + stream_options=stream_options, + temperature=temperature, + tool_choice=tool_choice, + tools=tools, + top_logprobs=top_logprobs, + top_p=top_p, + user=user, + ) + + # Groq does not support streaming requests that set response_format + fake_stream = False + if stream and response_format: + params["stream"] = False + fake_stream = True + + response = await self._get_openai_client().chat.completions.create(**params) + + if fake_stream: + chunk_choices = [] + for choice in response.choices: + delta = OpenAIChoiceDelta( + content=choice.message.content, + role=choice.message.role, + tool_calls=choice.message.tool_calls, + ) + chunk_choice = OpenAIChunkChoice( + delta=delta, + finish_reason=choice.finish_reason, + index=choice.index, + logprobs=None, + ) + chunk_choices.append(chunk_choice) + chunk = OpenAIChatCompletionChunk( + id=response.id, + choices=chunk_choices, + object="chat.completion.chunk", + created=response.created, + model=response.model, + ) + + async def _fake_stream_generator(): + yield chunk + + return _fake_stream_generator() + else: + return response diff --git a/llama_stack/providers/remote/inference/groq/models.py b/llama_stack/providers/remote/inference/groq/models.py index d0c10ca62..0b4b81cfe 100644 --- a/llama_stack/providers/remote/inference/groq/models.py +++ b/llama_stack/providers/remote/inference/groq/models.py @@ -39,8 +39,16 @@ MODEL_ENTRIES = [ "groq/llama-4-scout-17b-16e-instruct", CoreModelId.llama4_scout_17b_16e_instruct.value, ), + build_hf_repo_model_entry( + "groq/meta-llama/llama-4-scout-17b-16e-instruct", + CoreModelId.llama4_scout_17b_16e_instruct.value, + ), build_hf_repo_model_entry( "groq/llama-4-maverick-17b-128e-instruct", CoreModelId.llama4_maverick_17b_128e_instruct.value, ), + build_hf_repo_model_entry( + "groq/meta-llama/llama-4-maverick-17b-128e-instruct", + CoreModelId.llama4_maverick_17b_128e_instruct.value, + ), ] diff --git a/llama_stack/providers/remote/inference/nvidia/nvidia.py b/llama_stack/providers/remote/inference/nvidia/nvidia.py index d6f717719..15f0e72a1 100644 --- a/llama_stack/providers/remote/inference/nvidia/nvidia.py +++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py @@ -35,7 +35,13 @@ from llama_stack.apis.inference import ( ToolConfig, ToolDefinition, ) -from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam +from llama_stack.apis.inference.inference import ( + OpenAIChatCompletion, + OpenAIChatCompletionChunk, + OpenAICompletion, + OpenAIMessageParam, + OpenAIResponseFormatParam, +) from llama_stack.models.llama.datatypes import ToolPromptFormat from llama_stack.providers.utils.inference.model_registry import ( 
ModelRegistryHelper, @@ -329,7 +335,7 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper): n: Optional[int] = None, parallel_tool_calls: Optional[bool] = None, presence_penalty: Optional[float] = None, - response_format: Optional[Dict[str, str]] = None, + response_format: Optional[OpenAIResponseFormatParam] = None, seed: Optional[int] = None, stop: Optional[Union[str, List[str]]] = None, stream: Optional[bool] = None, @@ -340,7 +346,7 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper): top_logprobs: Optional[int] = None, top_p: Optional[float] = None, user: Optional[str] = None, - ) -> OpenAIChatCompletion: + ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]: provider_model_id = self.get_provider_model_id(model) params = await prepare_openai_completion_params( diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py index f84863385..804d7eab2 100644 --- a/llama_stack/providers/remote/inference/ollama/ollama.py +++ b/llama_stack/providers/remote/inference/ollama/ollama.py @@ -5,7 +5,7 @@ # the root directory of this source tree. -from typing import Any, AsyncGenerator, Dict, List, Optional, Union +from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union import httpx from ollama import AsyncClient @@ -39,7 +39,13 @@ from llama_stack.apis.inference import ( ToolDefinition, ToolPromptFormat, ) -from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam +from llama_stack.apis.inference.inference import ( + OpenAIChatCompletion, + OpenAIChatCompletionChunk, + OpenAICompletion, + OpenAIMessageParam, + OpenAIResponseFormatParam, +) from llama_stack.apis.models import Model, ModelType from llama_stack.log import get_logger from llama_stack.providers.datatypes import ( @@ -408,7 +414,7 @@ class OllamaInferenceAdapter( n: Optional[int] = None, parallel_tool_calls: Optional[bool] = None, presence_penalty: Optional[float] = None, - response_format: Optional[Dict[str, str]] = None, + response_format: Optional[OpenAIResponseFormatParam] = None, seed: Optional[int] = None, stop: Optional[Union[str, List[str]]] = None, stream: Optional[bool] = None, @@ -419,7 +425,7 @@ class OllamaInferenceAdapter( top_logprobs: Optional[int] = None, top_p: Optional[float] = None, user: Optional[str] = None, - ) -> OpenAIChatCompletion: + ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]: model_obj = await self._get_model(model) params = { k: v diff --git a/llama_stack/providers/remote/inference/passthrough/passthrough.py b/llama_stack/providers/remote/inference/passthrough/passthrough.py index 0eb38c395..af05320b0 100644 --- a/llama_stack/providers/remote/inference/passthrough/passthrough.py +++ b/llama_stack/providers/remote/inference/passthrough/passthrough.py @@ -4,7 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from typing import Any, AsyncGenerator, Dict, List, Optional, Union +from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union from llama_stack_client import AsyncLlamaStackClient @@ -26,7 +26,13 @@ from llama_stack.apis.inference import ( ToolDefinition, ToolPromptFormat, ) -from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam +from llama_stack.apis.inference.inference import ( + OpenAIChatCompletion, + OpenAIChatCompletionChunk, + OpenAICompletion, + OpenAIMessageParam, + OpenAIResponseFormatParam, +) from llama_stack.apis.models import Model from llama_stack.distribution.library_client import convert_pydantic_to_json_value, convert_to_pydantic from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper @@ -266,7 +272,7 @@ class PassthroughInferenceAdapter(Inference): n: Optional[int] = None, parallel_tool_calls: Optional[bool] = None, presence_penalty: Optional[float] = None, - response_format: Optional[Dict[str, str]] = None, + response_format: Optional[OpenAIResponseFormatParam] = None, seed: Optional[int] = None, stop: Optional[Union[str, List[str]]] = None, stream: Optional[bool] = None, @@ -277,7 +283,7 @@ class PassthroughInferenceAdapter(Inference): top_logprobs: Optional[int] = None, top_p: Optional[float] = None, user: Optional[str] = None, - ) -> OpenAIChatCompletion: + ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]: client = self._get_client() model_obj = await self.model_store.get_model(model) diff --git a/llama_stack/providers/remote/inference/runpod/runpod.py b/llama_stack/providers/remote/inference/runpod/runpod.py index 878460122..72cbead9b 100644 --- a/llama_stack/providers/remote/inference/runpod/runpod.py +++ b/llama_stack/providers/remote/inference/runpod/runpod.py @@ -12,8 +12,8 @@ from llama_stack.apis.inference import * # noqa: F403 # from llama_stack.providers.datatypes import ModelsProtocolPrivate from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper from llama_stack.providers.utils.inference.openai_compat import ( - OpenAIChatCompletionUnsupportedMixin, - OpenAICompletionUnsupportedMixin, + OpenAIChatCompletionToLlamaStackMixin, + OpenAICompletionToLlamaStackMixin, get_sampling_options, process_chat_completion_response, process_chat_completion_stream_response, @@ -43,8 +43,8 @@ RUNPOD_SUPPORTED_MODELS = { class RunpodInferenceAdapter( ModelRegistryHelper, Inference, - OpenAIChatCompletionUnsupportedMixin, - OpenAICompletionUnsupportedMixin, + OpenAIChatCompletionToLlamaStackMixin, + OpenAICompletionToLlamaStackMixin, ): def __init__(self, config: RunpodImplConfig) -> None: ModelRegistryHelper.__init__(self, stack_to_provider_models_map=RUNPOD_SUPPORTED_MODELS) diff --git a/llama_stack/providers/remote/inference/sambanova/sambanova.py b/llama_stack/providers/remote/inference/sambanova/sambanova.py index c503657eb..1665e72b8 100644 --- a/llama_stack/providers/remote/inference/sambanova/sambanova.py +++ b/llama_stack/providers/remote/inference/sambanova/sambanova.py @@ -42,8 +42,8 @@ from llama_stack.apis.inference import ( ) from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper from llama_stack.providers.utils.inference.openai_compat import ( - OpenAIChatCompletionUnsupportedMixin, - OpenAICompletionUnsupportedMixin, + OpenAIChatCompletionToLlamaStackMixin, + OpenAICompletionToLlamaStackMixin, process_chat_completion_stream_response, ) from 
llama_stack.providers.utils.inference.prompt_adapter import ( @@ -57,8 +57,8 @@ from .models import MODEL_ENTRIES class SambaNovaInferenceAdapter( ModelRegistryHelper, Inference, - OpenAIChatCompletionUnsupportedMixin, - OpenAICompletionUnsupportedMixin, + OpenAIChatCompletionToLlamaStackMixin, + OpenAICompletionToLlamaStackMixin, ): def __init__(self, config: SambaNovaImplConfig) -> None: ModelRegistryHelper.__init__(self, model_entries=MODEL_ENTRIES) diff --git a/llama_stack/providers/remote/inference/tgi/tgi.py b/llama_stack/providers/remote/inference/tgi/tgi.py index 8f5b5e3cc..4ee386a15 100644 --- a/llama_stack/providers/remote/inference/tgi/tgi.py +++ b/llama_stack/providers/remote/inference/tgi/tgi.py @@ -40,10 +40,10 @@ from llama_stack.providers.utils.inference.model_registry import ( build_hf_repo_model_entry, ) from llama_stack.providers.utils.inference.openai_compat import ( - OpenAIChatCompletionUnsupportedMixin, + OpenAIChatCompletionToLlamaStackMixin, OpenAICompatCompletionChoice, OpenAICompatCompletionResponse, - OpenAICompletionUnsupportedMixin, + OpenAICompletionToLlamaStackMixin, get_sampling_options, process_chat_completion_response, process_chat_completion_stream_response, @@ -73,8 +73,8 @@ def build_hf_repo_model_entries(): class _HfAdapter( Inference, - OpenAIChatCompletionUnsupportedMixin, - OpenAICompletionUnsupportedMixin, + OpenAIChatCompletionToLlamaStackMixin, + OpenAICompletionToLlamaStackMixin, ModelsProtocolPrivate, ): client: AsyncInferenceClient diff --git a/llama_stack/providers/remote/inference/together/together.py b/llama_stack/providers/remote/inference/together/together.py index 1615b8cd1..001e6aac4 100644 --- a/llama_stack/providers/remote/inference/together/together.py +++ b/llama_stack/providers/remote/inference/together/together.py @@ -4,7 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
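The RunPod, SambaNova and TGI hunks above swap the old `OpenAIChatCompletionUnsupportedMixin`/`OpenAICompletionUnsupportedMixin` (which simply raised an error) for `OpenAIChatCompletionToLlamaStackMixin`/`OpenAICompletionToLlamaStackMixin`, whose implementations appear later in `openai_compat.py` and translate OpenAI-style requests into the provider's native `chat_completion`/`completion` calls. A rough sketch of the composition pattern these hunks apply, using a made-up adapter class and empty model entries:

```python
from llama_stack.apis.inference import Inference
from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
from llama_stack.providers.utils.inference.openai_compat import (
    OpenAIChatCompletionToLlamaStackMixin,  # supplies openai_chat_completion()
    OpenAICompletionToLlamaStackMixin,      # supplies openai_completion()
)


class ExampleInferenceAdapter(
    ModelRegistryHelper,
    Inference,
    OpenAIChatCompletionToLlamaStackMixin,
    OpenAICompletionToLlamaStackMixin,
):
    """Hypothetical adapter: real providers also implement the native
    chat_completion()/completion() methods that the mixins delegate to."""

    def __init__(self, config=None) -> None:
        ModelRegistryHelper.__init__(self, model_entries=[])
```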
-from typing import Any, AsyncGenerator, Dict, List, Optional, Union +from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union from openai import AsyncOpenAI from together import AsyncTogether @@ -31,7 +31,13 @@ from llama_stack.apis.inference import ( ToolDefinition, ToolPromptFormat, ) -from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam +from llama_stack.apis.inference.inference import ( + OpenAIChatCompletion, + OpenAIChatCompletionChunk, + OpenAICompletion, + OpenAIMessageParam, + OpenAIResponseFormatParam, +) from llama_stack.distribution.request_headers import NeedsRequestProviderData from llama_stack.log import get_logger from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper @@ -315,7 +321,7 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi n: Optional[int] = None, parallel_tool_calls: Optional[bool] = None, presence_penalty: Optional[float] = None, - response_format: Optional[Dict[str, str]] = None, + response_format: Optional[OpenAIResponseFormatParam] = None, seed: Optional[int] = None, stop: Optional[Union[str, List[str]]] = None, stream: Optional[bool] = None, @@ -326,7 +332,7 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi top_logprobs: Optional[int] = None, top_p: Optional[float] = None, user: Optional[str] = None, - ) -> OpenAIChatCompletion: + ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]: model_obj = await self.model_store.get_model(model) params = await prepare_openai_completion_params( model=model_obj.provider_resource_id, @@ -353,4 +359,26 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi top_p=top_p, user=user, ) + if params.get("stream", True): + return self._stream_openai_chat_completion(params) return await self._get_openai_client().chat.completions.create(**params) # type: ignore + + async def _stream_openai_chat_completion(self, params: dict) -> AsyncGenerator: + # together.ai sometimes adds usage data to the stream, even if include_usage is False + # This causes an unexpected final chunk with empty choices array to be sent + # to clients that may not handle it gracefully. + include_usage = False + if params.get("stream_options", None): + include_usage = params["stream_options"].get("include_usage", False) + stream = await self._get_openai_client().chat.completions.create(**params) + + seen_finish_reason = False + async for chunk in stream: + # Final usage chunk with no choices that the user didn't request, so discard + if not include_usage and seen_finish_reason and len(chunk.choices) == 0: + break + yield chunk + for choice in chunk.choices: + if choice.finish_reason: + seen_finish_reason = True + break diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py index 0044d2e75..2b9eae1e9 100644 --- a/llama_stack/providers/remote/inference/vllm/vllm.py +++ b/llama_stack/providers/remote/inference/vllm/vllm.py @@ -5,7 +5,7 @@ # the root directory of this source tree. 
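The Together adapter above adds `_stream_openai_chat_completion` to work around together.ai occasionally appending a usage-only chunk (with an empty `choices` list) after the finish reason, even when `stream_options.include_usage` was not requested. The filtering logic can be exercised in isolation; the chunk objects below are simple stand-ins rather than real client types:

```python
import asyncio
from types import SimpleNamespace


async def fake_together_stream():
    # A content chunk, the finishing chunk, then the unrequested usage-only
    # chunk with no choices that together.ai sometimes appends.
    yield SimpleNamespace(choices=[SimpleNamespace(finish_reason=None)])
    yield SimpleNamespace(choices=[SimpleNamespace(finish_reason="stop")])
    yield SimpleNamespace(choices=[], usage=SimpleNamespace(total_tokens=42))


async def filter_stream(stream, include_usage: bool = False):
    # Same shape as _stream_openai_chat_completion, reduced to its essentials.
    seen_finish_reason = False
    async for chunk in stream:
        if not include_usage and seen_finish_reason and len(chunk.choices) == 0:
            break
        yield chunk
        for choice in chunk.choices:
            if choice.finish_reason:
                seen_finish_reason = True
                break


async def main():
    chunks = [chunk async for chunk in filter_stream(fake_together_stream())]
    assert len(chunks) == 2  # the trailing usage-only chunk was dropped


asyncio.run(main())
```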
import json import logging -from typing import Any, AsyncGenerator, Dict, List, Optional, Union +from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union import httpx from openai import AsyncOpenAI @@ -45,7 +45,12 @@ from llama_stack.apis.inference import ( ToolDefinition, ToolPromptFormat, ) -from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam +from llama_stack.apis.inference.inference import ( + OpenAIChatCompletion, + OpenAICompletion, + OpenAIMessageParam, + OpenAIResponseFormatParam, +) from llama_stack.apis.models import Model, ModelType from llama_stack.models.llama.datatypes import BuiltinTool, StopReason, ToolCall from llama_stack.models.llama.sku_list import all_registered_models @@ -487,7 +492,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): n: Optional[int] = None, parallel_tool_calls: Optional[bool] = None, presence_penalty: Optional[float] = None, - response_format: Optional[Dict[str, str]] = None, + response_format: Optional[OpenAIResponseFormatParam] = None, seed: Optional[int] = None, stop: Optional[Union[str, List[str]]] = None, stream: Optional[bool] = None, @@ -498,7 +503,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): top_logprobs: Optional[int] = None, top_p: Optional[float] = None, user: Optional[str] = None, - ) -> OpenAIChatCompletion: + ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]: model_obj = await self._get_model(model) params = await prepare_openai_completion_params( model=model_obj.provider_resource_id, diff --git a/llama_stack/providers/utils/inference/litellm_openai_mixin.py b/llama_stack/providers/utils/inference/litellm_openai_mixin.py index cd0f4ec67..efe7031f5 100644 --- a/llama_stack/providers/utils/inference/litellm_openai_mixin.py +++ b/llama_stack/providers/utils/inference/litellm_openai_mixin.py @@ -30,7 +30,13 @@ from llama_stack.apis.inference import ( ToolDefinition, ToolPromptFormat, ) -from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam +from llama_stack.apis.inference.inference import ( + OpenAIChatCompletion, + OpenAIChatCompletionChunk, + OpenAICompletion, + OpenAIMessageParam, + OpenAIResponseFormatParam, +) from llama_stack.apis.models.models import Model from llama_stack.distribution.request_headers import NeedsRequestProviderData from llama_stack.log import get_logger @@ -270,7 +276,7 @@ class LiteLLMOpenAIMixin( guided_choice: Optional[List[str]] = None, prompt_logprobs: Optional[int] = None, ) -> OpenAICompletion: - model_obj = await self._get_model(model) + model_obj = await self.model_store.get_model(model) params = await prepare_openai_completion_params( model=model_obj.provider_resource_id, prompt=prompt, @@ -292,7 +298,7 @@ class LiteLLMOpenAIMixin( guided_choice=guided_choice, prompt_logprobs=prompt_logprobs, ) - return litellm.text_completion(**params) + return await litellm.atext_completion(**params) async def openai_chat_completion( self, @@ -308,7 +314,7 @@ class LiteLLMOpenAIMixin( n: Optional[int] = None, parallel_tool_calls: Optional[bool] = None, presence_penalty: Optional[float] = None, - response_format: Optional[Dict[str, str]] = None, + response_format: Optional[OpenAIResponseFormatParam] = None, seed: Optional[int] = None, stop: Optional[Union[str, List[str]]] = None, stream: Optional[bool] = None, @@ -319,8 +325,8 @@ class LiteLLMOpenAIMixin( top_logprobs: Optional[int] = None, top_p: Optional[float] = None, user: 
Optional[str] = None, - ) -> OpenAIChatCompletion: - model_obj = await self._get_model(model) + ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]: + model_obj = await self.model_store.get_model(model) params = await prepare_openai_completion_params( model=model_obj.provider_resource_id, messages=messages, @@ -346,7 +352,7 @@ class LiteLLMOpenAIMixin( top_p=top_p, user=user, ) - return litellm.completion(**params) + return await litellm.acompletion(**params) async def batch_completion( self, diff --git a/llama_stack/providers/utils/inference/openai_compat.py b/llama_stack/providers/utils/inference/openai_compat.py index f33cb4443..d98261abb 100644 --- a/llama_stack/providers/utils/inference/openai_compat.py +++ b/llama_stack/providers/utils/inference/openai_compat.py @@ -8,7 +8,7 @@ import logging import time import uuid import warnings -from typing import Any, AsyncGenerator, Dict, Iterable, List, Optional, Union +from typing import Any, AsyncGenerator, AsyncIterator, Awaitable, Dict, Iterable, List, Optional, Union from openai import AsyncStream from openai.types.chat import ( @@ -50,6 +50,18 @@ from openai.types.chat.chat_completion import ( from openai.types.chat.chat_completion import ( ChoiceLogprobs as OpenAIChoiceLogprobs, # same as chat_completion_chunk ChoiceLogprobs ) +from openai.types.chat.chat_completion_chunk import ( + Choice as OpenAIChatCompletionChunkChoice, +) +from openai.types.chat.chat_completion_chunk import ( + ChoiceDelta as OpenAIChoiceDelta, +) +from openai.types.chat.chat_completion_chunk import ( + ChoiceDeltaToolCall as OpenAIChoiceDeltaToolCall, +) +from openai.types.chat.chat_completion_chunk import ( + ChoiceDeltaToolCallFunction as OpenAIChoiceDeltaToolCallFunction, +) from openai.types.chat.chat_completion_content_part_image_param import ( ImageURL as OpenAIImageURL, ) @@ -59,6 +71,7 @@ from openai.types.chat.chat_completion_message_tool_call_param import ( from pydantic import BaseModel from llama_stack.apis.common.content_types import ( + URL, ImageContentItem, InterleavedContent, TextContentItem, @@ -85,12 +98,24 @@ from llama_stack.apis.inference import ( TopPSamplingStrategy, UserMessage, ) -from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAICompletionChoice +from llama_stack.apis.inference.inference import ( + JsonSchemaResponseFormat, + OpenAIChatCompletion, + OpenAICompletion, + OpenAICompletionChoice, + OpenAIMessageParam, + OpenAIResponseFormatParam, + ToolConfig, +) +from llama_stack.apis.inference.inference import ( + OpenAIChoice as OpenAIChatCompletionChoice, +) from llama_stack.models.llama.datatypes import ( BuiltinTool, StopReason, ToolCall, ToolDefinition, + ToolParamDefinition, ) from llama_stack.providers.utils.inference.prompt_adapter import ( convert_image_content_to_url, @@ -751,6 +776,17 @@ def convert_tooldef_to_openai_tool(tool: ToolDefinition) -> dict: return out +def _convert_stop_reason_to_openai_finish_reason(stop_reason: StopReason) -> str: + """ + Convert a StopReason to an OpenAI chat completion finish_reason. + """ + return { + StopReason.end_of_turn: "stop", + StopReason.end_of_message: "tool_calls", + StopReason.out_of_tokens: "length", + }.get(stop_reason, "stop") + + def _convert_openai_finish_reason(finish_reason: str) -> StopReason: """ Convert an OpenAI chat completion finish_reason to a StopReason. 
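Besides the typed `response_format` and streaming-aware return type, the `LiteLLMOpenAIMixin` hunks above switch model lookup to `self.model_store.get_model` and call litellm's asynchronous entry points, `litellm.atext_completion` and `litellm.acompletion`, instead of the blocking synchronous ones, while `openai_compat.py` gains `_convert_stop_reason_to_openai_finish_reason` (end_of_turn -> "stop", end_of_message -> "tool_calls", out_of_tokens -> "length"). A minimal sketch of why the await matters, assuming litellm's OpenAI-shaped response object and a placeholder model id:

```python
import litellm


async def ask(prompt: str) -> str:
    # acompletion is awaited, so the event loop is not blocked while the
    # request is in flight (the synchronous litellm.completion would block).
    response = await litellm.acompletion(
        model="openai/gpt-4o-mini",  # placeholder; any configured model id works
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content


# asyncio.run(ask("Hello"))  # requires the relevant provider API key to be set
```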
@@ -776,6 +812,56 @@ def _convert_openai_finish_reason(finish_reason: str) -> StopReason: }.get(finish_reason, StopReason.end_of_turn) +def _convert_openai_request_tool_config(tool_choice: Optional[Union[str, Dict[str, Any]]] = None) -> ToolConfig: + tool_config = ToolConfig() + if tool_choice: + tool_config.tool_choice = tool_choice + return tool_config + + +def _convert_openai_request_tools(tools: Optional[List[Dict[str, Any]]] = None) -> List[ToolDefinition]: + lls_tools = [] + if not tools: + return lls_tools + + for tool in tools: + tool_fn = tool.get("function", {}) + tool_name = tool_fn.get("name", None) + tool_desc = tool_fn.get("description", None) + + tool_params = tool_fn.get("parameters", None) + lls_tool_params = {} + if tool_params is not None: + tool_param_properties = tool_params.get("properties", {}) + for tool_param_key, tool_param_value in tool_param_properties.items(): + tool_param_def = ToolParamDefinition( + param_type=tool_param_value.get("type", None), + description=tool_param_value.get("description", None), + ) + lls_tool_params[tool_param_key] = tool_param_def + + lls_tool = ToolDefinition( + tool_name=tool_name, + description=tool_desc, + parameters=lls_tool_params, + ) + lls_tools.append(lls_tool) + return lls_tools + + +def _convert_openai_request_response_format(response_format: OpenAIResponseFormatParam = None): + if not response_format: + return None + # response_format can be a dict or a pydantic model + response_format = dict(response_format) + if response_format.get("type", "") == "json_schema": + return JsonSchemaResponseFormat( + type="json_schema", + json_schema=response_format.get("json_schema", {}).get("schema", ""), + ) + return None + + def _convert_openai_tool_calls( tool_calls: List[OpenAIChatCompletionMessageToolCall], ) -> List[ToolCall]: @@ -871,6 +957,40 @@ def _convert_openai_sampling_params( return sampling_params +def _convert_openai_request_messages(messages: List[OpenAIMessageParam]): + # Llama Stack messages and OpenAI messages are similar, but not identical. + lls_messages = [] + for message in messages: + lls_message = dict(message) + + # Llama Stack expects `call_id` but OpenAI uses `tool_call_id` + tool_call_id = lls_message.pop("tool_call_id", None) + if tool_call_id: + lls_message["call_id"] = tool_call_id + + content = lls_message.get("content", None) + if isinstance(content, list): + lls_content = [] + for item in content: + # items can either by pydantic models or dicts here... 
+ item = dict(item) + if item.get("type", "") == "image_url": + lls_item = ImageContentItem( + type="image", + image=URL(uri=item.get("image_url", {}).get("url", "")), + ) + elif item.get("type", "") == "text": + lls_item = TextContentItem( + type="text", + text=item.get("text", ""), + ) + lls_content.append(lls_item) + lls_message["content"] = lls_content + lls_messages.append(lls_message) + + return lls_messages + + def convert_openai_chat_completion_choice( choice: OpenAIChoice, ) -> ChatCompletionResponse: @@ -1080,11 +1200,24 @@ async def convert_openai_chat_completion_stream( async def prepare_openai_completion_params(**params): - completion_params = {k: v for k, v in params.items() if v is not None} + async def _prepare_value(value: Any) -> Any: + new_value = value + if isinstance(value, list): + new_value = [await _prepare_value(v) for v in value] + elif isinstance(value, dict): + new_value = {k: await _prepare_value(v) for k, v in value.items()} + elif isinstance(value, BaseModel): + new_value = value.model_dump(exclude_none=True) + return new_value + + completion_params = {} + for k, v in params.items(): + if v is not None: + completion_params[k] = await _prepare_value(v) return completion_params -class OpenAICompletionUnsupportedMixin: +class OpenAICompletionToLlamaStackMixin: async def openai_completion( self, model: str, @@ -1122,6 +1255,7 @@ class OpenAICompletionUnsupportedMixin: choices = [] # "n" is the number of completions to generate per prompt + n = n or 1 for _i in range(0, n): # and we may have multiple prompts, if batching was used @@ -1134,7 +1268,7 @@ class OpenAICompletionUnsupportedMixin: index = len(choices) text = result.content - finish_reason = _convert_openai_finish_reason(result.stop_reason) + finish_reason = _convert_stop_reason_to_openai_finish_reason(result.stop_reason) choice = OpenAICompletionChoice( index=index, @@ -1152,7 +1286,7 @@ class OpenAICompletionUnsupportedMixin: ) -class OpenAIChatCompletionUnsupportedMixin: +class OpenAIChatCompletionToLlamaStackMixin: async def openai_chat_completion( self, model: str, @@ -1167,7 +1301,7 @@ class OpenAIChatCompletionUnsupportedMixin: n: Optional[int] = None, parallel_tool_calls: Optional[bool] = None, presence_penalty: Optional[float] = None, - response_format: Optional[Dict[str, str]] = None, + response_format: Optional[OpenAIResponseFormatParam] = None, seed: Optional[int] = None, stop: Optional[Union[str, List[str]]] = None, stream: Optional[bool] = None, @@ -1178,5 +1312,103 @@ class OpenAIChatCompletionUnsupportedMixin: top_logprobs: Optional[int] = None, top_p: Optional[float] = None, user: Optional[str] = None, + ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]: + messages = _convert_openai_request_messages(messages) + response_format = _convert_openai_request_response_format(response_format) + sampling_params = _convert_openai_sampling_params( + max_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + ) + tool_config = _convert_openai_request_tool_config(tool_choice) + tools = _convert_openai_request_tools(tools) + + outstanding_responses = [] + # "n" is the number of completions to generate per prompt + n = n or 1 + for _i in range(0, n): + response = self.chat_completion( + model_id=model, + messages=messages, + sampling_params=sampling_params, + response_format=response_format, + stream=stream, + tool_config=tool_config, + tools=tools, + ) + outstanding_responses.append(response) + + if stream: + return 
OpenAIChatCompletionToLlamaStackMixin._process_stream_response(self, model, outstanding_responses) + + return await OpenAIChatCompletionToLlamaStackMixin._process_non_stream_response( + self, model, outstanding_responses + ) + + async def _process_stream_response( + self, model: str, outstanding_responses: List[Awaitable[AsyncIterator[ChatCompletionResponseStreamChunk]]] + ): + id = f"chatcmpl-{uuid.uuid4()}" + for outstanding_response in outstanding_responses: + response = await outstanding_response + i = 0 + async for chunk in response: + event = chunk.event + finish_reason = _convert_stop_reason_to_openai_finish_reason(event.stop_reason) + + if isinstance(event.delta, TextDelta): + text_delta = event.delta.text + delta = OpenAIChoiceDelta(content=text_delta) + yield OpenAIChatCompletionChunk( + id=id, + choices=[OpenAIChatCompletionChunkChoice(index=i, finish_reason=finish_reason, delta=delta)], + created=int(time.time()), + model=model, + object="chat.completion.chunk", + ) + elif isinstance(event.delta, ToolCallDelta): + if event.delta.parse_status == ToolCallParseStatus.succeeded: + tool_call = event.delta.tool_call + openai_tool_call = OpenAIChoiceDeltaToolCall( + index=0, + id=tool_call.call_id, + function=OpenAIChoiceDeltaToolCallFunction( + name=tool_call.tool_name, arguments=tool_call.arguments_json + ), + ) + delta = OpenAIChoiceDelta(tool_calls=[openai_tool_call]) + yield OpenAIChatCompletionChunk( + id=id, + choices=[ + OpenAIChatCompletionChunkChoice(index=i, finish_reason=finish_reason, delta=delta) + ], + created=int(time.time()), + model=model, + object="chat.completion.chunk", + ) + i = i + 1 + + async def _process_non_stream_response( + self, model: str, outstanding_responses: List[Awaitable[ChatCompletionResponse]] ) -> OpenAIChatCompletion: - raise ValueError(f"{self.__class__.__name__} doesn't support openai chat completion") + choices = [] + for outstanding_response in outstanding_responses: + response = await outstanding_response + completion_message = response.completion_message + message = await convert_message_to_openai_dict_new(completion_message) + finish_reason = _convert_stop_reason_to_openai_finish_reason(completion_message.stop_reason) + + choice = OpenAIChatCompletionChoice( + index=len(choices), + message=message, + finish_reason=finish_reason, + ) + choices.append(choice) + + return OpenAIChatCompletion( + id=f"chatcmpl-{uuid.uuid4()}", + choices=choices, + created=int(time.time()), + model=model, + object="chat.completion", + ) diff --git a/llama_stack/templates/dev/run.yaml b/llama_stack/templates/dev/run.yaml index ea3b7252a..0dd056405 100644 --- a/llama_stack/templates/dev/run.yaml +++ b/llama_stack/templates/dev/run.yaml @@ -386,6 +386,16 @@ models: provider_id: groq provider_model_id: groq/llama-4-scout-17b-16e-instruct model_type: llm +- metadata: {} + model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct + provider_id: groq + provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: groq + provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct + model_type: llm - metadata: {} model_id: groq/llama-4-maverick-17b-128e-instruct provider_id: groq @@ -396,6 +406,16 @@ models: provider_id: groq provider_model_id: groq/llama-4-maverick-17b-128e-instruct model_type: llm +- metadata: {} + model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct + provider_id: groq + provider_model_id: 
groq/meta-llama/llama-4-maverick-17b-128e-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct + provider_id: groq + provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct + model_type: llm - metadata: embedding_dimension: 384 model_id: all-MiniLM-L6-v2 diff --git a/llama_stack/templates/groq/run.yaml b/llama_stack/templates/groq/run.yaml index f557e64fd..444452dcb 100644 --- a/llama_stack/templates/groq/run.yaml +++ b/llama_stack/templates/groq/run.yaml @@ -158,6 +158,16 @@ models: provider_id: groq provider_model_id: groq/llama-4-scout-17b-16e-instruct model_type: llm +- metadata: {} + model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct + provider_id: groq + provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: groq + provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct + model_type: llm - metadata: {} model_id: groq/llama-4-maverick-17b-128e-instruct provider_id: groq @@ -168,6 +178,16 @@ models: provider_id: groq provider_model_id: groq/llama-4-maverick-17b-128e-instruct model_type: llm +- metadata: {} + model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct + provider_id: groq + provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct + provider_id: groq + provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct + model_type: llm - metadata: embedding_dimension: 384 model_id: all-MiniLM-L6-v2 diff --git a/llama_stack/templates/verification/run.yaml b/llama_stack/templates/verification/run.yaml index b6c2ca98d..454ecba5b 100644 --- a/llama_stack/templates/verification/run.yaml +++ b/llama_stack/templates/verification/run.yaml @@ -474,6 +474,16 @@ models: provider_id: groq-openai-compat provider_model_id: groq/llama-4-scout-17b-16e-instruct model_type: llm +- metadata: {} + model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct + provider_id: groq-openai-compat + provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: groq-openai-compat + provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct + model_type: llm - metadata: {} model_id: groq/llama-4-maverick-17b-128e-instruct provider_id: groq-openai-compat @@ -484,6 +494,16 @@ models: provider_id: groq-openai-compat provider_model_id: groq/llama-4-maverick-17b-128e-instruct model_type: llm +- metadata: {} + model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct + provider_id: groq-openai-compat + provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct + provider_id: groq-openai-compat + provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct + model_type: llm - metadata: {} model_id: Meta-Llama-3.1-8B-Instruct provider_id: sambanova-openai-compat diff --git a/tests/integration/inference/test_openai_completion.py b/tests/integration/inference/test_openai_completion.py index 0905d5817..75b53100c 100644 --- a/tests/integration/inference/test_openai_completion.py +++ b/tests/integration/inference/test_openai_completion.py @@ -115,7 +115,7 @@ def test_openai_completion_streaming(openai_client, client_with_models, text_mod stream=True, max_tokens=50, ) - streamed_content = 
[chunk.choices[0].text for chunk in response] + streamed_content = [chunk.choices[0].text or "" for chunk in response] content_str = "".join(streamed_content).lower().strip() assert len(content_str) > 10 diff --git a/tests/verifications/conf/fireworks-llama-stack.yaml b/tests/verifications/conf/fireworks-llama-stack.yaml new file mode 100644 index 000000000..d91443dd9 --- /dev/null +++ b/tests/verifications/conf/fireworks-llama-stack.yaml @@ -0,0 +1,14 @@ +base_url: http://localhost:8321/v1/openai/v1 +api_key_var: FIREWORKS_API_KEY +models: +- fireworks/llama-v3p3-70b-instruct +- fireworks/llama4-scout-instruct-basic +- fireworks/llama4-maverick-instruct-basic +model_display_names: + fireworks/llama-v3p3-70b-instruct: Llama-3.3-70B-Instruct + fireworks/llama4-scout-instruct-basic: Llama-4-Scout-Instruct + fireworks/llama4-maverick-instruct-basic: Llama-4-Maverick-Instruct +test_exclusions: + fireworks/llama-v3p3-70b-instruct: + - test_chat_non_streaming_image + - test_chat_streaming_image diff --git a/tests/verifications/conf/groq-llama-stack.yaml b/tests/verifications/conf/groq-llama-stack.yaml new file mode 100644 index 000000000..fd5e9abec --- /dev/null +++ b/tests/verifications/conf/groq-llama-stack.yaml @@ -0,0 +1,14 @@ +base_url: http://localhost:8321/v1/openai/v1 +api_key_var: GROQ_API_KEY +models: +- groq/llama-3.3-70b-versatile +- groq/llama-4-scout-17b-16e-instruct +- groq/llama-4-maverick-17b-128e-instruct +model_display_names: + groq/llama-3.3-70b-versatile: Llama-3.3-70B-Instruct + groq/llama-4-scout-17b-16e-instruct: Llama-4-Scout-Instruct + groq/llama-4-maverick-17b-128e-instruct: Llama-4-Maverick-Instruct +test_exclusions: + groq/llama-3.3-70b-versatile: + - test_chat_non_streaming_image + - test_chat_streaming_image diff --git a/tests/verifications/conf/groq.yaml b/tests/verifications/conf/groq.yaml index 7871036dc..76b1244ae 100644 --- a/tests/verifications/conf/groq.yaml +++ b/tests/verifications/conf/groq.yaml @@ -2,12 +2,12 @@ base_url: https://api.groq.com/openai/v1 api_key_var: GROQ_API_KEY models: - llama-3.3-70b-versatile -- llama-4-scout-17b-16e-instruct -- llama-4-maverick-17b-128e-instruct +- meta-llama/llama-4-scout-17b-16e-instruct +- meta-llama/llama-4-maverick-17b-128e-instruct model_display_names: llama-3.3-70b-versatile: Llama-3.3-70B-Instruct - llama-4-scout-17b-16e-instruct: Llama-4-Scout-Instruct - llama-4-maverick-17b-128e-instruct: Llama-4-Maverick-Instruct + meta-llama/llama-4-scout-17b-16e-instruct: Llama-4-Scout-Instruct + meta-llama/llama-4-maverick-17b-128e-instruct: Llama-4-Maverick-Instruct test_exclusions: llama-3.3-70b-versatile: - test_chat_non_streaming_image diff --git a/tests/verifications/conf/openai-llama-stack.yaml b/tests/verifications/conf/openai-llama-stack.yaml new file mode 100644 index 000000000..de35439ae --- /dev/null +++ b/tests/verifications/conf/openai-llama-stack.yaml @@ -0,0 +1,9 @@ +base_url: http://localhost:8321/v1/openai/v1 +api_key_var: OPENAI_API_KEY +models: +- openai/gpt-4o +- openai/gpt-4o-mini +model_display_names: + openai/gpt-4o: gpt-4o + openai/gpt-4o-mini: gpt-4o-mini +test_exclusions: {} diff --git a/tests/verifications/conf/together-llama-stack.yaml b/tests/verifications/conf/together-llama-stack.yaml new file mode 100644 index 000000000..e49d82604 --- /dev/null +++ b/tests/verifications/conf/together-llama-stack.yaml @@ -0,0 +1,14 @@ +base_url: http://localhost:8321/v1/openai/v1 +api_key_var: TOGETHER_API_KEY +models: +- together/meta-llama/Llama-3.3-70B-Instruct-Turbo +- 
together/meta-llama/Llama-4-Scout-17B-16E-Instruct +- together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 +model_display_names: + together/meta-llama/Llama-3.3-70B-Instruct-Turbo: Llama-3.3-70B-Instruct + together/meta-llama/Llama-4-Scout-17B-16E-Instruct: Llama-4-Scout-Instruct + together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8: Llama-4-Maverick-Instruct +test_exclusions: + together/meta-llama/Llama-3.3-70B-Instruct-Turbo: + - test_chat_non_streaming_image + - test_chat_streaming_image diff --git a/tests/verifications/generate_report.py b/tests/verifications/generate_report.py index 6a7c39ee2..b39c3fd19 100755 --- a/tests/verifications/generate_report.py +++ b/tests/verifications/generate_report.py @@ -67,7 +67,17 @@ RESULTS_DIR.mkdir(exist_ok=True) # Maximum number of test result files to keep per provider MAX_RESULTS_PER_PROVIDER = 1 -PROVIDER_ORDER = ["together", "fireworks", "groq", "cerebras", "openai"] +PROVIDER_ORDER = [ + "together", + "fireworks", + "groq", + "cerebras", + "openai", + "together-llama-stack", + "fireworks-llama-stack", + "groq-llama-stack", + "openai-llama-stack", +] VERIFICATION_CONFIG = _load_all_verification_configs() diff --git a/tests/verifications/openai-api-verification-run.yaml b/tests/verifications/openai-api-verification-run.yaml new file mode 100644 index 000000000..71885d058 --- /dev/null +++ b/tests/verifications/openai-api-verification-run.yaml @@ -0,0 +1,146 @@ +version: '2' +image_name: openai-api-verification +apis: +- inference +- telemetry +- tool_runtime +- vector_io +providers: + inference: + - provider_id: together + provider_type: remote::together + config: + url: https://api.together.xyz/v1 + api_key: ${env.TOGETHER_API_KEY:} + - provider_id: fireworks + provider_type: remote::fireworks + config: + url: https://api.fireworks.ai/inference/v1 + api_key: ${env.FIREWORKS_API_KEY} + - provider_id: groq + provider_type: remote::groq + config: + url: https://api.groq.com + api_key: ${env.GROQ_API_KEY} + - provider_id: openai + provider_type: remote::openai + config: + url: https://api.openai.com/v1 + api_key: ${env.OPENAI_API_KEY:} + - provider_id: sentence-transformers + provider_type: inline::sentence-transformers + config: {} + vector_io: + - provider_id: faiss + provider_type: inline::faiss + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/openai}/faiss_store.db + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/openai/trace_store.db} + tool_runtime: + - provider_id: brave-search + provider_type: remote::brave-search + config: + api_key: ${env.BRAVE_SEARCH_API_KEY:} + max_results: 3 + - provider_id: tavily-search + provider_type: remote::tavily-search + config: + api_key: ${env.TAVILY_SEARCH_API_KEY:} + max_results: 3 + - provider_id: code-interpreter + provider_type: inline::code-interpreter + config: {} + - provider_id: rag-runtime + provider_type: inline::rag-runtime + config: {} + - provider_id: model-context-protocol + provider_type: remote::model-context-protocol + config: {} + - provider_id: wolfram-alpha + provider_type: remote::wolfram-alpha + config: + api_key: ${env.WOLFRAM_ALPHA_API_KEY:} +metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/openai}/registry.db +models: +- metadata: {} + model_id: 
together/meta-llama/Llama-3.3-70B-Instruct-Turbo + provider_id: together + provider_model_id: meta-llama/Llama-3.3-70B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: together/meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: together + provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + model_type: llm +- metadata: {} + model_id: together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + provider_id: together + provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + model_type: llm +- metadata: {} + model_id: fireworks/llama-v3p3-70b-instruct + provider_id: fireworks + provider_model_id: accounts/fireworks/models/llama-v3p3-70b-instruct + model_type: llm +- metadata: {} + model_id: fireworks/llama4-scout-instruct-basic + provider_id: fireworks + provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic + model_type: llm +- metadata: {} + model_id: fireworks/llama4-maverick-instruct-basic + provider_id: fireworks + provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic + model_type: llm +- metadata: {} + model_id: groq/llama-3.3-70b-versatile + provider_id: groq + provider_model_id: groq/llama-3.3-70b-versatile + model_type: llm +- metadata: {} + model_id: groq/llama-4-scout-17b-16e-instruct + provider_id: groq + provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct + model_type: llm +- metadata: {} + model_id: groq/llama-4-maverick-17b-128e-instruct + provider_id: groq + provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct + model_type: llm +- metadata: {} + model_id: openai/gpt-4o + provider_id: openai + provider_model_id: openai/gpt-4o + model_type: llm +- metadata: {} + model_id: openai/gpt-4o-mini + provider_id: openai + provider_model_id: openai/gpt-4o-mini + model_type: llm +shields: [] +vector_dbs: [] +datasets: [] +scoring_fns: [] +benchmarks: [] +tool_groups: +- toolgroup_id: builtin::websearch + provider_id: tavily-search +- toolgroup_id: builtin::rag + provider_id: rag-runtime +- toolgroup_id: builtin::code_interpreter + provider_id: code-interpreter +- toolgroup_id: builtin::wolfram_alpha + provider_id: wolfram-alpha +server: + port: 8321 diff --git a/tests/verifications/openai_api/fixtures/fixtures.py b/tests/verifications/openai_api/fixtures/fixtures.py index 4f8c2e017..940b99b2a 100644 --- a/tests/verifications/openai_api/fixtures/fixtures.py +++ b/tests/verifications/openai_api/fixtures/fixtures.py @@ -99,6 +99,9 @@ def model_mapping(provider, providers_model_mapping): @pytest.fixture def openai_client(base_url, api_key): + # Simplify running against a local Llama Stack + if "localhost" in base_url and not api_key: + api_key = "empty" return OpenAI( base_url=base_url, api_key=api_key,
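Taken together, the new `*-llama-stack` verification configs, the `openai-api-verification-run.yaml` distribution, and the fixture change above (which substitutes a dummy key when no API key is set for a localhost base URL) let the verification suite, or any OpenAI SDK client, talk to a locally running Llama Stack on port 8321. A hedged end-to-end sketch; the model id is taken from the together entries in this diff, and any registered model would do:

```python
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8321/v1/openai/v1",  # base_url used by the new configs
    api_key="empty",  # mirrors the fixture's fallback for localhost
)

response = client.chat.completions.create(
    model="together/meta-llama/Llama-4-Scout-17B-16E-Instruct",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)

# Streaming goes through the same endpoint; guard against empty deltas/choices.
stream = client.chat.completions.create(
    model="together/meta-llama/Llama-4-Scout-17B-16E-Instruct",
    messages=[{"role": "user", "content": "Hello!"}],
    stream=True,
)
print("".join(chunk.choices[0].delta.content or "" for chunk in stream if chunk.choices))
```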