forked from phoenix-oss/llama-stack-mirror
feat: OpenAI-Compatible models, completions, chat/completions (#1894)
# What does this PR do?

This stubs in some OpenAI server-side compatibility with three new endpoints:

* /v1/openai/v1/models
* /v1/openai/v1/completions
* /v1/openai/v1/chat/completions

This gives common inference apps using OpenAI clients the ability to talk to Llama Stack using an endpoint like http://localhost:8321/v1/openai/v1 . The two "v1" instances in there aren't awesome, but the thinking is that Llama Stack's API is v1 and our OpenAI compatibility layer is compatible with OpenAI V1. Also, some OpenAI clients implicitly assume the URL ends with "v1", so this gives maximum compatibility.

The openai models endpoint is implemented in the routing layer, and just returns all the models Llama Stack knows about.

The following providers should be working with the new OpenAI completions and chat/completions API:

* remote::anthropic (untested)
* remote::cerebras-openai-compat (untested)
* remote::fireworks (tested)
* remote::fireworks-openai-compat (untested)
* remote::gemini (untested)
* remote::groq-openai-compat (untested)
* remote::nvidia (tested)
* remote::ollama (tested)
* remote::openai (untested)
* remote::passthrough (untested)
* remote::sambanova-openai-compat (untested)
* remote::together (tested)
* remote::together-openai-compat (untested)
* remote::vllm (tested)

The goal is to support this for every inference provider, proxying directly to the provider's OpenAI endpoint for OpenAI-compatible providers. For providers that don't have an OpenAI-compatible API, we'll add a mixin to translate incoming OpenAI requests to Llama Stack inference requests and translate the Llama Stack inference responses back to OpenAI responses.

This is related to #1817 but is a bit larger in scope than just chat completions, as I have real use-cases that need the older completions API as well.

## Test Plan

### vLLM

```
VLLM_URL="http://localhost:8000/v1" INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" llama stack build --template remote-vllm --image-type venv --run

LLAMA_STACK_CONFIG=http://localhost:8321 INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" python -m pytest -v tests/integration/inference/test_openai_completion.py --text-model "meta-llama/Llama-3.2-3B-Instruct"
```

### ollama

```
INFERENCE_MODEL="llama3.2:3b-instruct-q8_0" llama stack build --template ollama --image-type venv --run

LLAMA_STACK_CONFIG=http://localhost:8321 INFERENCE_MODEL="llama3.2:3b-instruct-q8_0" python -m pytest -v tests/integration/inference/test_openai_completion.py --text-model "llama3.2:3b-instruct-q8_0"
```

## Documentation

Run a Llama Stack distribution that uses one of the providers mentioned in the list above. Then, use your favorite OpenAI client to send completion or chat completion requests with the base_url set to http://localhost:8321/v1/openai/v1 . Replace "localhost:8321" with the host and port of your Llama Stack server, if different.

---------

Signed-off-by: Ben Browning <bbrownin@redhat.com>
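To make the Documentation section above concrete, here is a minimal sketch of driving the three new endpoints with the official `openai` Python client. The base URL, model ID, and placeholder API key are assumptions taken from the test plan; substitute the host/port of your Llama Stack server and a model registered with your distribution.

```python
# Minimal sketch: talking to Llama Stack's OpenAI-compatible endpoints with the
# official `openai` client. Assumes a local server on port 8321 with no auth in
# front of it and a registered model named "meta-llama/Llama-3.2-3B-Instruct".
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8321/v1/openai/v1",  # Llama Stack's OpenAI-compat prefix
    api_key="none",  # placeholder; assumed unused when no auth is configured
)

# /v1/openai/v1/models -- lists every model Llama Stack knows about.
for model in client.models.list():
    print(model.id)

# /v1/openai/v1/chat/completions -- OpenAI-style chat completion.
chat = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "Say hello in one short sentence."}],
)
print(chat.choices[0].message.content)

# /v1/openai/v1/completions -- the older text completions API.
completion = client.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    prompt="The quick brown fox",
    max_tokens=20,
)
print(completion.choices[0].text)
```

Any OpenAI-compatible client should work the same way, since only the base_url changes.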
parent 24d70cedca
commit 2b2db5fbda
27 changed files with 3265 additions and 20 deletions
932  docs/_static/llama-stack-spec.html (vendored)
@@ -3092,6 +3092,125 @@
                }
            }
        },
"/v1/openai/v1/chat/completions": {
|
||||||
|
"post": {
|
||||||
|
"responses": {
|
||||||
|
"200": {
|
||||||
|
"description": "OK",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/OpenAIChatCompletion"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"400": {
|
||||||
|
"$ref": "#/components/responses/BadRequest400"
|
||||||
|
},
|
||||||
|
"429": {
|
||||||
|
"$ref": "#/components/responses/TooManyRequests429"
|
||||||
|
},
|
||||||
|
"500": {
|
||||||
|
"$ref": "#/components/responses/InternalServerError500"
|
||||||
|
},
|
||||||
|
"default": {
|
||||||
|
"$ref": "#/components/responses/DefaultError"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tags": [
|
||||||
|
"Inference"
|
||||||
|
],
|
||||||
|
"description": "Generate an OpenAI-compatible chat completion for the given messages using the specified model.",
|
||||||
|
"parameters": [],
|
||||||
|
"requestBody": {
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/OpenaiChatCompletionRequest"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"/v1/openai/v1/completions": {
|
||||||
|
"post": {
|
||||||
|
"responses": {
|
||||||
|
"200": {
|
||||||
|
"description": "OK",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/OpenAICompletion"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"400": {
|
||||||
|
"$ref": "#/components/responses/BadRequest400"
|
||||||
|
},
|
||||||
|
"429": {
|
||||||
|
"$ref": "#/components/responses/TooManyRequests429"
|
||||||
|
},
|
||||||
|
"500": {
|
||||||
|
"$ref": "#/components/responses/InternalServerError500"
|
||||||
|
},
|
||||||
|
"default": {
|
||||||
|
"$ref": "#/components/responses/DefaultError"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tags": [
|
||||||
|
"Inference"
|
||||||
|
],
|
||||||
|
"description": "Generate an OpenAI-compatible completion for the given prompt using the specified model.",
|
||||||
|
"parameters": [],
|
||||||
|
"requestBody": {
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/OpenaiCompletionRequest"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"/v1/openai/v1/models": {
|
||||||
|
"get": {
|
||||||
|
"responses": {
|
||||||
|
"200": {
|
||||||
|
"description": "OK",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/OpenAIListModelsResponse"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"400": {
|
||||||
|
"$ref": "#/components/responses/BadRequest400"
|
||||||
|
},
|
||||||
|
"429": {
|
||||||
|
"$ref": "#/components/responses/TooManyRequests429"
|
||||||
|
},
|
||||||
|
"500": {
|
||||||
|
"$ref": "#/components/responses/InternalServerError500"
|
||||||
|
},
|
||||||
|
"default": {
|
||||||
|
"$ref": "#/components/responses/DefaultError"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tags": [
|
||||||
|
"Models"
|
||||||
|
],
|
||||||
|
"description": "",
|
||||||
|
"parameters": []
|
||||||
|
}
|
||||||
|
},
|
||||||
"/v1/post-training/preference-optimize": {
|
"/v1/post-training/preference-optimize": {
|
||||||
"post": {
|
"post": {
|
||||||
"responses": {
|
"responses": {
|
||||||
@@ -8713,6 +8832,819 @@
            ],
            "title": "LogEventRequest"
        },
"OpenAIAssistantMessageParam": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"role": {
|
||||||
|
"type": "string",
|
||||||
|
"const": "assistant",
|
||||||
|
"default": "assistant",
|
||||||
|
"description": "Must be \"assistant\" to identify this as the model's response"
|
||||||
|
},
|
||||||
|
"content": {
|
||||||
|
"$ref": "#/components/schemas/InterleavedContent",
|
||||||
|
"description": "The content of the model's response"
|
||||||
|
},
|
||||||
|
"name": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "(Optional) The name of the assistant message participant."
|
||||||
|
},
|
||||||
|
"tool_calls": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/ToolCall"
|
||||||
|
},
|
||||||
|
"description": "List of tool calls. Each tool call is a ToolCall object."
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"role",
|
||||||
|
"content"
|
||||||
|
],
|
||||||
|
"title": "OpenAIAssistantMessageParam",
|
||||||
|
"description": "A message containing the model's (assistant) response in an OpenAI-compatible chat completion request."
|
||||||
|
},
|
||||||
|
"OpenAIDeveloperMessageParam": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"role": {
|
||||||
|
"type": "string",
|
||||||
|
"const": "developer",
|
||||||
|
"default": "developer",
|
||||||
|
"description": "Must be \"developer\" to identify this as a developer message"
|
||||||
|
},
|
||||||
|
"content": {
|
||||||
|
"$ref": "#/components/schemas/InterleavedContent",
|
||||||
|
"description": "The content of the developer message"
|
||||||
|
},
|
||||||
|
"name": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "(Optional) The name of the developer message participant."
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"role",
|
||||||
|
"content"
|
||||||
|
],
|
||||||
|
"title": "OpenAIDeveloperMessageParam",
|
||||||
|
"description": "A message from the developer in an OpenAI-compatible chat completion request."
|
||||||
|
},
|
||||||
|
"OpenAIMessageParam": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"$ref": "#/components/schemas/OpenAIUserMessageParam"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/components/schemas/OpenAISystemMessageParam"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/components/schemas/OpenAIAssistantMessageParam"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/components/schemas/OpenAIToolMessageParam"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/components/schemas/OpenAIDeveloperMessageParam"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"discriminator": {
|
||||||
|
"propertyName": "role",
|
||||||
|
"mapping": {
|
||||||
|
"user": "#/components/schemas/OpenAIUserMessageParam",
|
||||||
|
"system": "#/components/schemas/OpenAISystemMessageParam",
|
||||||
|
"assistant": "#/components/schemas/OpenAIAssistantMessageParam",
|
||||||
|
"tool": "#/components/schemas/OpenAIToolMessageParam",
|
||||||
|
"developer": "#/components/schemas/OpenAIDeveloperMessageParam"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"OpenAISystemMessageParam": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"role": {
|
||||||
|
"type": "string",
|
||||||
|
"const": "system",
|
||||||
|
"default": "system",
|
||||||
|
"description": "Must be \"system\" to identify this as a system message"
|
||||||
|
},
|
||||||
|
"content": {
|
||||||
|
"$ref": "#/components/schemas/InterleavedContent",
|
||||||
|
"description": "The content of the \"system prompt\". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions)."
|
||||||
|
},
|
||||||
|
"name": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "(Optional) The name of the system message participant."
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"role",
|
||||||
|
"content"
|
||||||
|
],
|
||||||
|
"title": "OpenAISystemMessageParam",
|
||||||
|
"description": "A system message providing instructions or context to the model."
|
||||||
|
},
|
||||||
|
"OpenAIToolMessageParam": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"role": {
|
||||||
|
"type": "string",
|
||||||
|
"const": "tool",
|
||||||
|
"default": "tool",
|
||||||
|
"description": "Must be \"tool\" to identify this as a tool response"
|
||||||
|
},
|
||||||
|
"tool_call_id": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Unique identifier for the tool call this response is for"
|
||||||
|
},
|
||||||
|
"content": {
|
||||||
|
"$ref": "#/components/schemas/InterleavedContent",
|
||||||
|
"description": "The response content from the tool"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"role",
|
||||||
|
"tool_call_id",
|
||||||
|
"content"
|
||||||
|
],
|
||||||
|
"title": "OpenAIToolMessageParam",
|
||||||
|
"description": "A message representing the result of a tool invocation in an OpenAI-compatible chat completion request."
|
||||||
|
},
|
||||||
|
"OpenAIUserMessageParam": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"role": {
|
||||||
|
"type": "string",
|
||||||
|
"const": "user",
|
||||||
|
"default": "user",
|
||||||
|
"description": "Must be \"user\" to identify this as a user message"
|
||||||
|
},
|
||||||
|
"content": {
|
||||||
|
"$ref": "#/components/schemas/InterleavedContent",
|
||||||
|
"description": "The content of the message, which can include text and other media"
|
||||||
|
},
|
||||||
|
"name": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "(Optional) The name of the user message participant."
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"role",
|
||||||
|
"content"
|
||||||
|
],
|
||||||
|
"title": "OpenAIUserMessageParam",
|
||||||
|
"description": "A message from the user in an OpenAI-compatible chat completion request."
|
||||||
|
},
|
||||||
|
"OpenaiChatCompletionRequest": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"model": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
|
||||||
|
},
|
||||||
|
"messages": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/OpenAIMessageParam"
|
||||||
|
},
|
||||||
|
"description": "List of messages in the conversation"
|
||||||
|
},
|
||||||
|
"frequency_penalty": {
|
||||||
|
"type": "number",
|
||||||
|
"description": "(Optional) The penalty for repeated tokens"
|
||||||
|
},
|
||||||
|
"function_call": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "null"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "array"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "object"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"description": "(Optional) The function call to use"
|
||||||
|
},
|
||||||
|
"functions": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "null"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "array"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "object"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"description": "(Optional) List of functions to use"
|
||||||
|
},
|
||||||
|
"logit_bias": {
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": {
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
"description": "(Optional) The logit bias to use"
|
||||||
|
},
|
||||||
|
"logprobs": {
|
||||||
|
"type": "boolean",
|
||||||
|
"description": "(Optional) The log probabilities to use"
|
||||||
|
},
|
||||||
|
"max_completion_tokens": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "(Optional) The maximum number of tokens to generate"
|
||||||
|
},
|
||||||
|
"max_tokens": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "(Optional) The maximum number of tokens to generate"
|
||||||
|
},
|
||||||
|
"n": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "(Optional) The number of completions to generate"
|
||||||
|
},
|
||||||
|
"parallel_tool_calls": {
|
||||||
|
"type": "boolean",
|
||||||
|
"description": "(Optional) Whether to parallelize tool calls"
|
||||||
|
},
|
||||||
|
"presence_penalty": {
|
||||||
|
"type": "number",
|
||||||
|
"description": "(Optional) The penalty for repeated tokens"
|
||||||
|
},
|
||||||
|
"response_format": {
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"description": "(Optional) The response format to use"
|
||||||
|
},
|
||||||
|
"seed": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "(Optional) The seed to use"
|
||||||
|
},
|
||||||
|
"stop": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"description": "(Optional) The stop tokens to use"
|
||||||
|
},
|
||||||
|
"stream": {
|
||||||
|
"type": "boolean",
|
||||||
|
"description": "(Optional) Whether to stream the response"
|
||||||
|
},
|
||||||
|
"stream_options": {
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "null"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "array"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "object"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"description": "(Optional) The stream options to use"
|
||||||
|
},
|
||||||
|
"temperature": {
|
||||||
|
"type": "number",
|
||||||
|
"description": "(Optional) The temperature to use"
|
||||||
|
},
|
||||||
|
"tool_choice": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "null"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "array"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "object"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"description": "(Optional) The tool choice to use"
|
||||||
|
},
|
||||||
|
"tools": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "null"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "array"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "object"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"description": "(Optional) The tools to use"
|
||||||
|
},
|
||||||
|
"top_logprobs": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "(Optional) The top log probabilities to use"
|
||||||
|
},
|
||||||
|
"top_p": {
|
||||||
|
"type": "number",
|
||||||
|
"description": "(Optional) The top p to use"
|
||||||
|
},
|
||||||
|
"user": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "(Optional) The user to use"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"model",
|
||||||
|
"messages"
|
||||||
|
],
|
||||||
|
"title": "OpenaiChatCompletionRequest"
|
||||||
|
},
|
||||||
|
"OpenAIChatCompletion": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"id": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The ID of the chat completion"
|
||||||
|
},
|
||||||
|
"choices": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/OpenAIChoice"
|
||||||
|
},
|
||||||
|
"description": "List of choices"
|
||||||
|
},
|
||||||
|
"object": {
|
||||||
|
"type": "string",
|
||||||
|
"const": "chat.completion",
|
||||||
|
"default": "chat.completion",
|
||||||
|
"description": "The object type, which will be \"chat.completion\""
|
||||||
|
},
|
||||||
|
"created": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "The Unix timestamp in seconds when the chat completion was created"
|
||||||
|
},
|
||||||
|
"model": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The model that was used to generate the chat completion"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"id",
|
||||||
|
"choices",
|
||||||
|
"object",
|
||||||
|
"created",
|
||||||
|
"model"
|
||||||
|
],
|
||||||
|
"title": "OpenAIChatCompletion",
|
||||||
|
"description": "Response from an OpenAI-compatible chat completion request."
|
||||||
|
},
|
||||||
|
"OpenAIChoice": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"message": {
|
||||||
|
"$ref": "#/components/schemas/OpenAIMessageParam",
|
||||||
|
"description": "The message from the model"
|
||||||
|
},
|
||||||
|
"finish_reason": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The reason the model stopped generating"
|
||||||
|
},
|
||||||
|
"index": {
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"logprobs": {
|
||||||
|
"$ref": "#/components/schemas/OpenAIChoiceLogprobs"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"message",
|
||||||
|
"finish_reason",
|
||||||
|
"index"
|
||||||
|
],
|
||||||
|
"title": "OpenAIChoice",
|
||||||
|
"description": "A choice from an OpenAI-compatible chat completion response."
|
||||||
|
},
|
||||||
|
"OpenAIChoiceLogprobs": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"content": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/OpenAITokenLogProb"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"refusal": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/OpenAITokenLogProb"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"title": "OpenAIChoiceLogprobs",
|
||||||
|
"description": "The log probabilities for the tokens in the message from an OpenAI-compatible chat completion response."
|
||||||
|
},
|
||||||
|
"OpenAITokenLogProb": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"token": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"bytes": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "integer"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"logprob": {
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
"top_logprobs": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/OpenAITopLogProb"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"token",
|
||||||
|
"logprob",
|
||||||
|
"top_logprobs"
|
||||||
|
],
|
||||||
|
"title": "OpenAITokenLogProb",
|
||||||
|
"description": "The log probability for a token from an OpenAI-compatible chat completion response."
|
||||||
|
},
|
||||||
|
"OpenAITopLogProb": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"token": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"bytes": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "integer"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"logprob": {
|
||||||
|
"type": "number"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"token",
|
||||||
|
"logprob"
|
||||||
|
],
|
||||||
|
"title": "OpenAITopLogProb",
|
||||||
|
"description": "The top log probability for a token from an OpenAI-compatible chat completion response."
|
||||||
|
},
|
||||||
|
"OpenaiCompletionRequest": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"model": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
|
||||||
|
},
|
||||||
|
"prompt": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "integer"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "integer"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"description": "The prompt to generate a completion for"
|
||||||
|
},
|
||||||
|
"best_of": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "(Optional) The number of completions to generate"
|
||||||
|
},
|
||||||
|
"echo": {
|
||||||
|
"type": "boolean",
|
||||||
|
"description": "(Optional) Whether to echo the prompt"
|
||||||
|
},
|
||||||
|
"frequency_penalty": {
|
||||||
|
"type": "number",
|
||||||
|
"description": "(Optional) The penalty for repeated tokens"
|
||||||
|
},
|
||||||
|
"logit_bias": {
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": {
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
"description": "(Optional) The logit bias to use"
|
||||||
|
},
|
||||||
|
"logprobs": {
|
||||||
|
"type": "boolean",
|
||||||
|
"description": "(Optional) The log probabilities to use"
|
||||||
|
},
|
||||||
|
"max_tokens": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "(Optional) The maximum number of tokens to generate"
|
||||||
|
},
|
||||||
|
"n": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "(Optional) The number of completions to generate"
|
||||||
|
},
|
||||||
|
"presence_penalty": {
|
||||||
|
"type": "number",
|
||||||
|
"description": "(Optional) The penalty for repeated tokens"
|
||||||
|
},
|
||||||
|
"seed": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "(Optional) The seed to use"
|
||||||
|
},
|
||||||
|
"stop": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"description": "(Optional) The stop tokens to use"
|
||||||
|
},
|
||||||
|
"stream": {
|
||||||
|
"type": "boolean",
|
||||||
|
"description": "(Optional) Whether to stream the response"
|
||||||
|
},
|
||||||
|
"stream_options": {
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "null"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "array"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "object"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"description": "(Optional) The stream options to use"
|
||||||
|
},
|
||||||
|
"temperature": {
|
||||||
|
"type": "number",
|
||||||
|
"description": "(Optional) The temperature to use"
|
||||||
|
},
|
||||||
|
"top_p": {
|
||||||
|
"type": "number",
|
||||||
|
"description": "(Optional) The top p to use"
|
||||||
|
},
|
||||||
|
"user": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "(Optional) The user to use"
|
||||||
|
},
|
||||||
|
"guided_choice": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"prompt_logprobs": {
|
||||||
|
"type": "integer"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"model",
|
||||||
|
"prompt"
|
||||||
|
],
|
||||||
|
"title": "OpenaiCompletionRequest"
|
||||||
|
},
|
||||||
|
"OpenAICompletion": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"id": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"choices": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/OpenAICompletionChoice"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"created": {
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"model": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"object": {
|
||||||
|
"type": "string",
|
||||||
|
"const": "text_completion",
|
||||||
|
"default": "text_completion"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"id",
|
||||||
|
"choices",
|
||||||
|
"created",
|
||||||
|
"model",
|
||||||
|
"object"
|
||||||
|
],
|
||||||
|
"title": "OpenAICompletion",
|
||||||
|
"description": "Response from an OpenAI-compatible completion request."
|
||||||
|
},
|
||||||
|
"OpenAICompletionChoice": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"finish_reason": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"text": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"index": {
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"logprobs": {
|
||||||
|
"$ref": "#/components/schemas/OpenAIChoiceLogprobs"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"finish_reason",
|
||||||
|
"text",
|
||||||
|
"index"
|
||||||
|
],
|
||||||
|
"title": "OpenAICompletionChoice",
|
||||||
|
"description": "A choice from an OpenAI-compatible completion response."
|
||||||
|
},
|
||||||
|
"OpenAIModel": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"id": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"object": {
|
||||||
|
"type": "string",
|
||||||
|
"const": "model",
|
||||||
|
"default": "model"
|
||||||
|
},
|
||||||
|
"created": {
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"owned_by": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"id",
|
||||||
|
"object",
|
||||||
|
"created",
|
||||||
|
"owned_by"
|
||||||
|
],
|
||||||
|
"title": "OpenAIModel",
|
||||||
|
"description": "A model from OpenAI."
|
||||||
|
},
|
||||||
|
"OpenAIListModelsResponse": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"data": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/OpenAIModel"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"data"
|
||||||
|
],
|
||||||
|
"title": "OpenAIListModelsResponse"
|
||||||
|
},
        "DPOAlignmentConfig": {
            "type": "object",
            "properties": {
|
665  docs/_static/llama-stack-spec.yaml (vendored)
|
@@ -2131,6 +2131,91 @@ paths:
            schema:
              $ref: '#/components/schemas/LogEventRequest'
        required: true
|
/v1/openai/v1/chat/completions:
|
||||||
|
post:
|
||||||
|
responses:
|
||||||
|
'200':
|
||||||
|
description: OK
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/OpenAIChatCompletion'
|
||||||
|
'400':
|
||||||
|
$ref: '#/components/responses/BadRequest400'
|
||||||
|
'429':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/TooManyRequests429
|
||||||
|
'500':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/InternalServerError500
|
||||||
|
default:
|
||||||
|
$ref: '#/components/responses/DefaultError'
|
||||||
|
tags:
|
||||||
|
- Inference
|
||||||
|
description: >-
|
||||||
|
Generate an OpenAI-compatible chat completion for the given messages using
|
||||||
|
the specified model.
|
||||||
|
parameters: []
|
||||||
|
requestBody:
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/OpenaiChatCompletionRequest'
|
||||||
|
required: true
|
||||||
|
/v1/openai/v1/completions:
|
||||||
|
post:
|
||||||
|
responses:
|
||||||
|
'200':
|
||||||
|
description: OK
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/OpenAICompletion'
|
||||||
|
'400':
|
||||||
|
$ref: '#/components/responses/BadRequest400'
|
||||||
|
'429':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/TooManyRequests429
|
||||||
|
'500':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/InternalServerError500
|
||||||
|
default:
|
||||||
|
$ref: '#/components/responses/DefaultError'
|
||||||
|
tags:
|
||||||
|
- Inference
|
||||||
|
description: >-
|
||||||
|
Generate an OpenAI-compatible completion for the given prompt using the specified
|
||||||
|
model.
|
||||||
|
parameters: []
|
||||||
|
requestBody:
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/OpenaiCompletionRequest'
|
||||||
|
required: true
|
||||||
|
/v1/openai/v1/models:
|
||||||
|
get:
|
||||||
|
responses:
|
||||||
|
'200':
|
||||||
|
description: OK
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/OpenAIListModelsResponse'
|
||||||
|
'400':
|
||||||
|
$ref: '#/components/responses/BadRequest400'
|
||||||
|
'429':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/TooManyRequests429
|
||||||
|
'500':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/InternalServerError500
|
||||||
|
default:
|
||||||
|
$ref: '#/components/responses/DefaultError'
|
||||||
|
tags:
|
||||||
|
- Models
|
||||||
|
description: ''
|
||||||
|
parameters: []
|
||||||
/v1/post-training/preference-optimize:
|
/v1/post-training/preference-optimize:
|
||||||
post:
|
post:
|
||||||
responses:
|
responses:
|
||||||
|
@ -5980,6 +6065,586 @@ components:
|
||||||
- event
|
- event
|
||||||
- ttl_seconds
|
- ttl_seconds
|
||||||
title: LogEventRequest
|
title: LogEventRequest
|
||||||
|
OpenAIAssistantMessageParam:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
role:
|
||||||
|
type: string
|
||||||
|
const: assistant
|
||||||
|
default: assistant
|
||||||
|
description: >-
|
||||||
|
Must be "assistant" to identify this as the model's response
|
||||||
|
content:
|
||||||
|
$ref: '#/components/schemas/InterleavedContent'
|
||||||
|
description: The content of the model's response
|
||||||
|
name:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
(Optional) The name of the assistant message participant.
|
||||||
|
tool_calls:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/ToolCall'
|
||||||
|
description: >-
|
||||||
|
List of tool calls. Each tool call is a ToolCall object.
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- role
|
||||||
|
- content
|
||||||
|
title: OpenAIAssistantMessageParam
|
||||||
|
description: >-
|
||||||
|
A message containing the model's (assistant) response in an OpenAI-compatible
|
||||||
|
chat completion request.
|
||||||
|
OpenAIDeveloperMessageParam:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
role:
|
||||||
|
type: string
|
||||||
|
const: developer
|
||||||
|
default: developer
|
||||||
|
description: >-
|
||||||
|
Must be "developer" to identify this as a developer message
|
||||||
|
content:
|
||||||
|
$ref: '#/components/schemas/InterleavedContent'
|
||||||
|
description: The content of the developer message
|
||||||
|
name:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
(Optional) The name of the developer message participant.
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- role
|
||||||
|
- content
|
||||||
|
title: OpenAIDeveloperMessageParam
|
||||||
|
description: >-
|
||||||
|
A message from the developer in an OpenAI-compatible chat completion request.
|
||||||
|
OpenAIMessageParam:
|
||||||
|
oneOf:
|
||||||
|
- $ref: '#/components/schemas/OpenAIUserMessageParam'
|
||||||
|
- $ref: '#/components/schemas/OpenAISystemMessageParam'
|
||||||
|
- $ref: '#/components/schemas/OpenAIAssistantMessageParam'
|
||||||
|
- $ref: '#/components/schemas/OpenAIToolMessageParam'
|
||||||
|
- $ref: '#/components/schemas/OpenAIDeveloperMessageParam'
|
||||||
|
discriminator:
|
||||||
|
propertyName: role
|
||||||
|
mapping:
|
||||||
|
user: '#/components/schemas/OpenAIUserMessageParam'
|
||||||
|
system: '#/components/schemas/OpenAISystemMessageParam'
|
||||||
|
assistant: '#/components/schemas/OpenAIAssistantMessageParam'
|
||||||
|
tool: '#/components/schemas/OpenAIToolMessageParam'
|
||||||
|
developer: '#/components/schemas/OpenAIDeveloperMessageParam'
|
||||||
|
OpenAISystemMessageParam:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
role:
|
||||||
|
type: string
|
||||||
|
const: system
|
||||||
|
default: system
|
||||||
|
description: >-
|
||||||
|
Must be "system" to identify this as a system message
|
||||||
|
content:
|
||||||
|
$ref: '#/components/schemas/InterleavedContent'
|
||||||
|
description: >-
|
||||||
|
The content of the "system prompt". If multiple system messages are provided,
|
||||||
|
they are concatenated. The underlying Llama Stack code may also add other
|
||||||
|
system messages (for example, for formatting tool definitions).
|
||||||
|
name:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
(Optional) The name of the system message participant.
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- role
|
||||||
|
- content
|
||||||
|
title: OpenAISystemMessageParam
|
||||||
|
description: >-
|
||||||
|
A system message providing instructions or context to the model.
|
||||||
|
OpenAIToolMessageParam:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
role:
|
||||||
|
type: string
|
||||||
|
const: tool
|
||||||
|
default: tool
|
||||||
|
description: >-
|
||||||
|
Must be "tool" to identify this as a tool response
|
||||||
|
tool_call_id:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
Unique identifier for the tool call this response is for
|
||||||
|
content:
|
||||||
|
$ref: '#/components/schemas/InterleavedContent'
|
||||||
|
description: The response content from the tool
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- role
|
||||||
|
- tool_call_id
|
||||||
|
- content
|
||||||
|
title: OpenAIToolMessageParam
|
||||||
|
description: >-
|
||||||
|
A message representing the result of a tool invocation in an OpenAI-compatible
|
||||||
|
chat completion request.
|
||||||
|
OpenAIUserMessageParam:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
role:
|
||||||
|
type: string
|
||||||
|
const: user
|
||||||
|
default: user
|
||||||
|
description: >-
|
||||||
|
Must be "user" to identify this as a user message
|
||||||
|
content:
|
||||||
|
$ref: '#/components/schemas/InterleavedContent'
|
||||||
|
description: >-
|
||||||
|
The content of the message, which can include text and other media
|
||||||
|
name:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
(Optional) The name of the user message participant.
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- role
|
||||||
|
- content
|
||||||
|
title: OpenAIUserMessageParam
|
||||||
|
description: >-
|
||||||
|
A message from the user in an OpenAI-compatible chat completion request.
|
||||||
|
OpenaiChatCompletionRequest:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
model:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
The identifier of the model to use. The model must be registered with
|
||||||
|
Llama Stack and available via the /models endpoint.
|
||||||
|
messages:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/OpenAIMessageParam'
|
||||||
|
description: List of messages in the conversation
|
||||||
|
frequency_penalty:
|
||||||
|
type: number
|
||||||
|
description: >-
|
||||||
|
(Optional) The penalty for repeated tokens
|
||||||
|
function_call:
|
||||||
|
oneOf:
|
||||||
|
- type: string
|
||||||
|
- type: object
|
||||||
|
additionalProperties:
|
||||||
|
oneOf:
|
||||||
|
- type: 'null'
|
||||||
|
- type: boolean
|
||||||
|
- type: number
|
||||||
|
- type: string
|
||||||
|
- type: array
|
||||||
|
- type: object
|
||||||
|
description: (Optional) The function call to use
|
||||||
|
functions:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
type: object
|
||||||
|
additionalProperties:
|
||||||
|
oneOf:
|
||||||
|
- type: 'null'
|
||||||
|
- type: boolean
|
||||||
|
- type: number
|
||||||
|
- type: string
|
||||||
|
- type: array
|
||||||
|
- type: object
|
||||||
|
description: (Optional) List of functions to use
|
||||||
|
logit_bias:
|
||||||
|
type: object
|
||||||
|
additionalProperties:
|
||||||
|
type: number
|
||||||
|
description: (Optional) The logit bias to use
|
||||||
|
logprobs:
|
||||||
|
type: boolean
|
||||||
|
description: (Optional) The log probabilities to use
|
||||||
|
max_completion_tokens:
|
||||||
|
type: integer
|
||||||
|
description: >-
|
||||||
|
(Optional) The maximum number of tokens to generate
|
||||||
|
max_tokens:
|
||||||
|
type: integer
|
||||||
|
description: >-
|
||||||
|
(Optional) The maximum number of tokens to generate
|
||||||
|
n:
|
||||||
|
type: integer
|
||||||
|
description: >-
|
||||||
|
(Optional) The number of completions to generate
|
||||||
|
parallel_tool_calls:
|
||||||
|
type: boolean
|
||||||
|
description: >-
|
||||||
|
(Optional) Whether to parallelize tool calls
|
||||||
|
presence_penalty:
|
||||||
|
type: number
|
||||||
|
description: >-
|
||||||
|
(Optional) The penalty for repeated tokens
|
||||||
|
response_format:
|
||||||
|
type: object
|
||||||
|
additionalProperties:
|
||||||
|
type: string
|
||||||
|
description: (Optional) The response format to use
|
||||||
|
seed:
|
||||||
|
type: integer
|
||||||
|
description: (Optional) The seed to use
|
||||||
|
stop:
|
||||||
|
oneOf:
|
||||||
|
- type: string
|
||||||
|
- type: array
|
||||||
|
items:
|
||||||
|
type: string
|
||||||
|
description: (Optional) The stop tokens to use
|
||||||
|
stream:
|
||||||
|
type: boolean
|
||||||
|
description: >-
|
||||||
|
(Optional) Whether to stream the response
|
||||||
|
stream_options:
|
||||||
|
type: object
|
||||||
|
additionalProperties:
|
||||||
|
oneOf:
|
||||||
|
- type: 'null'
|
||||||
|
- type: boolean
|
||||||
|
- type: number
|
||||||
|
- type: string
|
||||||
|
- type: array
|
||||||
|
- type: object
|
||||||
|
description: (Optional) The stream options to use
|
||||||
|
temperature:
|
||||||
|
type: number
|
||||||
|
description: (Optional) The temperature to use
|
||||||
|
tool_choice:
|
||||||
|
oneOf:
|
||||||
|
- type: string
|
||||||
|
- type: object
|
||||||
|
additionalProperties:
|
||||||
|
oneOf:
|
||||||
|
- type: 'null'
|
||||||
|
- type: boolean
|
||||||
|
- type: number
|
||||||
|
- type: string
|
||||||
|
- type: array
|
||||||
|
- type: object
|
||||||
|
description: (Optional) The tool choice to use
|
||||||
|
tools:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
type: object
|
||||||
|
additionalProperties:
|
||||||
|
oneOf:
|
||||||
|
- type: 'null'
|
||||||
|
- type: boolean
|
||||||
|
- type: number
|
||||||
|
- type: string
|
||||||
|
- type: array
|
||||||
|
- type: object
|
||||||
|
description: (Optional) The tools to use
|
||||||
|
top_logprobs:
|
||||||
|
type: integer
|
||||||
|
description: >-
|
||||||
|
(Optional) The top log probabilities to use
|
||||||
|
top_p:
|
||||||
|
type: number
|
||||||
|
description: (Optional) The top p to use
|
||||||
|
user:
|
||||||
|
type: string
|
||||||
|
description: (Optional) The user to use
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- model
|
||||||
|
- messages
|
||||||
|
title: OpenaiChatCompletionRequest
|
||||||
|
OpenAIChatCompletion:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
id:
|
||||||
|
type: string
|
||||||
|
description: The ID of the chat completion
|
||||||
|
choices:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/OpenAIChoice'
|
||||||
|
description: List of choices
|
||||||
|
object:
|
||||||
|
type: string
|
||||||
|
const: chat.completion
|
||||||
|
default: chat.completion
|
||||||
|
description: >-
|
||||||
|
The object type, which will be "chat.completion"
|
||||||
|
created:
|
||||||
|
type: integer
|
||||||
|
description: >-
|
||||||
|
The Unix timestamp in seconds when the chat completion was created
|
||||||
|
model:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
The model that was used to generate the chat completion
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- id
|
||||||
|
- choices
|
||||||
|
- object
|
||||||
|
- created
|
||||||
|
- model
|
||||||
|
title: OpenAIChatCompletion
|
||||||
|
description: >-
|
||||||
|
Response from an OpenAI-compatible chat completion request.
|
||||||
|
OpenAIChoice:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
message:
|
||||||
|
$ref: '#/components/schemas/OpenAIMessageParam'
|
||||||
|
description: The message from the model
|
||||||
|
finish_reason:
|
||||||
|
type: string
|
||||||
|
description: The reason the model stopped generating
|
||||||
|
index:
|
||||||
|
type: integer
|
||||||
|
logprobs:
|
||||||
|
$ref: '#/components/schemas/OpenAIChoiceLogprobs'
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- message
|
||||||
|
- finish_reason
|
||||||
|
- index
|
||||||
|
title: OpenAIChoice
|
||||||
|
description: >-
|
||||||
|
A choice from an OpenAI-compatible chat completion response.
|
||||||
|
OpenAIChoiceLogprobs:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
content:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/OpenAITokenLogProb'
|
||||||
|
refusal:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/OpenAITokenLogProb'
|
||||||
|
additionalProperties: false
|
||||||
|
title: OpenAIChoiceLogprobs
|
||||||
|
description: >-
|
||||||
|
The log probabilities for the tokens in the message from an OpenAI-compatible
|
||||||
|
chat completion response.
|
||||||
|
OpenAITokenLogProb:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
token:
|
||||||
|
type: string
|
||||||
|
bytes:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
type: integer
|
||||||
|
logprob:
|
||||||
|
type: number
|
||||||
|
top_logprobs:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/OpenAITopLogProb'
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- token
|
||||||
|
- logprob
|
||||||
|
- top_logprobs
|
||||||
|
title: OpenAITokenLogProb
|
||||||
|
description: >-
|
||||||
|
The log probability for a token from an OpenAI-compatible chat completion
|
||||||
|
response.
|
||||||
|
OpenAITopLogProb:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
token:
|
||||||
|
type: string
|
||||||
|
bytes:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
type: integer
|
||||||
|
logprob:
|
||||||
|
type: number
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- token
|
||||||
|
- logprob
|
||||||
|
title: OpenAITopLogProb
|
||||||
|
description: >-
|
||||||
|
The top log probability for a token from an OpenAI-compatible chat completion
|
||||||
|
response.
|
||||||
|
OpenaiCompletionRequest:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
model:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
The identifier of the model to use. The model must be registered with
|
||||||
|
Llama Stack and available via the /models endpoint.
|
||||||
|
prompt:
|
||||||
|
oneOf:
|
||||||
|
- type: string
|
||||||
|
- type: array
|
||||||
|
items:
|
||||||
|
type: string
|
||||||
|
- type: array
|
||||||
|
items:
|
||||||
|
type: integer
|
||||||
|
- type: array
|
||||||
|
items:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
type: integer
|
||||||
|
description: The prompt to generate a completion for
|
||||||
|
best_of:
|
||||||
|
type: integer
|
||||||
|
description: >-
|
||||||
|
(Optional) The number of completions to generate
|
||||||
|
echo:
|
||||||
|
type: boolean
|
||||||
|
description: (Optional) Whether to echo the prompt
|
||||||
|
frequency_penalty:
|
||||||
|
type: number
|
||||||
|
description: >-
|
||||||
|
(Optional) The penalty for repeated tokens
|
||||||
|
logit_bias:
|
||||||
|
type: object
|
||||||
|
additionalProperties:
|
||||||
|
type: number
|
||||||
|
description: (Optional) The logit bias to use
|
||||||
|
logprobs:
|
||||||
|
type: boolean
|
||||||
|
description: (Optional) The log probabilities to use
|
||||||
|
max_tokens:
|
||||||
|
type: integer
|
||||||
|
description: >-
|
||||||
|
(Optional) The maximum number of tokens to generate
|
||||||
|
n:
|
||||||
|
type: integer
|
||||||
|
description: >-
|
||||||
|
(Optional) The number of completions to generate
|
||||||
|
presence_penalty:
|
||||||
|
type: number
|
||||||
|
description: >-
|
||||||
|
(Optional) The penalty for repeated tokens
|
||||||
|
seed:
|
||||||
|
type: integer
|
||||||
|
description: (Optional) The seed to use
|
||||||
|
stop:
|
||||||
|
oneOf:
|
||||||
|
- type: string
|
||||||
|
- type: array
|
||||||
|
items:
|
||||||
|
type: string
|
||||||
|
description: (Optional) The stop tokens to use
|
||||||
|
stream:
|
||||||
|
type: boolean
|
||||||
|
description: >-
|
||||||
|
(Optional) Whether to stream the response
|
||||||
|
stream_options:
|
||||||
|
type: object
|
||||||
|
additionalProperties:
|
||||||
|
oneOf:
|
||||||
|
- type: 'null'
|
||||||
|
- type: boolean
|
||||||
|
- type: number
|
||||||
|
- type: string
|
||||||
|
- type: array
|
||||||
|
- type: object
|
||||||
|
description: (Optional) The stream options to use
|
||||||
|
temperature:
|
||||||
|
type: number
|
||||||
|
description: (Optional) The temperature to use
|
||||||
|
top_p:
|
||||||
|
type: number
|
||||||
|
description: (Optional) The top p to use
|
||||||
|
user:
|
||||||
|
type: string
|
||||||
|
description: (Optional) The user to use
|
||||||
|
guided_choice:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
type: string
|
||||||
|
prompt_logprobs:
|
||||||
|
type: integer
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- model
|
||||||
|
- prompt
|
||||||
|
title: OpenaiCompletionRequest
|
||||||
|
OpenAICompletion:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
id:
|
||||||
|
type: string
|
||||||
|
choices:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/OpenAICompletionChoice'
|
||||||
|
created:
|
||||||
|
type: integer
|
||||||
|
model:
|
||||||
|
type: string
|
||||||
|
object:
|
||||||
|
type: string
|
||||||
|
const: text_completion
|
||||||
|
default: text_completion
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- id
|
||||||
|
- choices
|
||||||
|
- created
|
||||||
|
- model
|
||||||
|
- object
|
||||||
|
title: OpenAICompletion
|
||||||
|
description: >-
|
||||||
|
Response from an OpenAI-compatible completion request.
|
||||||
|
OpenAICompletionChoice:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
finish_reason:
|
||||||
|
type: string
|
||||||
|
text:
|
||||||
|
type: string
|
||||||
|
index:
|
||||||
|
type: integer
|
||||||
|
logprobs:
|
||||||
|
$ref: '#/components/schemas/OpenAIChoiceLogprobs'
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- finish_reason
|
||||||
|
- text
|
||||||
|
- index
|
||||||
|
title: OpenAICompletionChoice
|
||||||
|
description: >-
|
||||||
|
A choice from an OpenAI-compatible completion response.
|
||||||
|
OpenAIModel:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
id:
|
||||||
|
type: string
|
||||||
|
object:
|
||||||
|
type: string
|
||||||
|
const: model
|
||||||
|
default: model
|
||||||
|
created:
|
||||||
|
type: integer
|
||||||
|
owned_by:
|
||||||
|
type: string
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- id
|
||||||
|
- object
|
||||||
|
- created
|
||||||
|
- owned_by
|
||||||
|
title: OpenAIModel
|
||||||
|
description: A model from OpenAI.
|
||||||
|
OpenAIListModelsResponse:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
data:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/OpenAIModel'
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- data
|
||||||
|
title: OpenAIListModelsResponse
|
||||||
DPOAlignmentConfig:
|
DPOAlignmentConfig:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
|
|
|
@@ -442,6 +442,217 @@ class EmbeddingsResponse(BaseModel):
    embeddings: List[List[float]]


@json_schema_type
class OpenAIUserMessageParam(BaseModel):
    """A message from the user in an OpenAI-compatible chat completion request.

    :param role: Must be "user" to identify this as a user message
    :param content: The content of the message, which can include text and other media
    :param name: (Optional) The name of the user message participant.
    """

    role: Literal["user"] = "user"
    content: InterleavedContent
    name: Optional[str] = None


@json_schema_type
class OpenAISystemMessageParam(BaseModel):
    """A system message providing instructions or context to the model.

    :param role: Must be "system" to identify this as a system message
    :param content: The content of the "system prompt". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions).
    :param name: (Optional) The name of the system message participant.
    """

    role: Literal["system"] = "system"
    content: InterleavedContent
    name: Optional[str] = None


@json_schema_type
class OpenAIAssistantMessageParam(BaseModel):
    """A message containing the model's (assistant) response in an OpenAI-compatible chat completion request.

    :param role: Must be "assistant" to identify this as the model's response
    :param content: The content of the model's response
    :param name: (Optional) The name of the assistant message participant.
    :param tool_calls: List of tool calls. Each tool call is a ToolCall object.
    """

    role: Literal["assistant"] = "assistant"
    content: InterleavedContent
    name: Optional[str] = None
    tool_calls: Optional[List[ToolCall]] = Field(default_factory=list)


@json_schema_type
class OpenAIToolMessageParam(BaseModel):
    """A message representing the result of a tool invocation in an OpenAI-compatible chat completion request.

    :param role: Must be "tool" to identify this as a tool response
    :param tool_call_id: Unique identifier for the tool call this response is for
    :param content: The response content from the tool
    """

    role: Literal["tool"] = "tool"
    tool_call_id: str
    content: InterleavedContent


@json_schema_type
class OpenAIDeveloperMessageParam(BaseModel):
    """A message from the developer in an OpenAI-compatible chat completion request.

    :param role: Must be "developer" to identify this as a developer message
    :param content: The content of the developer message
    :param name: (Optional) The name of the developer message participant.
    """

    role: Literal["developer"] = "developer"
    content: InterleavedContent
    name: Optional[str] = None


OpenAIMessageParam = Annotated[
    Union[
        OpenAIUserMessageParam,
        OpenAISystemMessageParam,
        OpenAIAssistantMessageParam,
        OpenAIToolMessageParam,
        OpenAIDeveloperMessageParam,
    ],
    Field(discriminator="role"),
]
register_schema(OpenAIMessageParam, name="OpenAIMessageParam")


@json_schema_type
class OpenAITopLogProb(BaseModel):
    """The top log probability for a token from an OpenAI-compatible chat completion response.

    :token: The token
    :bytes: (Optional) The bytes for the token
    :logprob: The log probability of the token
    """

    token: str
    bytes: Optional[List[int]] = None
    logprob: float


@json_schema_type
class OpenAITokenLogProb(BaseModel):
    """The log probability for a token from an OpenAI-compatible chat completion response.

    :token: The token
    :bytes: (Optional) The bytes for the token
    :logprob: The log probability of the token
    :top_logprobs: The top log probabilities for the token
    """

    token: str
    bytes: Optional[List[int]] = None
    logprob: float
    top_logprobs: List[OpenAITopLogProb]


@json_schema_type
class OpenAIChoiceLogprobs(BaseModel):
    """The log probabilities for the tokens in the message from an OpenAI-compatible chat completion response.

    :content: (Optional) The log probabilities for the tokens in the message
    :refusal: (Optional) The log probabilities for the tokens in the message
    """

    content: Optional[List[OpenAITokenLogProb]] = None
    refusal: Optional[List[OpenAITokenLogProb]] = None


@json_schema_type
class OpenAIChoice(BaseModel):
    """A choice from an OpenAI-compatible chat completion response.

    :param message: The message from the model
    :param finish_reason: The reason the model stopped generating
    :index: The index of the choice
    :logprobs: (Optional) The log probabilities for the tokens in the message
    """

    message: OpenAIMessageParam
    finish_reason: str
    index: int
    logprobs: Optional[OpenAIChoiceLogprobs] = None


@json_schema_type
class OpenAIChatCompletion(BaseModel):
    """Response from an OpenAI-compatible chat completion request.

    :param id: The ID of the chat completion
    :param choices: List of choices
    :param object: The object type, which will be "chat.completion"
    :param created: The Unix timestamp in seconds when the chat completion was created
    :param model: The model that was used to generate the chat completion
    """

    id: str
    choices: List[OpenAIChoice]
    object: Literal["chat.completion"] = "chat.completion"
    created: int
    model: str


@json_schema_type
class OpenAICompletionLogprobs(BaseModel):
    """The log probabilities for the tokens in the message from an OpenAI-compatible completion response.

    :text_offset: (Optional) The offset of the token in the text
    :token_logprobs: (Optional) The log probabilities for the tokens
    :tokens: (Optional) The tokens
    :top_logprobs: (Optional) The top log probabilities for the tokens
    """

    text_offset: Optional[List[int]] = None
    token_logprobs: Optional[List[float]] = None
    tokens: Optional[List[str]] = None
    top_logprobs: Optional[List[Dict[str, float]]] = None


@json_schema_type
class OpenAICompletionChoice(BaseModel):
    """A choice from an OpenAI-compatible completion response.

    :finish_reason: The reason the model stopped generating
    :text: The text of the choice
    :index: The index of the choice
    :logprobs: (Optional) The log probabilities for the tokens in the choice
|
||||||
|
"""
|
||||||
|
|
||||||
|
finish_reason: str
|
||||||
|
text: str
|
||||||
|
index: int
|
||||||
|
logprobs: Optional[OpenAIChoiceLogprobs] = None
|
||||||
|
|
||||||
|
|
||||||
|
@json_schema_type
|
||||||
|
class OpenAICompletion(BaseModel):
|
||||||
|
"""Response from an OpenAI-compatible completion request.
|
||||||
|
|
||||||
|
:id: The ID of the completion
|
||||||
|
:choices: List of choices
|
||||||
|
:created: The Unix timestamp in seconds when the completion was created
|
||||||
|
:model: The model that was used to generate the completion
|
||||||
|
:object: The object type, which will be "text_completion"
|
||||||
|
"""
|
||||||
|
|
||||||
|
id: str
|
||||||
|
choices: List[OpenAICompletionChoice]
|
||||||
|
created: int
|
||||||
|
model: str
|
||||||
|
object: Literal["text_completion"] = "text_completion"
|
||||||
|
|
||||||
|
|
||||||
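For orientation, a minimal sketch of how these request and response models compose. It assumes the module's existing imports plus `OpenAIUserMessageParam`, which is defined earlier in this file; the IDs, timestamp, and model name are placeholders.

```python
# Illustrative sketch only: builds a typed message list and a response object
# using the Pydantic models above.
messages: List[OpenAIMessageParam] = [
    OpenAISystemMessageParam(content="You are a terse assistant."),
    OpenAIUserMessageParam(content="Say hello."),  # assumed to mirror the system message fields
]

response = OpenAIChatCompletion(
    id="chatcmpl-123",
    created=1700000000,
    model="meta-llama/Llama-3.2-3B-Instruct",
    choices=[
        OpenAIChoice(
            index=0,
            finish_reason="stop",
            message=OpenAIAssistantMessageParam(content="Hello!"),
        )
    ],
)
assert response.object == "chat.completion"
```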
class ModelStore(Protocol):
    async def get_model(self, identifier: str) -> Model: ...


@@ -564,3 +775,105 @@ class Inference(Protocol):
        :returns: An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}
        """
        ...

    @webmethod(route="/openai/v1/completions", method="POST")
    async def openai_completion(
        self,
        # Standard OpenAI completion parameters
        model: str,
        prompt: Union[str, List[str], List[int], List[List[int]]],
        best_of: Optional[int] = None,
        echo: Optional[bool] = None,
        frequency_penalty: Optional[float] = None,
        logit_bias: Optional[Dict[str, float]] = None,
        logprobs: Optional[bool] = None,
        max_tokens: Optional[int] = None,
        n: Optional[int] = None,
        presence_penalty: Optional[float] = None,
        seed: Optional[int] = None,
        stop: Optional[Union[str, List[str]]] = None,
        stream: Optional[bool] = None,
        stream_options: Optional[Dict[str, Any]] = None,
        temperature: Optional[float] = None,
        top_p: Optional[float] = None,
        user: Optional[str] = None,
        # vLLM-specific parameters
        guided_choice: Optional[List[str]] = None,
        prompt_logprobs: Optional[int] = None,
    ) -> OpenAICompletion:
        """Generate an OpenAI-compatible completion for the given prompt using the specified model.

        :param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
        :param prompt: The prompt to generate a completion for
        :param best_of: (Optional) The number of completions to generate
        :param echo: (Optional) Whether to echo the prompt
        :param frequency_penalty: (Optional) The penalty for repeated tokens
        :param logit_bias: (Optional) The logit bias to use
        :param logprobs: (Optional) The log probabilities to use
        :param max_tokens: (Optional) The maximum number of tokens to generate
        :param n: (Optional) The number of completions to generate
        :param presence_penalty: (Optional) The penalty for repeated tokens
        :param seed: (Optional) The seed to use
        :param stop: (Optional) The stop tokens to use
        :param stream: (Optional) Whether to stream the response
        :param stream_options: (Optional) The stream options to use
        :param temperature: (Optional) The temperature to use
        :param top_p: (Optional) The top p to use
        :param user: (Optional) The user to use
        """
        ...

    @webmethod(route="/openai/v1/chat/completions", method="POST")
    async def openai_chat_completion(
        self,
        model: str,
        messages: List[OpenAIMessageParam],
        frequency_penalty: Optional[float] = None,
        function_call: Optional[Union[str, Dict[str, Any]]] = None,
        functions: Optional[List[Dict[str, Any]]] = None,
        logit_bias: Optional[Dict[str, float]] = None,
        logprobs: Optional[bool] = None,
        max_completion_tokens: Optional[int] = None,
        max_tokens: Optional[int] = None,
        n: Optional[int] = None,
        parallel_tool_calls: Optional[bool] = None,
        presence_penalty: Optional[float] = None,
        response_format: Optional[Dict[str, str]] = None,
        seed: Optional[int] = None,
        stop: Optional[Union[str, List[str]]] = None,
        stream: Optional[bool] = None,
        stream_options: Optional[Dict[str, Any]] = None,
        temperature: Optional[float] = None,
        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
        tools: Optional[List[Dict[str, Any]]] = None,
        top_logprobs: Optional[int] = None,
        top_p: Optional[float] = None,
        user: Optional[str] = None,
    ) -> OpenAIChatCompletion:
        """Generate an OpenAI-compatible chat completion for the given messages using the specified model.

        :param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
        :param messages: List of messages in the conversation
        :param frequency_penalty: (Optional) The penalty for repeated tokens
        :param function_call: (Optional) The function call to use
        :param functions: (Optional) List of functions to use
        :param logit_bias: (Optional) The logit bias to use
        :param logprobs: (Optional) The log probabilities to use
        :param max_completion_tokens: (Optional) The maximum number of tokens to generate
        :param max_tokens: (Optional) The maximum number of tokens to generate
        :param n: (Optional) The number of completions to generate
        :param parallel_tool_calls: (Optional) Whether to parallelize tool calls
        :param presence_penalty: (Optional) The penalty for repeated tokens
        :param response_format: (Optional) The response format to use
        :param seed: (Optional) The seed to use
        :param stop: (Optional) The stop tokens to use
        :param stream: (Optional) Whether to stream the response
        :param stream_options: (Optional) The stream options to use
        :param temperature: (Optional) The temperature to use
        :param tool_choice: (Optional) The tool choice to use
        :param tools: (Optional) The tools to use
        :param top_logprobs: (Optional) The top log probabilities to use
        :param top_p: (Optional) The top p to use
        :param user: (Optional) The user to use
        """
        ...
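Because these routes mirror OpenAI's wire format, a stock OpenAI client pointed at a running Llama Stack server can exercise them directly. A hedged sketch; the base URL, API key, and model ID are placeholders for your own deployment:

```python
from openai import OpenAI

# Adjust base_url/model to your running Llama Stack distribution.
client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

chat = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "Write a haiku about routers."}],
    temperature=0.7,
)
print(chat.choices[0].message.content)
```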
@@ -56,12 +56,35 @@ class ListModelsResponse(BaseModel):
    data: List[Model]


@json_schema_type
class OpenAIModel(BaseModel):
    """A model from OpenAI.

    :id: The ID of the model
    :object: The object type, which will be "model"
    :created: The Unix timestamp in seconds when the model was created
    :owned_by: The owner of the model
    """

    id: str
    object: Literal["model"] = "model"
    created: int
    owned_by: str


class OpenAIListModelsResponse(BaseModel):
    data: List[OpenAIModel]


@runtime_checkable
@trace_protocol
class Models(Protocol):
    @webmethod(route="/models", method="GET")
    async def list_models(self) -> ListModelsResponse: ...

    @webmethod(route="/openai/v1/models", method="GET")
    async def openai_list_models(self) -> OpenAIListModelsResponse: ...

    @webmethod(route="/models/{model_id:path}", method="GET")
    async def get_model(
        self,
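A small sketch of listing models through the new endpoint with the same client; the entries returned are whatever models the stack has registered, and the host/port below are placeholders.

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")
for m in client.models.list():
    # Each entry maps onto OpenAIModel: id, object="model", created, owned_by.
    print(m.id, m.owned_by)
```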
@@ -35,6 +35,7 @@ from llama_stack.apis.inference import (
    ToolDefinition,
    ToolPromptFormat,
)
from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
from llama_stack.apis.models import Model, ModelType
from llama_stack.apis.safety import RunShieldResponse, Safety
from llama_stack.apis.scoring import (
@@ -419,6 +420,126 @@ class InferenceRouter(Inference):
            task_type=task_type,
        )

    async def openai_completion(
        self,
        model: str,
        prompt: Union[str, List[str], List[int], List[List[int]]],
        best_of: Optional[int] = None,
        echo: Optional[bool] = None,
        frequency_penalty: Optional[float] = None,
        logit_bias: Optional[Dict[str, float]] = None,
        logprobs: Optional[bool] = None,
        max_tokens: Optional[int] = None,
        n: Optional[int] = None,
        presence_penalty: Optional[float] = None,
        seed: Optional[int] = None,
        stop: Optional[Union[str, List[str]]] = None,
        stream: Optional[bool] = None,
        stream_options: Optional[Dict[str, Any]] = None,
        temperature: Optional[float] = None,
        top_p: Optional[float] = None,
        user: Optional[str] = None,
        guided_choice: Optional[List[str]] = None,
        prompt_logprobs: Optional[int] = None,
    ) -> OpenAICompletion:
        logger.debug(
            f"InferenceRouter.openai_completion: {model=}, {stream=}, {prompt=}",
        )
        model_obj = await self.routing_table.get_model(model)
        if model_obj is None:
            raise ValueError(f"Model '{model}' not found")
        if model_obj.model_type == ModelType.embedding:
            raise ValueError(f"Model '{model}' is an embedding model and does not support completions")

        params = dict(
            model=model_obj.identifier,
            prompt=prompt,
            best_of=best_of,
            echo=echo,
            frequency_penalty=frequency_penalty,
            logit_bias=logit_bias,
            logprobs=logprobs,
            max_tokens=max_tokens,
            n=n,
            presence_penalty=presence_penalty,
            seed=seed,
            stop=stop,
            stream=stream,
            stream_options=stream_options,
            temperature=temperature,
            top_p=top_p,
            user=user,
            guided_choice=guided_choice,
            prompt_logprobs=prompt_logprobs,
        )

        provider = self.routing_table.get_provider_impl(model_obj.identifier)
        return await provider.openai_completion(**params)

    async def openai_chat_completion(
        self,
        model: str,
        messages: List[OpenAIMessageParam],
        frequency_penalty: Optional[float] = None,
        function_call: Optional[Union[str, Dict[str, Any]]] = None,
        functions: Optional[List[Dict[str, Any]]] = None,
        logit_bias: Optional[Dict[str, float]] = None,
        logprobs: Optional[bool] = None,
        max_completion_tokens: Optional[int] = None,
        max_tokens: Optional[int] = None,
        n: Optional[int] = None,
        parallel_tool_calls: Optional[bool] = None,
        presence_penalty: Optional[float] = None,
        response_format: Optional[Dict[str, str]] = None,
        seed: Optional[int] = None,
        stop: Optional[Union[str, List[str]]] = None,
        stream: Optional[bool] = None,
        stream_options: Optional[Dict[str, Any]] = None,
        temperature: Optional[float] = None,
        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
        tools: Optional[List[Dict[str, Any]]] = None,
        top_logprobs: Optional[int] = None,
        top_p: Optional[float] = None,
        user: Optional[str] = None,
    ) -> OpenAIChatCompletion:
        logger.debug(
            f"InferenceRouter.openai_chat_completion: {model=}, {stream=}, {messages=}",
        )
        model_obj = await self.routing_table.get_model(model)
        if model_obj is None:
            raise ValueError(f"Model '{model}' not found")
        if model_obj.model_type == ModelType.embedding:
            raise ValueError(f"Model '{model}' is an embedding model and does not support chat completions")

        params = dict(
            model=model_obj.identifier,
            messages=messages,
            frequency_penalty=frequency_penalty,
            function_call=function_call,
            functions=functions,
            logit_bias=logit_bias,
            logprobs=logprobs,
            max_completion_tokens=max_completion_tokens,
            max_tokens=max_tokens,
            n=n,
            parallel_tool_calls=parallel_tool_calls,
            presence_penalty=presence_penalty,
            response_format=response_format,
            seed=seed,
            stop=stop,
            stream=stream,
            stream_options=stream_options,
            temperature=temperature,
            tool_choice=tool_choice,
            tools=tools,
            top_logprobs=top_logprobs,
            top_p=top_p,
            user=user,
        )

        provider = self.routing_table.get_provider_impl(model_obj.identifier)
        return await provider.openai_chat_completion(**params)


class SafetyRouter(Safety):
    def __init__(
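The router only resolves the model to its provider and forwards the keyword arguments unchanged, so streaming behaves exactly as it would against OpenAI, assuming the backing provider supports streamed responses. A hedged usage sketch with placeholder host and model:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")
stream = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "Count to five."}],
    stream=True,
)
for chunk in stream:
    # Deltas arrive incrementally; content may be None on the final chunk.
    delta = chunk.choices[0].delta.content or ""
    print(delta, end="", flush=True)
```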
@@ -5,6 +5,7 @@
# the root directory of this source tree.

import logging
import time
import uuid
from typing import Any, Dict, List, Optional

@@ -23,7 +24,7 @@ from llama_stack.apis.datasets import (
    RowsDataSource,
    URIDataSource,
)
from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType
from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType, OpenAIListModelsResponse, OpenAIModel
from llama_stack.apis.resource import ResourceType
from llama_stack.apis.scoring_functions import (
    ListScoringFunctionsResponse,
@@ -254,6 +255,19 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):
    async def list_models(self) -> ListModelsResponse:
        return ListModelsResponse(data=await self.get_all_with_type("model"))

    async def openai_list_models(self) -> OpenAIListModelsResponse:
        models = await self.get_all_with_type("model")
        openai_models = [
            OpenAIModel(
                id=model.identifier,
                object="model",
                created=int(time.time()),
                owned_by="llama_stack",
            )
            for model in models
        ]
        return OpenAIListModelsResponse(data=openai_models)

    async def get_model(self, model_id: str) -> Model:
        model = await self.get_object_by_identifier("model", model_id)
        if model is None:
@@ -54,6 +54,10 @@ from llama_stack.providers.utils.inference.model_registry import (
    ModelRegistryHelper,
    build_hf_repo_model_entry,
)
from llama_stack.providers.utils.inference.openai_compat import (
    OpenAIChatCompletionUnsupportedMixin,
    OpenAICompletionUnsupportedMixin,
)
from llama_stack.providers.utils.inference.prompt_adapter import (
    augment_content_with_response_format_prompt,
    chat_completion_request_to_messages,
@@ -79,6 +83,8 @@ def llama4_builder_fn(config: MetaReferenceInferenceConfig, model_id: str, llama


class MetaReferenceInferenceImpl(
    OpenAICompletionUnsupportedMixin,
    OpenAIChatCompletionUnsupportedMixin,
    SentenceTransformerEmbeddingMixin,
    Inference,
    ModelsProtocolPrivate,
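The two `*UnsupportedMixin` classes are imported here but their implementation is not part of this diff. Given how they are mixed into providers that lack an OpenAI-compatible backend, a plausible reading is that they simply reject the new methods. A hypothetical sketch only, not the actual code in `llama_stack.providers.utils.inference.openai_compat`:

```python
class OpenAICompletionUnsupportedMixin:
    async def openai_completion(self, model: str, prompt, **kwargs):
        # Hypothetical behavior: refuse OpenAI-compatible completions outright.
        raise ValueError(f"{self.__class__.__name__} does not support OpenAI-compatible completions")


class OpenAIChatCompletionUnsupportedMixin:
    async def openai_chat_completion(self, model: str, messages, **kwargs):
        # Hypothetical behavior: refuse OpenAI-compatible chat completions outright.
        raise ValueError(f"{self.__class__.__name__} does not support OpenAI-compatible chat completions")
```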
@@ -23,6 +23,10 @@ from llama_stack.providers.datatypes import Model, ModelsProtocolPrivate
from llama_stack.providers.utils.inference.embedding_mixin import (
    SentenceTransformerEmbeddingMixin,
)
from llama_stack.providers.utils.inference.openai_compat import (
    OpenAIChatCompletionUnsupportedMixin,
    OpenAICompletionUnsupportedMixin,
)

from .config import SentenceTransformersInferenceConfig

@@ -30,6 +34,8 @@ log = logging.getLogger(__name__)


class SentenceTransformersInferenceImpl(
    OpenAIChatCompletionUnsupportedMixin,
    OpenAICompletionUnsupportedMixin,
    SentenceTransformerEmbeddingMixin,
    Inference,
    ModelsProtocolPrivate,
@@ -66,8 +66,10 @@ from llama_stack.providers.utils.inference.model_registry import (
    ModelsProtocolPrivate,
)
from llama_stack.providers.utils.inference.openai_compat import (
    OpenAIChatCompletionUnsupportedMixin,
    OpenAICompatCompletionChoice,
    OpenAICompatCompletionResponse,
    OpenAICompletionUnsupportedMixin,
    get_stop_reason,
    process_chat_completion_stream_response,
)
@@ -172,7 +174,12 @@ def _convert_sampling_params(
    return vllm_sampling_params


class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
class VLLMInferenceImpl(
    Inference,
    OpenAIChatCompletionUnsupportedMixin,
    OpenAICompletionUnsupportedMixin,
    ModelsProtocolPrivate,
):
    """
    vLLM-based inference model adapter for Llama Stack with support for multiple models.
@@ -36,8 +36,10 @@ from llama_stack.providers.utils.inference.model_registry import (
    ModelRegistryHelper,
)
from llama_stack.providers.utils.inference.openai_compat import (
    OpenAIChatCompletionUnsupportedMixin,
    OpenAICompatCompletionChoice,
    OpenAICompatCompletionResponse,
    OpenAICompletionUnsupportedMixin,
    get_sampling_strategy_options,
    process_chat_completion_response,
    process_chat_completion_stream_response,
@@ -51,7 +53,12 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
from .models import MODEL_ENTRIES


class BedrockInferenceAdapter(ModelRegistryHelper, Inference):
class BedrockInferenceAdapter(
    ModelRegistryHelper,
    Inference,
    OpenAIChatCompletionUnsupportedMixin,
    OpenAICompletionUnsupportedMixin,
):
    def __init__(self, config: BedrockConfig) -> None:
        ModelRegistryHelper.__init__(self, MODEL_ENTRIES)
        self._config = config
@@ -34,6 +34,8 @@ from llama_stack.providers.utils.inference.model_registry import (
    ModelRegistryHelper,
)
from llama_stack.providers.utils.inference.openai_compat import (
    OpenAIChatCompletionUnsupportedMixin,
    OpenAICompletionUnsupportedMixin,
    get_sampling_options,
    process_chat_completion_response,
    process_chat_completion_stream_response,
@@ -49,7 +51,12 @@ from .config import CerebrasImplConfig
from .models import MODEL_ENTRIES


class CerebrasInferenceAdapter(ModelRegistryHelper, Inference):
class CerebrasInferenceAdapter(
    ModelRegistryHelper,
    Inference,
    OpenAIChatCompletionUnsupportedMixin,
    OpenAICompletionUnsupportedMixin,
):
    def __init__(self, config: CerebrasImplConfig) -> None:
        ModelRegistryHelper.__init__(
            self,
@@ -34,6 +34,8 @@ from llama_stack.providers.utils.inference.model_registry import (
    build_hf_repo_model_entry,
)
from llama_stack.providers.utils.inference.openai_compat import (
    OpenAIChatCompletionUnsupportedMixin,
    OpenAICompletionUnsupportedMixin,
    get_sampling_options,
    process_chat_completion_response,
    process_chat_completion_stream_response,
@@ -56,7 +58,12 @@ model_entries = [
]


class DatabricksInferenceAdapter(ModelRegistryHelper, Inference):
class DatabricksInferenceAdapter(
    ModelRegistryHelper,
    Inference,
    OpenAIChatCompletionUnsupportedMixin,
    OpenAICompletionUnsupportedMixin,
):
    def __init__(self, config: DatabricksImplConfig) -> None:
        ModelRegistryHelper.__init__(self, model_entries=model_entries)
        self.config = config
@@ -4,9 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import AsyncGenerator, List, Optional, Union
from typing import Any, AsyncGenerator, Dict, List, Optional, Union

from fireworks.client import Fireworks
from openai import AsyncOpenAI

from llama_stack.apis.common.content_types import (
    InterleavedContent,
@@ -31,6 +32,7 @@ from llama_stack.apis.inference import (
    ToolDefinition,
    ToolPromptFormat,
)
from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
from llama_stack.distribution.request_headers import NeedsRequestProviderData
from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.model_registry import (
@@ -39,6 +41,7 @@ from llama_stack.providers.utils.inference.model_registry import (
from llama_stack.providers.utils.inference.openai_compat import (
    convert_message_to_openai_dict,
    get_sampling_options,
    prepare_openai_completion_params,
    process_chat_completion_response,
    process_chat_completion_stream_response,
    process_completion_response,
@@ -81,10 +84,16 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
        )
        return provider_data.fireworks_api_key

    def _get_base_url(self) -> str:
        return "https://api.fireworks.ai/inference/v1"

    def _get_client(self) -> Fireworks:
        fireworks_api_key = self._get_api_key()
        return Fireworks(api_key=fireworks_api_key)

    def _get_openai_client(self) -> AsyncOpenAI:
        return AsyncOpenAI(base_url=self._get_base_url(), api_key=self._get_api_key())

    async def completion(
        self,
        model_id: str,
@@ -268,3 +277,101 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv

        embeddings = [data.embedding for data in response.data]
        return EmbeddingsResponse(embeddings=embeddings)

    async def openai_completion(
        self,
        model: str,
        prompt: Union[str, List[str], List[int], List[List[int]]],
        best_of: Optional[int] = None,
        echo: Optional[bool] = None,
        frequency_penalty: Optional[float] = None,
        logit_bias: Optional[Dict[str, float]] = None,
        logprobs: Optional[bool] = None,
        max_tokens: Optional[int] = None,
        n: Optional[int] = None,
        presence_penalty: Optional[float] = None,
        seed: Optional[int] = None,
        stop: Optional[Union[str, List[str]]] = None,
        stream: Optional[bool] = None,
        stream_options: Optional[Dict[str, Any]] = None,
        temperature: Optional[float] = None,
        top_p: Optional[float] = None,
        user: Optional[str] = None,
        guided_choice: Optional[List[str]] = None,
        prompt_logprobs: Optional[int] = None,
    ) -> OpenAICompletion:
        model_obj = await self.model_store.get_model(model)
        params = await prepare_openai_completion_params(
            model=model_obj.provider_resource_id,
            prompt=prompt,
            best_of=best_of,
            echo=echo,
            frequency_penalty=frequency_penalty,
            logit_bias=logit_bias,
            logprobs=logprobs,
            max_tokens=max_tokens,
            n=n,
            presence_penalty=presence_penalty,
            seed=seed,
            stop=stop,
            stream=stream,
            stream_options=stream_options,
            temperature=temperature,
            top_p=top_p,
            user=user,
        )
        return await self._get_openai_client().completions.create(**params)

    async def openai_chat_completion(
        self,
        model: str,
        messages: List[OpenAIMessageParam],
        frequency_penalty: Optional[float] = None,
        function_call: Optional[Union[str, Dict[str, Any]]] = None,
        functions: Optional[List[Dict[str, Any]]] = None,
        logit_bias: Optional[Dict[str, float]] = None,
        logprobs: Optional[bool] = None,
        max_completion_tokens: Optional[int] = None,
        max_tokens: Optional[int] = None,
        n: Optional[int] = None,
        parallel_tool_calls: Optional[bool] = None,
        presence_penalty: Optional[float] = None,
        response_format: Optional[Dict[str, str]] = None,
        seed: Optional[int] = None,
        stop: Optional[Union[str, List[str]]] = None,
        stream: Optional[bool] = None,
        stream_options: Optional[Dict[str, Any]] = None,
        temperature: Optional[float] = None,
        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
        tools: Optional[List[Dict[str, Any]]] = None,
        top_logprobs: Optional[int] = None,
        top_p: Optional[float] = None,
        user: Optional[str] = None,
    ) -> OpenAIChatCompletion:
        model_obj = await self.model_store.get_model(model)
        params = await prepare_openai_completion_params(
            model=model_obj.provider_resource_id,
            messages=messages,
            frequency_penalty=frequency_penalty,
            function_call=function_call,
            functions=functions,
            logit_bias=logit_bias,
            logprobs=logprobs,
            max_completion_tokens=max_completion_tokens,
            max_tokens=max_tokens,
            n=n,
            parallel_tool_calls=parallel_tool_calls,
            presence_penalty=presence_penalty,
            response_format=response_format,
            seed=seed,
            stop=stop,
            stream=stream,
            stream_options=stream_options,
            temperature=temperature,
            tool_choice=tool_choice,
            tools=tools,
            top_logprobs=top_logprobs,
            top_p=top_p,
            user=user,
        )
        return await self._get_openai_client().chat.completions.create(**params)
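`prepare_openai_completion_params` is imported here but its body is not part of this diff. Judging by the inline dict filtering in the Ollama adapter later in this commit and by the fact that every call site awaits it, its job is plausibly to drop unset (`None`) parameters before handing the kwargs to the OpenAI SDK. A hedged sketch of that idea, not the actual helper:

```python
from typing import Any, Dict


async def prepare_openai_completion_params(**kwargs) -> Dict[str, Any]:
    # Assumed behavior: strip parameters the caller left as None so the
    # downstream OpenAI client only receives explicitly provided arguments.
    return {k: v for k, v in kwargs.items() if v is not None}
```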
@@ -7,7 +7,7 @@
import logging
import warnings
from functools import lru_cache
from typing import AsyncIterator, List, Optional, Union
from typing import Any, AsyncIterator, Dict, List, Optional, Union

from openai import APIConnectionError, AsyncOpenAI, BadRequestError

@@ -35,6 +35,7 @@ from llama_stack.apis.inference import (
    ToolConfig,
    ToolDefinition,
)
from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
from llama_stack.models.llama.datatypes import ToolPromptFormat
from llama_stack.providers.utils.inference.model_registry import (
    ModelRegistryHelper,
@@ -42,6 +43,7 @@ from llama_stack.providers.utils.inference.model_registry import (
from llama_stack.providers.utils.inference.openai_compat import (
    convert_openai_chat_completion_choice,
    convert_openai_chat_completion_stream,
    prepare_openai_completion_params,
)
from llama_stack.providers.utils.inference.prompt_adapter import content_has_media

@@ -263,3 +265,111 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
        else:
            # we pass n=1 to get only one completion
            return convert_openai_chat_completion_choice(response.choices[0])

    async def openai_completion(
        self,
        model: str,
        prompt: Union[str, List[str], List[int], List[List[int]]],
        best_of: Optional[int] = None,
        echo: Optional[bool] = None,
        frequency_penalty: Optional[float] = None,
        logit_bias: Optional[Dict[str, float]] = None,
        logprobs: Optional[bool] = None,
        max_tokens: Optional[int] = None,
        n: Optional[int] = None,
        presence_penalty: Optional[float] = None,
        seed: Optional[int] = None,
        stop: Optional[Union[str, List[str]]] = None,
        stream: Optional[bool] = None,
        stream_options: Optional[Dict[str, Any]] = None,
        temperature: Optional[float] = None,
        top_p: Optional[float] = None,
        user: Optional[str] = None,
        guided_choice: Optional[List[str]] = None,
        prompt_logprobs: Optional[int] = None,
    ) -> OpenAICompletion:
        provider_model_id = self.get_provider_model_id(model)

        params = await prepare_openai_completion_params(
            model=provider_model_id,
            prompt=prompt,
            best_of=best_of,
            echo=echo,
            frequency_penalty=frequency_penalty,
            logit_bias=logit_bias,
            logprobs=logprobs,
            max_tokens=max_tokens,
            n=n,
            presence_penalty=presence_penalty,
            seed=seed,
            stop=stop,
            stream=stream,
            stream_options=stream_options,
            temperature=temperature,
            top_p=top_p,
            user=user,
        )

        try:
            return await self._get_client(provider_model_id).completions.create(**params)
        except APIConnectionError as e:
            raise ConnectionError(f"Failed to connect to NVIDIA NIM at {self._config.url}: {e}") from e

    async def openai_chat_completion(
        self,
        model: str,
        messages: List[OpenAIMessageParam],
        frequency_penalty: Optional[float] = None,
        function_call: Optional[Union[str, Dict[str, Any]]] = None,
        functions: Optional[List[Dict[str, Any]]] = None,
        logit_bias: Optional[Dict[str, float]] = None,
        logprobs: Optional[bool] = None,
        max_completion_tokens: Optional[int] = None,
        max_tokens: Optional[int] = None,
        n: Optional[int] = None,
        parallel_tool_calls: Optional[bool] = None,
        presence_penalty: Optional[float] = None,
        response_format: Optional[Dict[str, str]] = None,
        seed: Optional[int] = None,
        stop: Optional[Union[str, List[str]]] = None,
        stream: Optional[bool] = None,
        stream_options: Optional[Dict[str, Any]] = None,
        temperature: Optional[float] = None,
        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
        tools: Optional[List[Dict[str, Any]]] = None,
        top_logprobs: Optional[int] = None,
        top_p: Optional[float] = None,
        user: Optional[str] = None,
    ) -> OpenAIChatCompletion:
        provider_model_id = self.get_provider_model_id(model)

        params = await prepare_openai_completion_params(
            model=provider_model_id,
            messages=messages,
            frequency_penalty=frequency_penalty,
            function_call=function_call,
            functions=functions,
            logit_bias=logit_bias,
            logprobs=logprobs,
            max_completion_tokens=max_completion_tokens,
            max_tokens=max_tokens,
            n=n,
            parallel_tool_calls=parallel_tool_calls,
            presence_penalty=presence_penalty,
            response_format=response_format,
            seed=seed,
            stop=stop,
            stream=stream,
            stream_options=stream_options,
            temperature=temperature,
            tool_choice=tool_choice,
            tools=tools,
            top_logprobs=top_logprobs,
            top_p=top_p,
            user=user,
        )

        try:
            return await self._get_client(provider_model_id).chat.completions.create(**params)
        except APIConnectionError as e:
            raise ConnectionError(f"Failed to connect to NVIDIA NIM at {self._config.url}: {e}") from e
@@ -5,10 +5,11 @@
# the root directory of this source tree.


from typing import Any, AsyncGenerator, List, Optional, Union
from typing import Any, AsyncGenerator, Dict, List, Optional, Union

import httpx
from ollama import AsyncClient
from openai import AsyncOpenAI

from llama_stack.apis.common.content_types import (
    ImageContentItem,
@@ -38,6 +39,7 @@ from llama_stack.apis.inference import (
    ToolDefinition,
    ToolPromptFormat,
)
from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
from llama_stack.apis.models import Model, ModelType
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import ModelsProtocolPrivate
@@ -67,7 +69,10 @@ from .models import model_entries
logger = get_logger(name=__name__, category="inference")


class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
class OllamaInferenceAdapter(
    Inference,
    ModelsProtocolPrivate,
):
    def __init__(self, url: str) -> None:
        self.register_helper = ModelRegistryHelper(model_entries)
        self.url = url
@@ -76,6 +81,10 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
    def client(self) -> AsyncClient:
        return AsyncClient(host=self.url)

    @property
    def openai_client(self) -> AsyncOpenAI:
        return AsyncOpenAI(base_url=f"{self.url}/v1", api_key="ollama")

    async def initialize(self) -> None:
        logger.info(f"checking connectivity to Ollama at `{self.url}`...")
        try:
@@ -319,6 +328,115 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):

        return model

    async def openai_completion(
        self,
        model: str,
        prompt: Union[str, List[str], List[int], List[List[int]]],
        best_of: Optional[int] = None,
        echo: Optional[bool] = None,
        frequency_penalty: Optional[float] = None,
        logit_bias: Optional[Dict[str, float]] = None,
        logprobs: Optional[bool] = None,
        max_tokens: Optional[int] = None,
        n: Optional[int] = None,
        presence_penalty: Optional[float] = None,
        seed: Optional[int] = None,
        stop: Optional[Union[str, List[str]]] = None,
        stream: Optional[bool] = None,
        stream_options: Optional[Dict[str, Any]] = None,
        temperature: Optional[float] = None,
        top_p: Optional[float] = None,
        user: Optional[str] = None,
        guided_choice: Optional[List[str]] = None,
        prompt_logprobs: Optional[int] = None,
    ) -> OpenAICompletion:
        if not isinstance(prompt, str):
            raise ValueError("Ollama does not support non-string prompts for completion")

        model_obj = await self._get_model(model)
        params = {
            k: v
            for k, v in {
                "model": model_obj.provider_resource_id,
                "prompt": prompt,
                "best_of": best_of,
                "echo": echo,
                "frequency_penalty": frequency_penalty,
                "logit_bias": logit_bias,
                "logprobs": logprobs,
                "max_tokens": max_tokens,
                "n": n,
                "presence_penalty": presence_penalty,
                "seed": seed,
                "stop": stop,
                "stream": stream,
                "stream_options": stream_options,
                "temperature": temperature,
                "top_p": top_p,
                "user": user,
            }.items()
            if v is not None
        }
        return await self.openai_client.completions.create(**params)  # type: ignore

    async def openai_chat_completion(
        self,
        model: str,
        messages: List[OpenAIMessageParam],
        frequency_penalty: Optional[float] = None,
        function_call: Optional[Union[str, Dict[str, Any]]] = None,
        functions: Optional[List[Dict[str, Any]]] = None,
        logit_bias: Optional[Dict[str, float]] = None,
        logprobs: Optional[bool] = None,
        max_completion_tokens: Optional[int] = None,
        max_tokens: Optional[int] = None,
        n: Optional[int] = None,
        parallel_tool_calls: Optional[bool] = None,
        presence_penalty: Optional[float] = None,
        response_format: Optional[Dict[str, str]] = None,
        seed: Optional[int] = None,
        stop: Optional[Union[str, List[str]]] = None,
        stream: Optional[bool] = None,
        stream_options: Optional[Dict[str, Any]] = None,
        temperature: Optional[float] = None,
        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
        tools: Optional[List[Dict[str, Any]]] = None,
        top_logprobs: Optional[int] = None,
        top_p: Optional[float] = None,
        user: Optional[str] = None,
    ) -> OpenAIChatCompletion:
        model_obj = await self._get_model(model)
        params = {
            k: v
            for k, v in {
                "model": model_obj.provider_resource_id,
                "messages": messages,
                "frequency_penalty": frequency_penalty,
                "function_call": function_call,
                "functions": functions,
                "logit_bias": logit_bias,
                "logprobs": logprobs,
                "max_completion_tokens": max_completion_tokens,
                "max_tokens": max_tokens,
                "n": n,
                "parallel_tool_calls": parallel_tool_calls,
                "presence_penalty": presence_penalty,
                "response_format": response_format,
                "seed": seed,
                "stop": stop,
                "stream": stream,
                "stream_options": stream_options,
                "temperature": temperature,
                "tool_choice": tool_choice,
                "tools": tools,
                "top_logprobs": top_logprobs,
                "top_p": top_p,
                "user": user,
            }.items()
            if v is not None
        }
        return await self.openai_client.chat.completions.create(**params)  # type: ignore


async def convert_message_to_openai_dict_for_ollama(message: Message) -> List[dict]:
    async def _convert_content(content) -> dict:
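The older completions endpoint works the same way from the client side. A hedged sketch against an Ollama-backed stack (host and model tag are placeholders), keeping in mind that the adapter above only accepts string prompts:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")
completion = client.completions.create(
    model="llama3.2:3b-instruct-q8_0",
    prompt="The capital of France is",
    max_tokens=8,
    temperature=0.0,
)
print(completion.choices[0].text)
```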
@@ -4,7 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any, AsyncGenerator, Dict, List, Optional
from typing import Any, AsyncGenerator, Dict, List, Optional, Union

from llama_stack_client import AsyncLlamaStackClient

@@ -26,9 +26,11 @@ from llama_stack.apis.inference import (
    ToolDefinition,
    ToolPromptFormat,
)
from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
from llama_stack.apis.models import Model
from llama_stack.distribution.library_client import convert_pydantic_to_json_value, convert_to_pydantic
from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
from llama_stack.providers.utils.inference.openai_compat import prepare_openai_completion_params

from .config import PassthroughImplConfig

@@ -201,6 +203,112 @@ class PassthroughInferenceAdapter(Inference):
            task_type=task_type,
        )

    async def openai_completion(
        self,
        model: str,
        prompt: Union[str, List[str], List[int], List[List[int]]],
        best_of: Optional[int] = None,
        echo: Optional[bool] = None,
        frequency_penalty: Optional[float] = None,
        logit_bias: Optional[Dict[str, float]] = None,
        logprobs: Optional[bool] = None,
        max_tokens: Optional[int] = None,
        n: Optional[int] = None,
        presence_penalty: Optional[float] = None,
        seed: Optional[int] = None,
        stop: Optional[Union[str, List[str]]] = None,
        stream: Optional[bool] = None,
        stream_options: Optional[Dict[str, Any]] = None,
        temperature: Optional[float] = None,
        top_p: Optional[float] = None,
        user: Optional[str] = None,
        guided_choice: Optional[List[str]] = None,
        prompt_logprobs: Optional[int] = None,
    ) -> OpenAICompletion:
        client = self._get_client()
        model_obj = await self.model_store.get_model(model)

        params = await prepare_openai_completion_params(
            model=model_obj.provider_resource_id,
            prompt=prompt,
            best_of=best_of,
            echo=echo,
            frequency_penalty=frequency_penalty,
            logit_bias=logit_bias,
            logprobs=logprobs,
            max_tokens=max_tokens,
            n=n,
            presence_penalty=presence_penalty,
            seed=seed,
            stop=stop,
            stream=stream,
            stream_options=stream_options,
            temperature=temperature,
            top_p=top_p,
            user=user,
            guided_choice=guided_choice,
            prompt_logprobs=prompt_logprobs,
        )

        return await client.inference.openai_completion(**params)

    async def openai_chat_completion(
        self,
        model: str,
        messages: List[OpenAIMessageParam],
        frequency_penalty: Optional[float] = None,
        function_call: Optional[Union[str, Dict[str, Any]]] = None,
        functions: Optional[List[Dict[str, Any]]] = None,
        logit_bias: Optional[Dict[str, float]] = None,
        logprobs: Optional[bool] = None,
        max_completion_tokens: Optional[int] = None,
        max_tokens: Optional[int] = None,
        n: Optional[int] = None,
        parallel_tool_calls: Optional[bool] = None,
        presence_penalty: Optional[float] = None,
        response_format: Optional[Dict[str, str]] = None,
        seed: Optional[int] = None,
        stop: Optional[Union[str, List[str]]] = None,
        stream: Optional[bool] = None,
        stream_options: Optional[Dict[str, Any]] = None,
        temperature: Optional[float] = None,
        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
        tools: Optional[List[Dict[str, Any]]] = None,
        top_logprobs: Optional[int] = None,
        top_p: Optional[float] = None,
        user: Optional[str] = None,
    ) -> OpenAIChatCompletion:
        client = self._get_client()
        model_obj = await self.model_store.get_model(model)

        params = await prepare_openai_completion_params(
            model=model_obj.provider_resource_id,
            messages=messages,
            frequency_penalty=frequency_penalty,
            function_call=function_call,
            functions=functions,
            logit_bias=logit_bias,
            logprobs=logprobs,
            max_completion_tokens=max_completion_tokens,
            max_tokens=max_tokens,
            n=n,
            parallel_tool_calls=parallel_tool_calls,
            presence_penalty=presence_penalty,
            response_format=response_format,
            seed=seed,
            stop=stop,
            stream=stream,
            stream_options=stream_options,
            temperature=temperature,
            tool_choice=tool_choice,
            tools=tools,
            top_logprobs=top_logprobs,
            top_p=top_p,
            user=user,
        )

        return await client.inference.openai_chat_completion(**params)

    def cast_value_to_json_dict(self, request_params: Dict[str, Any]) -> Dict[str, Any]:
        json_params = {}
        for key, value in request_params.items():
@@ -12,6 +12,8 @@ from llama_stack.apis.inference import * # noqa: F403
# from llama_stack.providers.datatypes import ModelsProtocolPrivate
from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
from llama_stack.providers.utils.inference.openai_compat import (
    OpenAIChatCompletionUnsupportedMixin,
    OpenAICompletionUnsupportedMixin,
    get_sampling_options,
    process_chat_completion_response,
    process_chat_completion_stream_response,
@@ -38,7 +40,12 @@ RUNPOD_SUPPORTED_MODELS = {
}


class RunpodInferenceAdapter(ModelRegistryHelper, Inference):
class RunpodInferenceAdapter(
    ModelRegistryHelper,
    Inference,
    OpenAIChatCompletionUnsupportedMixin,
    OpenAICompletionUnsupportedMixin,
):
    def __init__(self, config: RunpodImplConfig) -> None:
        ModelRegistryHelper.__init__(self, stack_to_provider_models_map=RUNPOD_SUPPORTED_MODELS)
        self.config = config
@@ -42,6 +42,8 @@ from llama_stack.apis.inference import (
 )
 from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
 from llama_stack.providers.utils.inference.openai_compat import (
+    OpenAIChatCompletionUnsupportedMixin,
+    OpenAICompletionUnsupportedMixin,
     process_chat_completion_stream_response,
 )
 from llama_stack.providers.utils.inference.prompt_adapter import (
@@ -52,7 +54,12 @@ from .config import SambaNovaImplConfig
 from .models import MODEL_ENTRIES
 
 
-class SambaNovaInferenceAdapter(ModelRegistryHelper, Inference):
+class SambaNovaInferenceAdapter(
+    ModelRegistryHelper,
+    Inference,
+    OpenAIChatCompletionUnsupportedMixin,
+    OpenAICompletionUnsupportedMixin,
+):
     def __init__(self, config: SambaNovaImplConfig) -> None:
         ModelRegistryHelper.__init__(self, model_entries=MODEL_ENTRIES)
         self.config = config
@@ -40,8 +40,10 @@ from llama_stack.providers.utils.inference.model_registry import (
     build_hf_repo_model_entry,
 )
 from llama_stack.providers.utils.inference.openai_compat import (
+    OpenAIChatCompletionUnsupportedMixin,
     OpenAICompatCompletionChoice,
     OpenAICompatCompletionResponse,
+    OpenAICompletionUnsupportedMixin,
     get_sampling_options,
     process_chat_completion_response,
     process_chat_completion_stream_response,
@@ -69,7 +71,12 @@ def build_hf_repo_model_entries():
     ]
 
 
-class _HfAdapter(Inference, ModelsProtocolPrivate):
+class _HfAdapter(
+    Inference,
+    OpenAIChatCompletionUnsupportedMixin,
+    OpenAICompletionUnsupportedMixin,
+    ModelsProtocolPrivate,
+):
     client: AsyncInferenceClient
     max_tokens: int
     model_id: str
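The RunPod, SambaNova, and HF/TGI adapters above take the two new `Unsupported` mixins rather than real implementations, so OpenAI-style calls fail fast instead of being proxied. A small illustrative sketch of what a caller sees, using a toy class that is not part of this change:

```python
# Toy class for illustration only; the real adapters mix these bases into their
# existing inheritance as shown in the hunks above.
import asyncio

from llama_stack.providers.utils.inference.openai_compat import (
    OpenAIChatCompletionUnsupportedMixin,
)


class ToyAdapter(OpenAIChatCompletionUnsupportedMixin):
    pass


async def main():
    try:
        await ToyAdapter().openai_chat_completion(model="some-model", messages=[])
    except ValueError as err:
        # Raised by the mixin: "ToyAdapter doesn't support openai chat completion"
        print(err)


asyncio.run(main())
```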
@@ -4,8 +4,9 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import AsyncGenerator, List, Optional, Union
+from typing import Any, AsyncGenerator, Dict, List, Optional, Union
 
+from openai import AsyncOpenAI
 from together import AsyncTogether
 
 from llama_stack.apis.common.content_types import (
@@ -30,12 +31,14 @@ from llama_stack.apis.inference import (
     ToolDefinition,
     ToolPromptFormat,
 )
+from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
 from llama_stack.distribution.request_headers import NeedsRequestProviderData
 from llama_stack.log import get_logger
 from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
 from llama_stack.providers.utils.inference.openai_compat import (
     convert_message_to_openai_dict,
     get_sampling_options,
+    prepare_openai_completion_params,
     process_chat_completion_response,
     process_chat_completion_stream_response,
     process_completion_response,
@@ -60,6 +63,7 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
         ModelRegistryHelper.__init__(self, MODEL_ENTRIES)
         self.config = config
         self._client = None
+        self._openai_client = None
 
     async def initialize(self) -> None:
         pass
@@ -110,6 +114,15 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
             self._client = AsyncTogether(api_key=together_api_key)
         return self._client
 
+    def _get_openai_client(self) -> AsyncOpenAI:
+        if not self._openai_client:
+            together_client = self._get_client().client
+            self._openai_client = AsyncOpenAI(
+                base_url=together_client.base_url,
+                api_key=together_client.api_key,
+            )
+        return self._openai_client
+
     async def _nonstream_completion(self, request: CompletionRequest) -> ChatCompletionResponse:
         params = await self._get_params(request)
         client = self._get_client()
@@ -243,3 +256,101 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
         )
         embeddings = [item.embedding for item in r.data]
         return EmbeddingsResponse(embeddings=embeddings)
+
+    async def openai_completion(
+        self,
+        model: str,
+        prompt: Union[str, List[str], List[int], List[List[int]]],
+        best_of: Optional[int] = None,
+        echo: Optional[bool] = None,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+        guided_choice: Optional[List[str]] = None,
+        prompt_logprobs: Optional[int] = None,
+    ) -> OpenAICompletion:
+        model_obj = await self.model_store.get_model(model)
+        params = await prepare_openai_completion_params(
+            model=model_obj.provider_resource_id,
+            prompt=prompt,
+            best_of=best_of,
+            echo=echo,
+            frequency_penalty=frequency_penalty,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_tokens=max_tokens,
+            n=n,
+            presence_penalty=presence_penalty,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            top_p=top_p,
+            user=user,
+        )
+        return await self._get_openai_client().completions.create(**params)  # type: ignore
+
+    async def openai_chat_completion(
+        self,
+        model: str,
+        messages: List[OpenAIMessageParam],
+        frequency_penalty: Optional[float] = None,
+        function_call: Optional[Union[str, Dict[str, Any]]] = None,
+        functions: Optional[List[Dict[str, Any]]] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_completion_tokens: Optional[int] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        parallel_tool_calls: Optional[bool] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[Dict[str, str]] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+    ) -> OpenAIChatCompletion:
+        model_obj = await self.model_store.get_model(model)
+        params = await prepare_openai_completion_params(
+            model=model_obj.provider_resource_id,
+            messages=messages,
+            frequency_penalty=frequency_penalty,
+            function_call=function_call,
+            functions=functions,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_completion_tokens=max_completion_tokens,
+            max_tokens=max_tokens,
+            n=n,
+            parallel_tool_calls=parallel_tool_calls,
+            presence_penalty=presence_penalty,
+            response_format=response_format,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            tool_choice=tool_choice,
+            tools=tools,
+            top_logprobs=top_logprobs,
+            top_p=top_p,
+            user=user,
+        )
+        return await self._get_openai_client().chat.completions.create(**params)  # type: ignore
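The `_get_openai_client` helper above lazily builds a single `AsyncOpenAI` instance that reuses the Together SDK client's own base_url and api_key, so the OpenAI-path requests hit Together's OpenAI-compatible endpoint with the same credentials. A standalone sketch of that lazy-reuse pattern; the holder class and the placeholder base_url/api_key are illustrative, not part of the diff:

```python
# Illustrative only: the same "create once, reuse thereafter" shape used by
# _get_openai_client() above, with placeholder base_url/api_key values.
from typing import Optional

from openai import AsyncOpenAI


class LazyOpenAIClient:
    def __init__(self, base_url: str, api_key: str) -> None:
        self._base_url = base_url
        self._api_key = api_key
        self._openai_client: Optional[AsyncOpenAI] = None

    def get(self) -> AsyncOpenAI:
        # Build the client on first use, then hand back the cached instance.
        if not self._openai_client:
            self._openai_client = AsyncOpenAI(base_url=self._base_url, api_key=self._api_key)
        return self._openai_client


holder = LazyOpenAIClient(base_url="https://example.invalid/v1", api_key="sk-placeholder")
assert holder.get() is holder.get()
```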
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 import json
 import logging
-from typing import Any, AsyncGenerator, List, Optional, Union
+from typing import Any, AsyncGenerator, Dict, List, Optional, Union
 
 import httpx
 from openai import AsyncOpenAI
@@ -45,6 +45,7 @@ from llama_stack.apis.inference import (
     ToolDefinition,
     ToolPromptFormat,
 )
+from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
 from llama_stack.apis.models import Model, ModelType
 from llama_stack.models.llama.datatypes import BuiltinTool, StopReason, ToolCall
 from llama_stack.models.llama.sku_list import all_registered_models
@@ -58,6 +59,7 @@ from llama_stack.providers.utils.inference.openai_compat import (
     convert_message_to_openai_dict,
     convert_tool_call,
     get_sampling_options,
+    prepare_openai_completion_params,
     process_chat_completion_stream_response,
     process_completion_response,
     process_completion_stream_response,
@@ -418,3 +420,109 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
 
         embeddings = [data.embedding for data in response.data]
         return EmbeddingsResponse(embeddings=embeddings)
+
+    async def openai_completion(
+        self,
+        model: str,
+        prompt: Union[str, List[str], List[int], List[List[int]]],
+        best_of: Optional[int] = None,
+        echo: Optional[bool] = None,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+        guided_choice: Optional[List[str]] = None,
+        prompt_logprobs: Optional[int] = None,
+    ) -> OpenAICompletion:
+        model_obj = await self._get_model(model)
+
+        extra_body: Dict[str, Any] = {}
+        if prompt_logprobs is not None and prompt_logprobs >= 0:
+            extra_body["prompt_logprobs"] = prompt_logprobs
+        if guided_choice:
+            extra_body["guided_choice"] = guided_choice
+
+        params = await prepare_openai_completion_params(
+            model=model_obj.provider_resource_id,
+            prompt=prompt,
+            best_of=best_of,
+            echo=echo,
+            frequency_penalty=frequency_penalty,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_tokens=max_tokens,
+            n=n,
+            presence_penalty=presence_penalty,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            top_p=top_p,
+            user=user,
+            extra_body=extra_body,
+        )
+        return await self.client.completions.create(**params)  # type: ignore
+
+    async def openai_chat_completion(
+        self,
+        model: str,
+        messages: List[OpenAIMessageParam],
+        frequency_penalty: Optional[float] = None,
+        function_call: Optional[Union[str, Dict[str, Any]]] = None,
+        functions: Optional[List[Dict[str, Any]]] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_completion_tokens: Optional[int] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        parallel_tool_calls: Optional[bool] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[Dict[str, str]] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+    ) -> OpenAIChatCompletion:
+        model_obj = await self._get_model(model)
+        params = await prepare_openai_completion_params(
+            model=model_obj.provider_resource_id,
+            messages=messages,
+            frequency_penalty=frequency_penalty,
+            function_call=function_call,
+            functions=functions,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_completion_tokens=max_completion_tokens,
+            max_tokens=max_tokens,
+            n=n,
+            parallel_tool_calls=parallel_tool_calls,
+            presence_penalty=presence_penalty,
+            response_format=response_format,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            tool_choice=tool_choice,
+            tools=tools,
+            top_logprobs=top_logprobs,
+            top_p=top_p,
+            user=user,
+        )
+        return await self.client.chat.completions.create(**params)  # type: ignore
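The vLLM-only knobs above (`guided_choice`, `prompt_logprobs`) are not standard OpenAI parameters, so the adapter folds them into `extra_body` before calling the server. From a client's point of view they travel the same way, which is what the integration test added later in this diff exercises. A hedged end-to-end sketch, with the base_url and model id as placeholders for your own vLLM-backed stack:

```python
# Hedged sketch of passing vLLM-specific parameters through the OpenAI-compatible
# route via extra_body; base_url and model are placeholder values.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")
response = client.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    prompt="I am feeling really sad today.",
    stream=False,
    extra_body={"guided_choice": ["joy", "sadness"]},
)
print(response.choices[0].text)  # constrained to one of the guided choices
```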
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import AsyncGenerator, AsyncIterator, List, Optional, Union
+from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union
 
 import litellm
 
@@ -30,6 +30,7 @@ from llama_stack.apis.inference import (
     ToolDefinition,
     ToolPromptFormat,
 )
+from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
 from llama_stack.apis.models.models import Model
 from llama_stack.distribution.request_headers import NeedsRequestProviderData
 from llama_stack.log import get_logger
@@ -40,6 +41,7 @@ from llama_stack.providers.utils.inference.openai_compat import (
     convert_openai_chat_completion_stream,
     convert_tooldef_to_openai_tool,
     get_sampling_options,
+    prepare_openai_completion_params,
 )
 from llama_stack.providers.utils.inference.prompt_adapter import (
     interleaved_content_as_str,
@@ -245,3 +247,103 @@ class LiteLLMOpenAIMixin(
 
         embeddings = [data["embedding"] for data in response["data"]]
         return EmbeddingsResponse(embeddings=embeddings)
+
+    async def openai_completion(
+        self,
+        model: str,
+        prompt: Union[str, List[str], List[int], List[List[int]]],
+        best_of: Optional[int] = None,
+        echo: Optional[bool] = None,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+        guided_choice: Optional[List[str]] = None,
+        prompt_logprobs: Optional[int] = None,
+    ) -> OpenAICompletion:
+        model_obj = await self._get_model(model)
+        params = await prepare_openai_completion_params(
+            model=model_obj.provider_resource_id,
+            prompt=prompt,
+            best_of=best_of,
+            echo=echo,
+            frequency_penalty=frequency_penalty,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_tokens=max_tokens,
+            n=n,
+            presence_penalty=presence_penalty,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            top_p=top_p,
+            user=user,
+            guided_choice=guided_choice,
+            prompt_logprobs=prompt_logprobs,
+        )
+        return litellm.text_completion(**params)
+
+    async def openai_chat_completion(
+        self,
+        model: str,
+        messages: List[OpenAIMessageParam],
+        frequency_penalty: Optional[float] = None,
+        function_call: Optional[Union[str, Dict[str, Any]]] = None,
+        functions: Optional[List[Dict[str, Any]]] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_completion_tokens: Optional[int] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        parallel_tool_calls: Optional[bool] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[Dict[str, str]] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+    ) -> OpenAIChatCompletion:
+        model_obj = await self._get_model(model)
+        params = await prepare_openai_completion_params(
+            model=model_obj.provider_resource_id,
+            messages=messages,
+            frequency_penalty=frequency_penalty,
+            function_call=function_call,
+            functions=functions,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_completion_tokens=max_completion_tokens,
+            max_tokens=max_tokens,
+            n=n,
+            parallel_tool_calls=parallel_tool_calls,
+            presence_penalty=presence_penalty,
+            response_format=response_format,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            tool_choice=tool_choice,
+            tools=tools,
+            top_logprobs=top_logprobs,
+            top_p=top_p,
+            user=user,
+        )
+        return litellm.completion(**params)
@@ -5,8 +5,10 @@
 # the root directory of this source tree.
 import json
 import logging
+import time
+import uuid
 import warnings
-from typing import AsyncGenerator, Dict, Iterable, List, Optional, Union
+from typing import Any, AsyncGenerator, Dict, Iterable, List, Optional, Union
 
 from openai import AsyncStream
 from openai.types.chat import (
@@ -83,6 +85,7 @@ from llama_stack.apis.inference import (
     TopPSamplingStrategy,
     UserMessage,
 )
+from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAICompletionChoice
 from llama_stack.models.llama.datatypes import (
     BuiltinTool,
     StopReason,
@@ -843,6 +846,31 @@ def _convert_openai_logprobs(
     ]
 
 
+def _convert_openai_sampling_params(
+    max_tokens: Optional[int] = None,
+    temperature: Optional[float] = None,
+    top_p: Optional[float] = None,
+) -> SamplingParams:
+    sampling_params = SamplingParams()
+
+    if max_tokens:
+        sampling_params.max_tokens = max_tokens
+
+    # Map an explicit temperature of 0 to greedy sampling
+    if temperature == 0:
+        strategy = GreedySamplingStrategy()
+    else:
+        # OpenAI defaults to 1.0 for temperature and top_p if unset
+        if temperature is None:
+            temperature = 1.0
+        if top_p is None:
+            top_p = 1.0
+        strategy = TopPSamplingStrategy(temperature=temperature, top_p=top_p)
+
+    sampling_params.strategy = strategy
+    return sampling_params
+
+
 def convert_openai_chat_completion_choice(
     choice: OpenAIChoice,
 ) -> ChatCompletionResponse:
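The helper above gives OpenAI-style sampling a deterministic mapping onto Llama Stack's `SamplingParams`: an explicit `temperature=0` becomes greedy decoding, and anything else becomes top-p sampling with OpenAI's 1.0 defaults filled in. A hedged usage sketch, assuming the helper and the strategy classes are importable from the paths suggested by the surrounding imports:

```python
# Assumes _convert_openai_sampling_params and the strategy classes are importable
# as below; the exact module paths are an assumption based on imports shown in this diff.
from llama_stack.apis.inference import GreedySamplingStrategy, TopPSamplingStrategy
from llama_stack.providers.utils.inference.openai_compat import _convert_openai_sampling_params

params = _convert_openai_sampling_params(max_tokens=64, temperature=0)
assert isinstance(params.strategy, GreedySamplingStrategy)  # temperature == 0 -> greedy

params = _convert_openai_sampling_params(top_p=0.9)  # temperature unset defaults to 1.0
assert isinstance(params.strategy, TopPSamplingStrategy)
assert params.strategy.temperature == 1.0 and params.strategy.top_p == 0.9
```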
@@ -1049,3 +1077,106 @@ async def convert_openai_chat_completion_stream(
             stop_reason=stop_reason,
         )
     )
+
+
+async def prepare_openai_completion_params(**params):
+    completion_params = {k: v for k, v in params.items() if v is not None}
+    return completion_params
+
+
+class OpenAICompletionUnsupportedMixin:
+    async def openai_completion(
+        self,
+        model: str,
+        prompt: Union[str, List[str], List[int], List[List[int]]],
+        best_of: Optional[int] = None,
+        echo: Optional[bool] = None,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+        guided_choice: Optional[List[str]] = None,
+        prompt_logprobs: Optional[int] = None,
+    ) -> OpenAICompletion:
+        if stream:
+            raise ValueError(f"{self.__class__.__name__} doesn't support streaming openai completions")
+
+        # This is a pretty hacky way to do emulate completions -
+        # basically just de-batches them...
+        prompts = [prompt] if not isinstance(prompt, list) else prompt
+
+        sampling_params = _convert_openai_sampling_params(
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+        )
+
+        choices = []
+        # "n" is the number of completions to generate per prompt
+        for _i in range(0, n):
+            # and we may have multiple prompts, if batching was used
+
+            for prompt in prompts:
+                result = self.completion(
+                    model_id=model,
+                    content=prompt,
+                    sampling_params=sampling_params,
+                )
+
+                index = len(choices)
+                text = result.content
+                finish_reason = _convert_openai_finish_reason(result.stop_reason)
+
+                choice = OpenAICompletionChoice(
+                    index=index,
+                    text=text,
+                    finish_reason=finish_reason,
+                )
+                choices.append(choice)
+
+        return OpenAICompletion(
+            id=f"cmpl-{uuid.uuid4()}",
+            choices=choices,
+            created=int(time.time()),
+            model=model,
+            object="text_completion",
+        )
+
+
+class OpenAIChatCompletionUnsupportedMixin:
+    async def openai_chat_completion(
+        self,
+        model: str,
+        messages: List[OpenAIChatCompletionMessage],
+        frequency_penalty: Optional[float] = None,
+        function_call: Optional[Union[str, Dict[str, Any]]] = None,
+        functions: Optional[List[Dict[str, Any]]] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_completion_tokens: Optional[int] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        parallel_tool_calls: Optional[bool] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[Dict[str, str]] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+    ) -> OpenAIChatCompletion:
+        raise ValueError(f"{self.__class__.__name__} doesn't support openai chat completion")
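`prepare_openai_completion_params` is the small piece every adapter above leans on: it drops `None`-valued keyword arguments so that parameters the caller never set are simply not forwarded to the provider SDK. A quick self-contained illustration of that behavior; the helper body is reproduced inline so the snippet runs on its own, and the model id is just a placeholder:

```python
import asyncio


async def prepare_openai_completion_params(**params):
    # Same filtering as the helper added above: keep only parameters the caller set.
    return {k: v for k, v in params.items() if v is not None}


async def main():
    params = await prepare_openai_completion_params(
        model="llama3.2:3b-instruct-q8_0",
        temperature=0.7,
        max_tokens=None,  # unset -> dropped
        stream=None,      # unset -> dropped
    )
    print(params)  # {'model': 'llama3.2:3b-instruct-q8_0', 'temperature': 0.7}


asyncio.run(main())
```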
@@ -28,6 +28,7 @@ dependencies = [
     "jinja2>=3.1.6",
     "jsonschema",
     "llama-stack-client>=0.2.1",
+    "openai>=1.66",
     "prompt-toolkit",
     "python-dotenv",
     "pydantic>=2",
@@ -19,6 +19,7 @@ httpx==0.28.1
 huggingface-hub==0.29.0
 idna==3.10
 jinja2==3.1.6
+jiter==0.8.2
 jsonschema==4.23.0
 jsonschema-specifications==2024.10.1
 llama-stack-client==0.2.1
@@ -27,6 +28,7 @@ markdown-it-py==3.0.0
 markupsafe==3.0.2
 mdurl==0.1.2
 numpy==2.2.3
+openai==1.71.0
 packaging==24.2
 pandas==2.2.3
 pillow==11.1.0
216 tests/integration/inference/test_openai_completion.py (new file)
@@ -0,0 +1,216 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+
+import pytest
+from openai import OpenAI
+
+from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
+
+from ..test_cases.test_case import TestCase
+
+
+def provider_from_model(client_with_models, model_id):
+    models = {m.identifier: m for m in client_with_models.models.list()}
+    models.update({m.provider_resource_id: m for m in client_with_models.models.list()})
+    provider_id = models[model_id].provider_id
+    providers = {p.provider_id: p for p in client_with_models.providers.list()}
+    return providers[provider_id]
+
+
+def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id):
+    if isinstance(client_with_models, LlamaStackAsLibraryClient):
+        pytest.skip("OpenAI completions are not supported when testing with library client yet.")
+
+    provider = provider_from_model(client_with_models, model_id)
+    if provider.provider_type in (
+        "inline::meta-reference",
+        "inline::sentence-transformers",
+        "inline::vllm",
+        "remote::bedrock",
+        "remote::cerebras",
+        "remote::databricks",
+        # Technically Nvidia does support OpenAI completions, but none of their hosted models
+        # support both completions and chat completions endpoint and all the Llama models are
+        # just chat completions
+        "remote::nvidia",
+        "remote::runpod",
+        "remote::sambanova",
+        "remote::tgi",
+    ):
+        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.")
+
+
+def skip_if_model_doesnt_support_openai_chat_completion(client_with_models, model_id):
+    if isinstance(client_with_models, LlamaStackAsLibraryClient):
+        pytest.skip("OpenAI chat completions are not supported when testing with library client yet.")
+
+    provider = provider_from_model(client_with_models, model_id)
+    if provider.provider_type in (
+        "inline::meta-reference",
+        "inline::sentence-transformers",
+        "inline::vllm",
+        "remote::bedrock",
+        "remote::cerebras",
+        "remote::databricks",
+        "remote::runpod",
+        "remote::sambanova",
+        "remote::tgi",
+    ):
+        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI chat completions.")
+
+
+def skip_if_provider_isnt_vllm(client_with_models, model_id):
+    provider = provider_from_model(client_with_models, model_id)
+    if provider.provider_type != "remote::vllm":
+        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support vllm extra_body parameters.")
+
+
+@pytest.fixture
+def openai_client(client_with_models):
+    base_url = f"{client_with_models.base_url}/v1/openai/v1"
+    return OpenAI(base_url=base_url, api_key="bar")
+
+
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        "inference:completion:sanity",
+    ],
+)
+def test_openai_completion_non_streaming(openai_client, client_with_models, text_model_id, test_case):
+    skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
+    tc = TestCase(test_case)
+
+    # ollama needs more verbose prompting for some reason here...
+    prompt = "Respond to this question and explain your answer. " + tc["content"]
+    response = openai_client.completions.create(
+        model=text_model_id,
+        prompt=prompt,
+        stream=False,
+    )
+    assert len(response.choices) > 0
+    choice = response.choices[0]
+    assert len(choice.text) > 10
+
+
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        "inference:completion:sanity",
+    ],
+)
+def test_openai_completion_streaming(openai_client, client_with_models, text_model_id, test_case):
+    skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
+    tc = TestCase(test_case)
+
+    # ollama needs more verbose prompting for some reason here...
+    prompt = "Respond to this question and explain your answer. " + tc["content"]
+    response = openai_client.completions.create(
+        model=text_model_id,
+        prompt=prompt,
+        stream=True,
+        max_tokens=50,
+    )
+    streamed_content = [chunk.choices[0].text for chunk in response]
+    content_str = "".join(streamed_content).lower().strip()
+    assert len(content_str) > 10
+
+
+@pytest.mark.parametrize(
+    "prompt_logprobs",
+    [
+        1,
+        0,
+    ],
+)
+def test_openai_completion_prompt_logprobs(openai_client, client_with_models, text_model_id, prompt_logprobs):
+    skip_if_provider_isnt_vllm(client_with_models, text_model_id)
+
+    prompt = "Hello, world!"
+    response = openai_client.completions.create(
+        model=text_model_id,
+        prompt=prompt,
+        stream=False,
+        extra_body={
+            "prompt_logprobs": prompt_logprobs,
+        },
+    )
+    assert len(response.choices) > 0
+    choice = response.choices[0]
+    assert len(choice.prompt_logprobs) > 0
+
+
+def test_openai_completion_guided_choice(openai_client, client_with_models, text_model_id):
+    skip_if_provider_isnt_vllm(client_with_models, text_model_id)
+
+    prompt = "I am feeling really sad today."
+    response = openai_client.completions.create(
+        model=text_model_id,
+        prompt=prompt,
+        stream=False,
+        extra_body={
+            "guided_choice": ["joy", "sadness"],
+        },
+    )
+    assert len(response.choices) > 0
+    choice = response.choices[0]
+    assert choice.text in ["joy", "sadness"]
+
+
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        "inference:chat_completion:non_streaming_01",
+        "inference:chat_completion:non_streaming_02",
+    ],
+)
+def test_openai_chat_completion_non_streaming(openai_client, client_with_models, text_model_id, test_case):
+    skip_if_model_doesnt_support_openai_chat_completion(client_with_models, text_model_id)
+    tc = TestCase(test_case)
+    question = tc["question"]
+    expected = tc["expected"]
+
+    response = openai_client.chat.completions.create(
+        model=text_model_id,
+        messages=[
+            {
+                "role": "user",
+                "content": question,
+            }
+        ],
+        stream=False,
+    )
+    message_content = response.choices[0].message.content.lower().strip()
+    assert len(message_content) > 0
+    assert expected.lower() in message_content
+
+
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        "inference:chat_completion:streaming_01",
+        "inference:chat_completion:streaming_02",
+    ],
+)
+def test_openai_chat_completion_streaming(openai_client, client_with_models, text_model_id, test_case):
+    skip_if_model_doesnt_support_openai_chat_completion(client_with_models, text_model_id)
+    tc = TestCase(test_case)
+    question = tc["question"]
+    expected = tc["expected"]
+
+    response = openai_client.chat.completions.create(
+        model=text_model_id,
+        messages=[{"role": "user", "content": question}],
+        stream=True,
+        timeout=120,  # Increase timeout to 2 minutes for large conversation history
+    )
+    streamed_content = []
+    for chunk in response:
+        if chunk.choices[0].delta.content:
+            streamed_content.append(chunk.choices[0].delta.content.lower().strip())
+    assert len(streamed_content) > 0
+    assert expected.lower() in "".join(streamed_content)
8 uv.lock (generated)
@@ -1384,6 +1384,7 @@ dependencies = [
     { name = "jinja2" },
     { name = "jsonschema" },
    { name = "llama-stack-client" },
+    { name = "openai" },
     { name = "pillow" },
     { name = "prompt-toolkit" },
     { name = "pydantic" },
@@ -1485,6 +1486,7 @@ requires-dist = [
     { name = "mcp", marker = "extra == 'test'" },
     { name = "myst-parser", marker = "extra == 'docs'" },
     { name = "nbval", marker = "extra == 'dev'" },
+    { name = "openai", specifier = ">=1.66" },
     { name = "openai", marker = "extra == 'test'" },
     { name = "openai", marker = "extra == 'unit'" },
     { name = "opentelemetry-exporter-otlp-proto-http", marker = "extra == 'test'" },
@@ -2016,7 +2018,7 @@ wheels = [
 
 [[package]]
 name = "openai"
-version = "1.63.2"
+version = "1.71.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "anyio" },
@@ -2028,9 +2030,9 @@ dependencies = [
     { name = "tqdm" },
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/e6/1c/11b520deb71f9ea54ced3c52cd6a5f7131215deba63ad07f23982e328141/openai-1.63.2.tar.gz", hash = "sha256:aeabeec984a7d2957b4928ceaa339e2ead19c61cfcf35ae62b7c363368d26360", size = 356902 }
+sdist = { url = "https://files.pythonhosted.org/packages/d9/19/b8f0347090a649dce55a008ec54ac6abb50553a06508cdb5e7abb2813e99/openai-1.71.0.tar.gz", hash = "sha256:52b20bb990a1780f9b0b8ccebac93416343ebd3e4e714e3eff730336833ca207", size = 409926 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/15/64/db3462b358072387b8e93e6e6a38d3c741a17b4a84171ef01d6c85c63f25/openai-1.63.2-py3-none-any.whl", hash = "sha256:1f38b27b5a40814c2b7d8759ec78110df58c4a614c25f182809ca52b080ff4d4", size = 472282 },
+    { url = "https://files.pythonhosted.org/packages/c4/f7/049e85faf6a000890e5ca0edca8e9183f8a43c9e7bba869cad871da0caba/openai-1.71.0-py3-none-any.whl", hash = "sha256:e1c643738f1fff1af52bce6ef06a7716c95d089281e7011777179614f32937aa", size = 598975 },
 ]
 
 [[package]]