From 26f4f3fe142ad280105426046b59f2ebcbf15c32 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Thu, 9 Jan 2025 02:03:04 -0500 Subject: [PATCH] chore(apis): unpublish deprecated /v1/inference apis --- docs/_static/llama-stack-spec.html | 2319 ++++++----------- docs/_static/llama-stack-spec.yaml | 1810 ++++--------- llama_stack/apis/inference/inference.py | 5 - .../inference/test_batch_inference.py | 76 - tests/integration/inference/test_embedding.py | 303 --- .../inference/test_text_inference.py | 543 ---- 6 files changed, 1286 insertions(+), 3770 deletions(-) delete mode 100644 tests/integration/inference/test_batch_inference.py delete mode 100644 tests/integration/inference/test_embedding.py delete mode 100644 tests/integration/inference/test_text_inference.py diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 7cb2a73f3..879865dea 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -86,92 +86,6 @@ } } }, - "/v1/inference/batch-chat-completion": { - "post": { - "responses": { - "200": { - "description": "A BatchChatCompletionResponse with the full completions.", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/BatchChatCompletionResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Inference" - ], - "description": "Generate chat completions for a batch of messages using the specified model.", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/BatchChatCompletionRequest" - } - } - }, - "required": true - } - } - }, - "/v1/inference/batch-completion": { - "post": { - "responses": { - "200": { - "description": "A BatchCompletionResponse with the full completions.", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/BatchCompletionResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Inference" - ], - "description": "Generate completions for a batch of content using the specified model.", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/BatchCompletionRequest" - } - } - }, - "required": true - } - } - }, "/v1/post-training/job/cancel": { "post": { "responses": { @@ -208,102 +122,6 @@ } } }, - "/v1/inference/chat-completion": { - "post": { - "responses": { - "200": { - "description": "If stream=False, returns a ChatCompletionResponse with the full completion. 
If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk.", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ChatCompletionResponse" - } - }, - "text/event-stream": { - "schema": { - "$ref": "#/components/schemas/ChatCompletionResponseStreamChunk" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "BatchInference (Coming Soon)" - ], - "description": "Generate a chat completion for the given messages using the specified model.", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ChatCompletionRequest" - } - } - }, - "required": true - } - } - }, - "/v1/inference/completion": { - "post": { - "responses": { - "200": { - "description": "If stream=False, returns a CompletionResponse with the full completion. If stream=True, returns an SSE event stream of CompletionResponseStreamChunk.", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/CompletionResponse" - } - }, - "text/event-stream": { - "schema": { - "$ref": "#/components/schemas/CompletionResponseStreamChunk" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "BatchInference (Coming Soon)" - ], - "description": "Generate a completion for the given content using the specified model.", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/CompletionRequest" - } - } - }, - "required": true - } - } - }, "/v1/agents": { "get": { "responses": { @@ -901,49 +719,6 @@ ] } }, - "/v1/inference/embeddings": { - "post": { - "responses": { - "200": { - "description": "An array of embeddings, one for each content. Each embedding is a list of floats. 
The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}.", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EmbeddingsResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Inference" - ], - "description": "Generate embeddings for content pieces using the specified model.", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EmbeddingsRequest" - } - } - }, - "required": true - } - } - }, "/v1/eval/benchmarks/{benchmark_id}/evaluations": { "post": { "responses": { @@ -5198,962 +4973,6 @@ ], "title": "AppendRowsRequest" }, - "CompletionMessage": { - "type": "object", - "properties": { - "role": { - "type": "string", - "const": "assistant", - "default": "assistant", - "description": "Must be \"assistant\" to identify this as the model's response" - }, - "content": { - "$ref": "#/components/schemas/InterleavedContent", - "description": "The content of the model's response" - }, - "stop_reason": { - "type": "string", - "enum": [ - "end_of_turn", - "end_of_message", - "out_of_tokens" - ], - "description": "Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: The model finished generating the entire response. - `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response. - `StopReason.out_of_tokens`: The model ran out of token budget." - }, - "tool_calls": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ToolCall" - }, - "description": "List of tool calls. Each tool call is a ToolCall object." - } - }, - "additionalProperties": false, - "required": [ - "role", - "content", - "stop_reason" - ], - "title": "CompletionMessage", - "description": "A message containing the model's (assistant) response in a chat conversation." - }, - "GrammarResponseFormat": { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "json_schema", - "grammar" - ], - "description": "Must be \"grammar\" to identify this format type", - "const": "grammar", - "default": "grammar" - }, - "bnf": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - }, - "description": "The BNF grammar specification the response should conform to" - } - }, - "additionalProperties": false, - "required": [ - "type", - "bnf" - ], - "title": "GrammarResponseFormat", - "description": "Configuration for grammar-guided response generation." - }, - "GreedySamplingStrategy": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "greedy", - "default": "greedy", - "description": "Must be \"greedy\" to identify this sampling strategy" - } - }, - "additionalProperties": false, - "required": [ - "type" - ], - "title": "GreedySamplingStrategy", - "description": "Greedy sampling strategy that selects the highest probability token at each step." 
- }, - "ImageContentItem": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "image", - "default": "image", - "description": "Discriminator type of the content item. Always \"image\"" - }, - "image": { - "type": "object", - "properties": { - "url": { - "$ref": "#/components/schemas/URL", - "description": "A URL of the image or data URL in the format of data:image/{type};base64,{data}. Note that URL could have length limits." - }, - "data": { - "type": "string", - "contentEncoding": "base64", - "description": "base64 encoded image data as string" - } - }, - "additionalProperties": false, - "description": "Image as a base64 encoded string or an URL" - } - }, - "additionalProperties": false, - "required": [ - "type", - "image" - ], - "title": "ImageContentItem", - "description": "A image content item" - }, - "InterleavedContent": { - "oneOf": [ - { - "type": "string" - }, - { - "$ref": "#/components/schemas/InterleavedContentItem" - }, - { - "type": "array", - "items": { - "$ref": "#/components/schemas/InterleavedContentItem" - } - } - ] - }, - "InterleavedContentItem": { - "oneOf": [ - { - "$ref": "#/components/schemas/ImageContentItem" - }, - { - "$ref": "#/components/schemas/TextContentItem" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "image": "#/components/schemas/ImageContentItem", - "text": "#/components/schemas/TextContentItem" - } - } - }, - "JsonSchemaResponseFormat": { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "json_schema", - "grammar" - ], - "description": "Must be \"json_schema\" to identify this format type", - "const": "json_schema", - "default": "json_schema" - }, - "json_schema": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - }, - "description": "The JSON schema the response should conform to. In a Python SDK, this is often a `pydantic` model." - } - }, - "additionalProperties": false, - "required": [ - "type", - "json_schema" - ], - "title": "JsonSchemaResponseFormat", - "description": "Configuration for JSON schema-guided response generation." - }, - "Message": { - "oneOf": [ - { - "$ref": "#/components/schemas/UserMessage" - }, - { - "$ref": "#/components/schemas/SystemMessage" - }, - { - "$ref": "#/components/schemas/ToolResponseMessage" - }, - { - "$ref": "#/components/schemas/CompletionMessage" - } - ], - "discriminator": { - "propertyName": "role", - "mapping": { - "user": "#/components/schemas/UserMessage", - "system": "#/components/schemas/SystemMessage", - "tool": "#/components/schemas/ToolResponseMessage", - "assistant": "#/components/schemas/CompletionMessage" - } - } - }, - "ResponseFormat": { - "oneOf": [ - { - "$ref": "#/components/schemas/JsonSchemaResponseFormat" - }, - { - "$ref": "#/components/schemas/GrammarResponseFormat" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "json_schema": "#/components/schemas/JsonSchemaResponseFormat", - "grammar": "#/components/schemas/GrammarResponseFormat" - } - } - }, - "SamplingParams": { - "type": "object", - "properties": { - "strategy": { - "$ref": "#/components/schemas/SamplingStrategy", - "description": "The sampling strategy." - }, - "max_tokens": { - "type": "integer", - "default": 0, - "description": "The maximum number of tokens that can be generated in the completion. 
The token count of your prompt plus max_tokens cannot exceed the model's context length." - }, - "repetition_penalty": { - "type": "number", - "default": 1.0, - "description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics." - }, - "stop": { - "type": "array", - "items": { - "type": "string" - }, - "description": "Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence." - } - }, - "additionalProperties": false, - "required": [ - "strategy" - ], - "title": "SamplingParams", - "description": "Sampling parameters." - }, - "SamplingStrategy": { - "oneOf": [ - { - "$ref": "#/components/schemas/GreedySamplingStrategy" - }, - { - "$ref": "#/components/schemas/TopPSamplingStrategy" - }, - { - "$ref": "#/components/schemas/TopKSamplingStrategy" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "greedy": "#/components/schemas/GreedySamplingStrategy", - "top_p": "#/components/schemas/TopPSamplingStrategy", - "top_k": "#/components/schemas/TopKSamplingStrategy" - } - } - }, - "SystemMessage": { - "type": "object", - "properties": { - "role": { - "type": "string", - "const": "system", - "default": "system", - "description": "Must be \"system\" to identify this as a system message" - }, - "content": { - "$ref": "#/components/schemas/InterleavedContent", - "description": "The content of the \"system prompt\". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions)." - } - }, - "additionalProperties": false, - "required": [ - "role", - "content" - ], - "title": "SystemMessage", - "description": "A system message providing instructions or context to the model." - }, - "TextContentItem": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "text", - "default": "text", - "description": "Discriminator type of the content item. 
Always \"text\"" - }, - "text": { - "type": "string", - "description": "Text content" - } - }, - "additionalProperties": false, - "required": [ - "type", - "text" - ], - "title": "TextContentItem", - "description": "A text content item" - }, - "ToolCall": { - "type": "object", - "properties": { - "call_id": { - "type": "string" - }, - "tool_name": { - "oneOf": [ - { - "type": "string", - "enum": [ - "brave_search", - "wolfram_alpha", - "photogen", - "code_interpreter" - ], - "title": "BuiltinTool" - }, - { - "type": "string" - } - ] - }, - "arguments": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "integer" - }, - { - "type": "number" - }, - { - "type": "boolean" - }, - { - "type": "null" - }, - { - "type": "array", - "items": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "integer" - }, - { - "type": "number" - }, - { - "type": "boolean" - }, - { - "type": "null" - } - ] - } - }, - { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "integer" - }, - { - "type": "number" - }, - { - "type": "boolean" - }, - { - "type": "null" - } - ] - } - } - ] - } - } - ] - }, - "arguments_json": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "call_id", - "tool_name", - "arguments" - ], - "title": "ToolCall" - }, - "ToolConfig": { - "type": "object", - "properties": { - "tool_choice": { - "oneOf": [ - { - "type": "string", - "enum": [ - "auto", - "required", - "none" - ], - "title": "ToolChoice", - "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model." - }, - { - "type": "string" - } - ], - "default": "auto", - "description": "(Optional) Whether tool use is automatic, required, or none. Can also specify a tool name to use a specific tool. Defaults to ToolChoice.auto." - }, - "tool_prompt_format": { - "type": "string", - "enum": [ - "json", - "function_tag", - "python_list" - ], - "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls." - }, - "system_message_behavior": { - "type": "string", - "enum": [ - "append", - "replace" - ], - "description": "(Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. The system message can include the string '{{function_definitions}}' to indicate where the function definitions should be inserted.", - "default": "append" - } - }, - "additionalProperties": false, - "title": "ToolConfig", - "description": "Configuration for tool use." 
- }, - "ToolDefinition": { - "type": "object", - "properties": { - "tool_name": { - "oneOf": [ - { - "type": "string", - "enum": [ - "brave_search", - "wolfram_alpha", - "photogen", - "code_interpreter" - ], - "title": "BuiltinTool" - }, - { - "type": "string" - } - ] - }, - "description": { - "type": "string" - }, - "parameters": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ToolParamDefinition" - } - } - }, - "additionalProperties": false, - "required": [ - "tool_name" - ], - "title": "ToolDefinition" - }, - "ToolParamDefinition": { - "type": "object", - "properties": { - "param_type": { - "type": "string" - }, - "description": { - "type": "string" - }, - "required": { - "type": "boolean", - "default": true - }, - "default": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - }, - "additionalProperties": false, - "required": [ - "param_type" - ], - "title": "ToolParamDefinition" - }, - "ToolResponseMessage": { - "type": "object", - "properties": { - "role": { - "type": "string", - "const": "tool", - "default": "tool", - "description": "Must be \"tool\" to identify this as a tool response" - }, - "call_id": { - "type": "string", - "description": "Unique identifier for the tool call this response is for" - }, - "content": { - "$ref": "#/components/schemas/InterleavedContent", - "description": "The response content from the tool" - } - }, - "additionalProperties": false, - "required": [ - "role", - "call_id", - "content" - ], - "title": "ToolResponseMessage", - "description": "A message representing the result of a tool invocation." - }, - "TopKSamplingStrategy": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "top_k", - "default": "top_k", - "description": "Must be \"top_k\" to identify this sampling strategy" - }, - "top_k": { - "type": "integer", - "description": "Number of top tokens to consider for sampling. Must be at least 1" - } - }, - "additionalProperties": false, - "required": [ - "type", - "top_k" - ], - "title": "TopKSamplingStrategy", - "description": "Top-k sampling strategy that restricts sampling to the k most likely tokens." - }, - "TopPSamplingStrategy": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "top_p", - "default": "top_p", - "description": "Must be \"top_p\" to identify this sampling strategy" - }, - "temperature": { - "type": "number", - "description": "Controls randomness in sampling. Higher values increase randomness" - }, - "top_p": { - "type": "number", - "default": 0.95, - "description": "Cumulative probability threshold for nucleus sampling. Defaults to 0.95" - } - }, - "additionalProperties": false, - "required": [ - "type" - ], - "title": "TopPSamplingStrategy", - "description": "Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p." - }, - "URL": { - "type": "object", - "properties": { - "uri": { - "type": "string", - "description": "The URL string pointing to the resource" - } - }, - "additionalProperties": false, - "required": [ - "uri" - ], - "title": "URL", - "description": "A URL reference to external content." 
- }, - "UserMessage": { - "type": "object", - "properties": { - "role": { - "type": "string", - "const": "user", - "default": "user", - "description": "Must be \"user\" to identify this as a user message" - }, - "content": { - "$ref": "#/components/schemas/InterleavedContent", - "description": "The content of the message, which can include text and other media" - }, - "context": { - "$ref": "#/components/schemas/InterleavedContent", - "description": "(Optional) This field is used internally by Llama Stack to pass RAG context. This field may be removed in the API in the future." - } - }, - "additionalProperties": false, - "required": [ - "role", - "content" - ], - "title": "UserMessage", - "description": "A message from the user in a chat conversation." - }, - "BatchChatCompletionRequest": { - "type": "object", - "properties": { - "model_id": { - "type": "string", - "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint." - }, - "messages_batch": { - "type": "array", - "items": { - "type": "array", - "items": { - "$ref": "#/components/schemas/Message" - } - }, - "description": "The messages to generate completions for." - }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams", - "description": "(Optional) Parameters to control the sampling strategy." - }, - "tools": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ToolDefinition" - }, - "description": "(Optional) List of tool definitions available to the model." - }, - "tool_config": { - "$ref": "#/components/schemas/ToolConfig", - "description": "(Optional) Configuration for tool use." - }, - "response_format": { - "$ref": "#/components/schemas/ResponseFormat", - "description": "(Optional) Grammar specification for guided (structured) decoding." - }, - "logprobs": { - "type": "object", - "properties": { - "top_k": { - "type": "integer", - "default": 0, - "description": "How many tokens (for each position) to return log probabilities for." - } - }, - "additionalProperties": false, - "description": "(Optional) If specified, log probabilities for each token position will be returned." - } - }, - "additionalProperties": false, - "required": [ - "model_id", - "messages_batch" - ], - "title": "BatchChatCompletionRequest" - }, - "BatchChatCompletionResponse": { - "type": "object", - "properties": { - "batch": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ChatCompletionResponse" - }, - "description": "List of chat completion responses, one for each conversation in the batch" - } - }, - "additionalProperties": false, - "required": [ - "batch" - ], - "title": "BatchChatCompletionResponse", - "description": "Response from a batch chat completion request." - }, - "ChatCompletionResponse": { - "type": "object", - "properties": { - "metrics": { - "type": "array", - "items": { - "$ref": "#/components/schemas/MetricInResponse" - }, - "description": "(Optional) List of metrics associated with the API response" - }, - "completion_message": { - "$ref": "#/components/schemas/CompletionMessage", - "description": "The complete response message" - }, - "logprobs": { - "type": "array", - "items": { - "$ref": "#/components/schemas/TokenLogProbs" - }, - "description": "Optional log probabilities for generated tokens" - } - }, - "additionalProperties": false, - "required": [ - "completion_message" - ], - "title": "ChatCompletionResponse", - "description": "Response from a chat completion request." 
- }, - "MetricInResponse": { - "type": "object", - "properties": { - "metric": { - "type": "string", - "description": "The name of the metric" - }, - "value": { - "oneOf": [ - { - "type": "integer" - }, - { - "type": "number" - } - ], - "description": "The numeric value of the metric" - }, - "unit": { - "type": "string", - "description": "(Optional) The unit of measurement for the metric value" - } - }, - "additionalProperties": false, - "required": [ - "metric", - "value" - ], - "title": "MetricInResponse", - "description": "A metric value included in API responses." - }, - "TokenLogProbs": { - "type": "object", - "properties": { - "logprobs_by_token": { - "type": "object", - "additionalProperties": { - "type": "number" - }, - "description": "Dictionary mapping tokens to their log probabilities" - } - }, - "additionalProperties": false, - "required": [ - "logprobs_by_token" - ], - "title": "TokenLogProbs", - "description": "Log probabilities for generated tokens." - }, - "BatchCompletionRequest": { - "type": "object", - "properties": { - "model_id": { - "type": "string", - "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint." - }, - "content_batch": { - "type": "array", - "items": { - "$ref": "#/components/schemas/InterleavedContent" - }, - "description": "The content to generate completions for." - }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams", - "description": "(Optional) Parameters to control the sampling strategy." - }, - "response_format": { - "$ref": "#/components/schemas/ResponseFormat", - "description": "(Optional) Grammar specification for guided (structured) decoding." - }, - "logprobs": { - "type": "object", - "properties": { - "top_k": { - "type": "integer", - "default": 0, - "description": "How many tokens (for each position) to return log probabilities for." - } - }, - "additionalProperties": false, - "description": "(Optional) If specified, log probabilities for each token position will be returned." - } - }, - "additionalProperties": false, - "required": [ - "model_id", - "content_batch" - ], - "title": "BatchCompletionRequest" - }, - "BatchCompletionResponse": { - "type": "object", - "properties": { - "batch": { - "type": "array", - "items": { - "$ref": "#/components/schemas/CompletionResponse" - }, - "description": "List of completion responses, one for each input in the batch" - } - }, - "additionalProperties": false, - "required": [ - "batch" - ], - "title": "BatchCompletionResponse", - "description": "Response from a batch completion request." - }, - "CompletionResponse": { - "type": "object", - "properties": { - "metrics": { - "type": "array", - "items": { - "$ref": "#/components/schemas/MetricInResponse" - }, - "description": "(Optional) List of metrics associated with the API response" - }, - "content": { - "type": "string", - "description": "The generated completion text" - }, - "stop_reason": { - "type": "string", - "enum": [ - "end_of_turn", - "end_of_message", - "out_of_tokens" - ], - "description": "Reason why generation stopped" - }, - "logprobs": { - "type": "array", - "items": { - "$ref": "#/components/schemas/TokenLogProbs" - }, - "description": "Optional log probabilities for generated tokens" - } - }, - "additionalProperties": false, - "required": [ - "content", - "stop_reason" - ], - "title": "CompletionResponse", - "description": "Response from a completion request." 
- }, "CancelTrainingJobRequest": { "type": "object", "properties": { @@ -6168,331 +4987,6 @@ ], "title": "CancelTrainingJobRequest" }, - "ChatCompletionRequest": { - "type": "object", - "properties": { - "model_id": { - "type": "string", - "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint." - }, - "messages": { - "type": "array", - "items": { - "$ref": "#/components/schemas/Message" - }, - "description": "List of messages in the conversation." - }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams", - "description": "Parameters to control the sampling strategy." - }, - "tools": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ToolDefinition" - }, - "description": "(Optional) List of tool definitions available to the model." - }, - "tool_choice": { - "type": "string", - "enum": [ - "auto", - "required", - "none" - ], - "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. .. deprecated:: Use tool_config instead." - }, - "tool_prompt_format": { - "type": "string", - "enum": [ - "json", - "function_tag", - "python_list" - ], - "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls. .. deprecated:: Use tool_config instead." - }, - "response_format": { - "$ref": "#/components/schemas/ResponseFormat", - "description": "(Optional) Grammar specification for guided (structured) decoding. There are two options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it." - }, - "stream": { - "type": "boolean", - "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False." - }, - "logprobs": { - "type": "object", - "properties": { - "top_k": { - "type": "integer", - "default": 0, - "description": "How many tokens (for each position) to return log probabilities for." - } - }, - "additionalProperties": false, - "description": "(Optional) If specified, log probabilities for each token position will be returned." - }, - "tool_config": { - "$ref": "#/components/schemas/ToolConfig", - "description": "(Optional) Configuration for tool use." - } - }, - "additionalProperties": false, - "required": [ - "model_id", - "messages" - ], - "title": "ChatCompletionRequest" - }, - "ChatCompletionResponseEvent": { - "type": "object", - "properties": { - "event_type": { - "type": "string", - "enum": [ - "start", - "complete", - "progress" - ], - "description": "Type of the event" - }, - "delta": { - "$ref": "#/components/schemas/ContentDelta", - "description": "Content generated since last event. This can be one or more tokens, or a tool call." 
- }, - "logprobs": { - "type": "array", - "items": { - "$ref": "#/components/schemas/TokenLogProbs" - }, - "description": "Optional log probabilities for generated tokens" - }, - "stop_reason": { - "type": "string", - "enum": [ - "end_of_turn", - "end_of_message", - "out_of_tokens" - ], - "description": "Optional reason why generation stopped, if complete" - } - }, - "additionalProperties": false, - "required": [ - "event_type", - "delta" - ], - "title": "ChatCompletionResponseEvent", - "description": "An event during chat completion generation." - }, - "ChatCompletionResponseStreamChunk": { - "type": "object", - "properties": { - "metrics": { - "type": "array", - "items": { - "$ref": "#/components/schemas/MetricInResponse" - }, - "description": "(Optional) List of metrics associated with the API response" - }, - "event": { - "$ref": "#/components/schemas/ChatCompletionResponseEvent", - "description": "The event containing the new content" - } - }, - "additionalProperties": false, - "required": [ - "event" - ], - "title": "ChatCompletionResponseStreamChunk", - "description": "A chunk of a streamed chat completion response." - }, - "ContentDelta": { - "oneOf": [ - { - "$ref": "#/components/schemas/TextDelta" - }, - { - "$ref": "#/components/schemas/ImageDelta" - }, - { - "$ref": "#/components/schemas/ToolCallDelta" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "text": "#/components/schemas/TextDelta", - "image": "#/components/schemas/ImageDelta", - "tool_call": "#/components/schemas/ToolCallDelta" - } - } - }, - "ImageDelta": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "image", - "default": "image", - "description": "Discriminator type of the delta. Always \"image\"" - }, - "image": { - "type": "string", - "contentEncoding": "base64", - "description": "The incremental image data as bytes" - } - }, - "additionalProperties": false, - "required": [ - "type", - "image" - ], - "title": "ImageDelta", - "description": "An image content delta for streaming responses." - }, - "TextDelta": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "text", - "default": "text", - "description": "Discriminator type of the delta. Always \"text\"" - }, - "text": { - "type": "string", - "description": "The incremental text content" - } - }, - "additionalProperties": false, - "required": [ - "type", - "text" - ], - "title": "TextDelta", - "description": "A text content delta for streaming responses." - }, - "ToolCallDelta": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "tool_call", - "default": "tool_call", - "description": "Discriminator type of the delta. Always \"tool_call\"" - }, - "tool_call": { - "oneOf": [ - { - "type": "string" - }, - { - "$ref": "#/components/schemas/ToolCall" - } - ], - "description": "Either an in-progress tool call string or the final parsed tool call" - }, - "parse_status": { - "type": "string", - "enum": [ - "started", - "in_progress", - "failed", - "succeeded" - ], - "description": "Current parsing status of the tool call" - } - }, - "additionalProperties": false, - "required": [ - "type", - "tool_call", - "parse_status" - ], - "title": "ToolCallDelta", - "description": "A tool call content delta for streaming responses." - }, - "CompletionRequest": { - "type": "object", - "properties": { - "model_id": { - "type": "string", - "description": "The identifier of the model to use. 
The model must be registered with Llama Stack and available via the /models endpoint." - }, - "content": { - "$ref": "#/components/schemas/InterleavedContent", - "description": "The content to generate a completion for." - }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams", - "description": "(Optional) Parameters to control the sampling strategy." - }, - "response_format": { - "$ref": "#/components/schemas/ResponseFormat", - "description": "(Optional) Grammar specification for guided (structured) decoding." - }, - "stream": { - "type": "boolean", - "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False." - }, - "logprobs": { - "type": "object", - "properties": { - "top_k": { - "type": "integer", - "default": 0, - "description": "How many tokens (for each position) to return log probabilities for." - } - }, - "additionalProperties": false, - "description": "(Optional) If specified, log probabilities for each token position will be returned." - } - }, - "additionalProperties": false, - "required": [ - "model_id", - "content" - ], - "title": "CompletionRequest" - }, - "CompletionResponseStreamChunk": { - "type": "object", - "properties": { - "metrics": { - "type": "array", - "items": { - "$ref": "#/components/schemas/MetricInResponse" - }, - "description": "(Optional) List of metrics associated with the API response" - }, - "delta": { - "type": "string", - "description": "New content generated since last chunk. This can be one or more tokens." - }, - "stop_reason": { - "type": "string", - "enum": [ - "end_of_turn", - "end_of_message", - "out_of_tokens" - ], - "description": "Optional reason why generation stopped, if complete" - }, - "logprobs": { - "type": "array", - "items": { - "$ref": "#/components/schemas/TokenLogProbs" - }, - "description": "Optional log probabilities for generated tokens" - } - }, - "additionalProperties": false, - "required": [ - "delta" - ], - "title": "CompletionResponseStreamChunk", - "description": "A chunk of a streamed completion response." - }, "AgentConfig": { "type": "object", "properties": { @@ -6628,6 +5122,234 @@ } ] }, + "GrammarResponseFormat": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "json_schema", + "grammar" + ], + "description": "Must be \"grammar\" to identify this format type", + "const": "grammar", + "default": "grammar" + }, + "bnf": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + }, + "description": "The BNF grammar specification the response should conform to" + } + }, + "additionalProperties": false, + "required": [ + "type", + "bnf" + ], + "title": "GrammarResponseFormat", + "description": "Configuration for grammar-guided response generation." + }, + "GreedySamplingStrategy": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "greedy", + "default": "greedy", + "description": "Must be \"greedy\" to identify this sampling strategy" + } + }, + "additionalProperties": false, + "required": [ + "type" + ], + "title": "GreedySamplingStrategy", + "description": "Greedy sampling strategy that selects the highest probability token at each step." 
+ }, + "JsonSchemaResponseFormat": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "json_schema", + "grammar" + ], + "description": "Must be \"json_schema\" to identify this format type", + "const": "json_schema", + "default": "json_schema" + }, + "json_schema": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + }, + "description": "The JSON schema the response should conform to. In a Python SDK, this is often a `pydantic` model." + } + }, + "additionalProperties": false, + "required": [ + "type", + "json_schema" + ], + "title": "JsonSchemaResponseFormat", + "description": "Configuration for JSON schema-guided response generation." + }, + "ResponseFormat": { + "oneOf": [ + { + "$ref": "#/components/schemas/JsonSchemaResponseFormat" + }, + { + "$ref": "#/components/schemas/GrammarResponseFormat" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "json_schema": "#/components/schemas/JsonSchemaResponseFormat", + "grammar": "#/components/schemas/GrammarResponseFormat" + } + } + }, + "SamplingParams": { + "type": "object", + "properties": { + "strategy": { + "$ref": "#/components/schemas/SamplingStrategy", + "description": "The sampling strategy." + }, + "max_tokens": { + "type": "integer", + "default": 0, + "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length." + }, + "repetition_penalty": { + "type": "number", + "default": 1.0, + "description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics." + }, + "stop": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence." + } + }, + "additionalProperties": false, + "required": [ + "strategy" + ], + "title": "SamplingParams", + "description": "Sampling parameters." + }, + "SamplingStrategy": { + "oneOf": [ + { + "$ref": "#/components/schemas/GreedySamplingStrategy" + }, + { + "$ref": "#/components/schemas/TopPSamplingStrategy" + }, + { + "$ref": "#/components/schemas/TopKSamplingStrategy" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "greedy": "#/components/schemas/GreedySamplingStrategy", + "top_p": "#/components/schemas/TopPSamplingStrategy", + "top_k": "#/components/schemas/TopKSamplingStrategy" + } + } + }, + "ToolConfig": { + "type": "object", + "properties": { + "tool_choice": { + "oneOf": [ + { + "type": "string", + "enum": [ + "auto", + "required", + "none" + ], + "title": "ToolChoice", + "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model." + }, + { + "type": "string" + } + ], + "default": "auto", + "description": "(Optional) Whether tool use is automatic, required, or none. Can also specify a tool name to use a specific tool. Defaults to ToolChoice.auto." + }, + "tool_prompt_format": { + "type": "string", + "enum": [ + "json", + "function_tag", + "python_list" + ], + "description": "(Optional) Instructs the model how to format tool calls. 
By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls." + }, + "system_message_behavior": { + "type": "string", + "enum": [ + "append", + "replace" + ], + "description": "(Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. The system message can include the string '{{function_definitions}}' to indicate where the function definitions should be inserted.", + "default": "append" + } + }, + "additionalProperties": false, + "title": "ToolConfig", + "description": "Configuration for tool use." + }, "ToolDef": { "type": "object", "properties": { @@ -6734,6 +5456,54 @@ "title": "ToolParameter", "description": "Parameter definition for a tool." }, + "TopKSamplingStrategy": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "top_k", + "default": "top_k", + "description": "Must be \"top_k\" to identify this sampling strategy" + }, + "top_k": { + "type": "integer", + "description": "Number of top tokens to consider for sampling. Must be at least 1" + } + }, + "additionalProperties": false, + "required": [ + "type", + "top_k" + ], + "title": "TopKSamplingStrategy", + "description": "Top-k sampling strategy that restricts sampling to the k most likely tokens." + }, + "TopPSamplingStrategy": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "top_p", + "default": "top_p", + "description": "Must be \"top_p\" to identify this sampling strategy" + }, + "temperature": { + "type": "number", + "description": "Controls randomness in sampling. Higher values increase randomness" + }, + "top_p": { + "type": "number", + "default": 0.95, + "description": "Cumulative probability threshold for nucleus sampling. Defaults to 0.95" + } + }, + "additionalProperties": false, + "required": [ + "type" + ], + "title": "TopPSamplingStrategy", + "description": "Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p." + }, "CreateAgentRequest": { "type": "object", "properties": { @@ -6792,6 +5562,163 @@ "title": "AgentSessionCreateResponse", "description": "Response returned when creating a new agent session." }, + "ImageContentItem": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "image", + "default": "image", + "description": "Discriminator type of the content item. Always \"image\"" + }, + "image": { + "type": "object", + "properties": { + "url": { + "$ref": "#/components/schemas/URL", + "description": "A URL of the image or data URL in the format of data:image/{type};base64,{data}. Note that URL could have length limits." 
+ }, + "data": { + "type": "string", + "contentEncoding": "base64", + "description": "base64 encoded image data as string" + } + }, + "additionalProperties": false, + "description": "Image as a base64 encoded string or an URL" + } + }, + "additionalProperties": false, + "required": [ + "type", + "image" + ], + "title": "ImageContentItem", + "description": "A image content item" + }, + "InterleavedContent": { + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/components/schemas/InterleavedContentItem" + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/InterleavedContentItem" + } + } + ] + }, + "InterleavedContentItem": { + "oneOf": [ + { + "$ref": "#/components/schemas/ImageContentItem" + }, + { + "$ref": "#/components/schemas/TextContentItem" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "image": "#/components/schemas/ImageContentItem", + "text": "#/components/schemas/TextContentItem" + } + } + }, + "TextContentItem": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "text", + "default": "text", + "description": "Discriminator type of the content item. Always \"text\"" + }, + "text": { + "type": "string", + "description": "Text content" + } + }, + "additionalProperties": false, + "required": [ + "type", + "text" + ], + "title": "TextContentItem", + "description": "A text content item" + }, + "ToolResponseMessage": { + "type": "object", + "properties": { + "role": { + "type": "string", + "const": "tool", + "default": "tool", + "description": "Must be \"tool\" to identify this as a tool response" + }, + "call_id": { + "type": "string", + "description": "Unique identifier for the tool call this response is for" + }, + "content": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "The response content from the tool" + } + }, + "additionalProperties": false, + "required": [ + "role", + "call_id", + "content" + ], + "title": "ToolResponseMessage", + "description": "A message representing the result of a tool invocation." + }, + "URL": { + "type": "object", + "properties": { + "uri": { + "type": "string", + "description": "The URL string pointing to the resource" + } + }, + "additionalProperties": false, + "required": [ + "uri" + ], + "title": "URL", + "description": "A URL reference to external content." + }, + "UserMessage": { + "type": "object", + "properties": { + "role": { + "type": "string", + "const": "user", + "default": "user", + "description": "Must be \"user\" to identify this as a user message" + }, + "content": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "The content of the message, which can include text and other media" + }, + "context": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "(Optional) This field is used internally by Llama Stack to pass RAG context. This field may be removed in the API in the future." + } + }, + "additionalProperties": false, + "required": [ + "role", + "content" + ], + "title": "UserMessage", + "description": "A message from the user in a chat conversation." 
+ }, "CreateAgentTurnRequest": { "type": "object", "properties": { @@ -6871,6 +5798,45 @@ ], "title": "CreateAgentTurnRequest" }, + "CompletionMessage": { + "type": "object", + "properties": { + "role": { + "type": "string", + "const": "assistant", + "default": "assistant", + "description": "Must be \"assistant\" to identify this as the model's response" + }, + "content": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "The content of the model's response" + }, + "stop_reason": { + "type": "string", + "enum": [ + "end_of_turn", + "end_of_message", + "out_of_tokens" + ], + "description": "Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: The model finished generating the entire response. - `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response. - `StopReason.out_of_tokens`: The model ran out of token budget." + }, + "tool_calls": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ToolCall" + }, + "description": "List of tool calls. Each tool call is a ToolCall object." + } + }, + "additionalProperties": false, + "required": [ + "role", + "content", + "stop_reason" + ], + "title": "CompletionMessage", + "description": "A message containing the model's (assistant) response in a chat conversation." + }, "InferenceStep": { "type": "object", "properties": { @@ -7068,6 +6034,114 @@ "title": "ShieldCallStep", "description": "A shield call step in an agent turn." }, + "ToolCall": { + "type": "object", + "properties": { + "call_id": { + "type": "string" + }, + "tool_name": { + "oneOf": [ + { + "type": "string", + "enum": [ + "brave_search", + "wolfram_alpha", + "photogen", + "code_interpreter" + ], + "title": "BuiltinTool" + }, + { + "type": "string" + } + ] + }, + "arguments": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "null" + }, + { + "type": "array", + "items": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "null" + } + ] + } + }, + { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "null" + } + ] + } + } + ] + } + } + ] + }, + "arguments_json": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "call_id", + "tool_name", + "arguments" + ], + "title": "ToolCall" + }, "ToolExecutionStep": { "type": "object", "properties": { @@ -7658,6 +6732,112 @@ "title": "AgentTurnResponseTurnStartPayload", "description": "Payload for turn start events in agent turn responses." 
}, + "ContentDelta": { + "oneOf": [ + { + "$ref": "#/components/schemas/TextDelta" + }, + { + "$ref": "#/components/schemas/ImageDelta" + }, + { + "$ref": "#/components/schemas/ToolCallDelta" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "text": "#/components/schemas/TextDelta", + "image": "#/components/schemas/ImageDelta", + "tool_call": "#/components/schemas/ToolCallDelta" + } + } + }, + "ImageDelta": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "image", + "default": "image", + "description": "Discriminator type of the delta. Always \"image\"" + }, + "image": { + "type": "string", + "contentEncoding": "base64", + "description": "The incremental image data as bytes" + } + }, + "additionalProperties": false, + "required": [ + "type", + "image" + ], + "title": "ImageDelta", + "description": "An image content delta for streaming responses." + }, + "TextDelta": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "text", + "default": "text", + "description": "Discriminator type of the delta. Always \"text\"" + }, + "text": { + "type": "string", + "description": "The incremental text content" + } + }, + "additionalProperties": false, + "required": [ + "type", + "text" + ], + "title": "TextDelta", + "description": "A text content delta for streaming responses." + }, + "ToolCallDelta": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "tool_call", + "default": "tool_call", + "description": "Discriminator type of the delta. Always \"tool_call\"" + }, + "tool_call": { + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/components/schemas/ToolCall" + } + ], + "description": "Either an in-progress tool call string or the final parsed tool call" + }, + "parse_status": { + "type": "string", + "enum": [ + "started", + "in_progress", + "failed", + "succeeded" + ], + "description": "Current parsing status of the tool call" + } + }, + "additionalProperties": false, + "required": [ + "type", + "tool_call", + "parse_status" + ], + "title": "ToolCallDelta", + "description": "A tool call content delta for streaming responses." + }, "OpenAIResponseAnnotationCitation": { "type": "object", "properties": { @@ -9698,80 +8878,6 @@ "title": "OpenAIDeleteResponseObject", "description": "Response object confirming deletion of an OpenAI response." }, - "EmbeddingsRequest": { - "type": "object", - "properties": { - "model_id": { - "type": "string", - "description": "The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint." - }, - "contents": { - "oneOf": [ - { - "type": "array", - "items": { - "type": "string" - } - }, - { - "type": "array", - "items": { - "$ref": "#/components/schemas/InterleavedContentItem" - } - } - ], - "description": "List of contents to generate embeddings for. Each content can be a string or an InterleavedContentItem (and hence can be multimodal). The behavior depends on the model and provider. Some models may only support text." - }, - "text_truncation": { - "type": "string", - "enum": [ - "none", - "start", - "end" - ], - "description": "(Optional) Config for how to truncate text for embedding when text is longer than the model's max sequence length." - }, - "output_dimension": { - "type": "integer", - "description": "(Optional) Output dimensionality for the embeddings. Only supported by Matryoshka models." 
- }, - "task_type": { - "type": "string", - "enum": [ - "query", - "document" - ], - "description": "(Optional) How is the embedding being used? This is only supported by asymmetric embedding models." - } - }, - "additionalProperties": false, - "required": [ - "model_id", - "contents" - ], - "title": "EmbeddingsRequest" - }, - "EmbeddingsResponse": { - "type": "object", - "properties": { - "embeddings": { - "type": "array", - "items": { - "type": "array", - "items": { - "type": "number" - } - }, - "description": "List of embedding vectors, one per input content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}" - } - }, - "additionalProperties": false, - "required": [ - "embeddings" - ], - "title": "EmbeddingsResponse", - "description": "Response containing generated embeddings." - }, "AgentCandidate": { "type": "object", "properties": { @@ -10010,6 +9116,28 @@ "title": "ScoringFnParamsType", "description": "Types of scoring function parameter configurations." }, + "SystemMessage": { + "type": "object", + "properties": { + "role": { + "type": "string", + "const": "system", + "default": "system", + "description": "Must be \"system\" to identify this as a system message" + }, + "content": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "The content of the \"system prompt\". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions)." + } + }, + "additionalProperties": false, + "required": [ + "role", + "content" + ], + "title": "SystemMessage", + "description": "A system message providing instructions or context to the model." + }, "EvaluateRowsRequest": { "type": "object", "properties": { @@ -16905,6 +16033,31 @@ "title": "ModerationObjectResults", "description": "A moderation object." }, + "Message": { + "oneOf": [ + { + "$ref": "#/components/schemas/UserMessage" + }, + { + "$ref": "#/components/schemas/SystemMessage" + }, + { + "$ref": "#/components/schemas/ToolResponseMessage" + }, + { + "$ref": "#/components/schemas/CompletionMessage" + } + ], + "discriminator": { + "propertyName": "role", + "mapping": { + "user": "#/components/schemas/UserMessage", + "system": "#/components/schemas/SystemMessage", + "tool": "#/components/schemas/ToolResponseMessage", + "assistant": "#/components/schemas/CompletionMessage" + } + } + }, "RunShieldRequest": { "type": "object", "properties": { @@ -17502,11 +16655,6 @@ "description": "Main functionalities provided by this API:\n- Create agents with specific instructions and ability to use tools.\n- Interactions with agents are grouped into sessions (\"threads\"), and each interaction is called a \"turn\".\n- Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).\n- Agents can be provided with various shields (see the Safety API for more details).\n- Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details.", "x-displayName": "Agents API for creating and interacting with agentic systems." }, - { - "name": "BatchInference (Coming Soon)", - "description": "This is an asynchronous API. 
If the request is successful, the response will be a job which can be polled for completion.\n\nNOTE: This API is not yet implemented and is subject to change in concert with other asynchronous APIs\nincluding (post-training, evals, etc).", - "x-displayName": "Batch inference API for generating completions and chat completions." - }, { "name": "Benchmarks" }, @@ -17577,7 +16725,6 @@ "name": "Operations", "tags": [ "Agents", - "BatchInference (Coming Soon)", "Benchmarks", "DatasetIO", "Datasets", diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 25089868c..13f0a643d 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -42,68 +42,6 @@ paths: schema: $ref: '#/components/schemas/AppendRowsRequest' required: true - /v1/inference/batch-chat-completion: - post: - responses: - '200': - description: >- - A BatchChatCompletionResponse with the full completions. - content: - application/json: - schema: - $ref: '#/components/schemas/BatchChatCompletionResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Inference - description: >- - Generate chat completions for a batch of messages using the specified model. - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/BatchChatCompletionRequest' - required: true - /v1/inference/batch-completion: - post: - responses: - '200': - description: >- - A BatchCompletionResponse with the full completions. - content: - application/json: - schema: - $ref: '#/components/schemas/BatchCompletionResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Inference - description: >- - Generate completions for a batch of content using the specified model. - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/BatchCompletionRequest' - required: true /v1/post-training/job/cancel: post: responses: @@ -129,76 +67,6 @@ paths: schema: $ref: '#/components/schemas/CancelTrainingJobRequest' required: true - /v1/inference/chat-completion: - post: - responses: - '200': - description: >- - If stream=False, returns a ChatCompletionResponse with the full completion. - If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk. - content: - application/json: - schema: - $ref: '#/components/schemas/ChatCompletionResponse' - text/event-stream: - schema: - $ref: '#/components/schemas/ChatCompletionResponseStreamChunk' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - BatchInference (Coming Soon) - description: >- - Generate a chat completion for the given messages using the specified model. - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/ChatCompletionRequest' - required: true - /v1/inference/completion: - post: - responses: - '200': - description: >- - If stream=False, returns a CompletionResponse with the full completion. 
- If stream=True, returns an SSE event stream of CompletionResponseStreamChunk. - content: - application/json: - schema: - $ref: '#/components/schemas/CompletionResponse' - text/event-stream: - schema: - $ref: '#/components/schemas/CompletionResponseStreamChunk' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - BatchInference (Coming Soon) - description: >- - Generate a completion for the given content using the specified model. - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/CompletionRequest' - required: true /v1/agents: get: responses: @@ -616,39 +484,6 @@ paths: required: true schema: type: string - /v1/inference/embeddings: - post: - responses: - '200': - description: >- - An array of embeddings, one for each content. Each embedding is a list - of floats. The dimensionality of the embedding is model-specific; you - can check model metadata using /models/{model_id}. - content: - application/json: - schema: - $ref: '#/components/schemas/EmbeddingsResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Inference - description: >- - Generate embeddings for content pieces using the specified model. - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/EmbeddingsRequest' - required: true /v1/eval/benchmarks/{benchmark_id}/evaluations: post: responses: @@ -3682,731 +3517,6 @@ components: required: - rows title: AppendRowsRequest - CompletionMessage: - type: object - properties: - role: - type: string - const: assistant - default: assistant - description: >- - Must be "assistant" to identify this as the model's response - content: - $ref: '#/components/schemas/InterleavedContent' - description: The content of the model's response - stop_reason: - type: string - enum: - - end_of_turn - - end_of_message - - out_of_tokens - description: >- - Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: - The model finished generating the entire response. - `StopReason.end_of_message`: - The model finished generating but generated a partial response -- usually, - a tool call. The user may call the tool and continue the conversation - with the tool's response. - `StopReason.out_of_tokens`: The model ran - out of token budget. - tool_calls: - type: array - items: - $ref: '#/components/schemas/ToolCall' - description: >- - List of tool calls. Each tool call is a ToolCall object. - additionalProperties: false - required: - - role - - content - - stop_reason - title: CompletionMessage - description: >- - A message containing the model's (assistant) response in a chat conversation. 
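The CompletionMessage schema above (removed here from the legacy inference section, re-added later in this patch alongside the agent schemas) distinguishes three stop reasons. A minimal, hedged sketch of how a client might branch on them; `response` is assumed to be a legacy-style ChatCompletionResponse, and `run_tool` is a hypothetical helper, neither defined in this patch:

    # Hedged sketch: `response` is assumed to carry a `completion_message`
    # shaped like the CompletionMessage schema above; `run_tool` is a
    # hypothetical helper that executes a ToolCall and returns its output.
    def handle_completion_message(response, run_tool):
        msg = response.completion_message
        if msg.stop_reason == "end_of_turn":
            # The model finished generating the entire response.
            return msg.content
        if msg.stop_reason == "end_of_message":
            # Partial response, usually a tool call: execute each call and
            # hand the results back to continue the conversation.
            return [run_tool(call) for call in msg.tool_calls]
        if msg.stop_reason == "out_of_tokens":
            raise RuntimeError("model ran out of token budget")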
- GrammarResponseFormat: - type: object - properties: - type: - type: string - enum: - - json_schema - - grammar - description: >- - Must be "grammar" to identify this format type - const: grammar - default: grammar - bnf: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - description: >- - The BNF grammar specification the response should conform to - additionalProperties: false - required: - - type - - bnf - title: GrammarResponseFormat - description: >- - Configuration for grammar-guided response generation. - GreedySamplingStrategy: - type: object - properties: - type: - type: string - const: greedy - default: greedy - description: >- - Must be "greedy" to identify this sampling strategy - additionalProperties: false - required: - - type - title: GreedySamplingStrategy - description: >- - Greedy sampling strategy that selects the highest probability token at each - step. - ImageContentItem: - type: object - properties: - type: - type: string - const: image - default: image - description: >- - Discriminator type of the content item. Always "image" - image: - type: object - properties: - url: - $ref: '#/components/schemas/URL' - description: >- - A URL of the image or data URL in the format of data:image/{type};base64,{data}. - Note that URL could have length limits. - data: - type: string - contentEncoding: base64 - description: base64 encoded image data as string - additionalProperties: false - description: >- - Image as a base64 encoded string or an URL - additionalProperties: false - required: - - type - - image - title: ImageContentItem - description: A image content item - InterleavedContent: - oneOf: - - type: string - - $ref: '#/components/schemas/InterleavedContentItem' - - type: array - items: - $ref: '#/components/schemas/InterleavedContentItem' - InterleavedContentItem: - oneOf: - - $ref: '#/components/schemas/ImageContentItem' - - $ref: '#/components/schemas/TextContentItem' - discriminator: - propertyName: type - mapping: - image: '#/components/schemas/ImageContentItem' - text: '#/components/schemas/TextContentItem' - JsonSchemaResponseFormat: - type: object - properties: - type: - type: string - enum: - - json_schema - - grammar - description: >- - Must be "json_schema" to identify this format type - const: json_schema - default: json_schema - json_schema: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - description: >- - The JSON schema the response should conform to. In a Python SDK, this - is often a `pydantic` model. - additionalProperties: false - required: - - type - - json_schema - title: JsonSchemaResponseFormat - description: >- - Configuration for JSON schema-guided response generation. 
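JsonSchemaResponseFormat above is one arm of the ResponseFormat union defined just below. As its description notes, a Python SDK would often derive the schema from a pydantic model; a short sketch of building such a payload, assuming pydantic v2 and an illustrative `AnswerFormat` model that is not part of this patch:

    from pydantic import BaseModel

    class AnswerFormat(BaseModel):  # illustrative model, not from this patch
        first_name: str
        last_name: str
        year_of_birth: int

    # The wire shape matches the JsonSchemaResponseFormat schema above.
    response_format = {
        "type": "json_schema",
        "json_schema": AnswerFormat.model_json_schema(),
    }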
- Message: - oneOf: - - $ref: '#/components/schemas/UserMessage' - - $ref: '#/components/schemas/SystemMessage' - - $ref: '#/components/schemas/ToolResponseMessage' - - $ref: '#/components/schemas/CompletionMessage' - discriminator: - propertyName: role - mapping: - user: '#/components/schemas/UserMessage' - system: '#/components/schemas/SystemMessage' - tool: '#/components/schemas/ToolResponseMessage' - assistant: '#/components/schemas/CompletionMessage' - ResponseFormat: - oneOf: - - $ref: '#/components/schemas/JsonSchemaResponseFormat' - - $ref: '#/components/schemas/GrammarResponseFormat' - discriminator: - propertyName: type - mapping: - json_schema: '#/components/schemas/JsonSchemaResponseFormat' - grammar: '#/components/schemas/GrammarResponseFormat' - SamplingParams: - type: object - properties: - strategy: - $ref: '#/components/schemas/SamplingStrategy' - description: The sampling strategy. - max_tokens: - type: integer - default: 0 - description: >- - The maximum number of tokens that can be generated in the completion. - The token count of your prompt plus max_tokens cannot exceed the model's - context length. - repetition_penalty: - type: number - default: 1.0 - description: >- - Number between -2.0 and 2.0. Positive values penalize new tokens based - on whether they appear in the text so far, increasing the model's likelihood - to talk about new topics. - stop: - type: array - items: - type: string - description: >- - Up to 4 sequences where the API will stop generating further tokens. The - returned text will not contain the stop sequence. - additionalProperties: false - required: - - strategy - title: SamplingParams - description: Sampling parameters. - SamplingStrategy: - oneOf: - - $ref: '#/components/schemas/GreedySamplingStrategy' - - $ref: '#/components/schemas/TopPSamplingStrategy' - - $ref: '#/components/schemas/TopKSamplingStrategy' - discriminator: - propertyName: type - mapping: - greedy: '#/components/schemas/GreedySamplingStrategy' - top_p: '#/components/schemas/TopPSamplingStrategy' - top_k: '#/components/schemas/TopKSamplingStrategy' - SystemMessage: - type: object - properties: - role: - type: string - const: system - default: system - description: >- - Must be "system" to identify this as a system message - content: - $ref: '#/components/schemas/InterleavedContent' - description: >- - The content of the "system prompt". If multiple system messages are provided, - they are concatenated. The underlying Llama Stack code may also add other - system messages (for example, for formatting tool definitions). - additionalProperties: false - required: - - role - - content - title: SystemMessage - description: >- - A system message providing instructions or context to the model. - TextContentItem: - type: object - properties: - type: - type: string - const: text - default: text - description: >- - Discriminator type of the content item. 
Always "text" - text: - type: string - description: Text content - additionalProperties: false - required: - - type - - text - title: TextContentItem - description: A text content item - ToolCall: - type: object - properties: - call_id: - type: string - tool_name: - oneOf: - - type: string - enum: - - brave_search - - wolfram_alpha - - photogen - - code_interpreter - title: BuiltinTool - - type: string - arguments: - oneOf: - - type: string - - type: object - additionalProperties: - oneOf: - - type: string - - type: integer - - type: number - - type: boolean - - type: 'null' - - type: array - items: - oneOf: - - type: string - - type: integer - - type: number - - type: boolean - - type: 'null' - - type: object - additionalProperties: - oneOf: - - type: string - - type: integer - - type: number - - type: boolean - - type: 'null' - arguments_json: - type: string - additionalProperties: false - required: - - call_id - - tool_name - - arguments - title: ToolCall - ToolConfig: - type: object - properties: - tool_choice: - oneOf: - - type: string - enum: - - auto - - required - - none - title: ToolChoice - description: >- - Whether tool use is required or automatic. This is a hint to the model - which may not be followed. It depends on the Instruction Following - capabilities of the model. - - type: string - default: auto - description: >- - (Optional) Whether tool use is automatic, required, or none. Can also - specify a tool name to use a specific tool. Defaults to ToolChoice.auto. - tool_prompt_format: - type: string - enum: - - json - - function_tag - - python_list - description: >- - (Optional) Instructs the model how to format tool calls. By default, Llama - Stack will attempt to use a format that is best adapted to the model. - - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a - tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python - syntax -- a list of function calls. - system_message_behavior: - type: string - enum: - - append - - replace - description: >- - (Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: - Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: - Replaces the default system prompt with the provided system message. The - system message can include the string '{{function_definitions}}' to indicate - where the function definitions should be inserted. - default: append - additionalProperties: false - title: ToolConfig - description: Configuration for tool use. 
- ToolDefinition: - type: object - properties: - tool_name: - oneOf: - - type: string - enum: - - brave_search - - wolfram_alpha - - photogen - - code_interpreter - title: BuiltinTool - - type: string - description: - type: string - parameters: - type: object - additionalProperties: - $ref: '#/components/schemas/ToolParamDefinition' - additionalProperties: false - required: - - tool_name - title: ToolDefinition - ToolParamDefinition: - type: object - properties: - param_type: - type: string - description: - type: string - required: - type: boolean - default: true - default: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - param_type - title: ToolParamDefinition - ToolResponseMessage: - type: object - properties: - role: - type: string - const: tool - default: tool - description: >- - Must be "tool" to identify this as a tool response - call_id: - type: string - description: >- - Unique identifier for the tool call this response is for - content: - $ref: '#/components/schemas/InterleavedContent' - description: The response content from the tool - additionalProperties: false - required: - - role - - call_id - - content - title: ToolResponseMessage - description: >- - A message representing the result of a tool invocation. - TopKSamplingStrategy: - type: object - properties: - type: - type: string - const: top_k - default: top_k - description: >- - Must be "top_k" to identify this sampling strategy - top_k: - type: integer - description: >- - Number of top tokens to consider for sampling. Must be at least 1 - additionalProperties: false - required: - - type - - top_k - title: TopKSamplingStrategy - description: >- - Top-k sampling strategy that restricts sampling to the k most likely tokens. - TopPSamplingStrategy: - type: object - properties: - type: - type: string - const: top_p - default: top_p - description: >- - Must be "top_p" to identify this sampling strategy - temperature: - type: number - description: >- - Controls randomness in sampling. Higher values increase randomness - top_p: - type: number - default: 0.95 - description: >- - Cumulative probability threshold for nucleus sampling. Defaults to 0.95 - additionalProperties: false - required: - - type - title: TopPSamplingStrategy - description: >- - Top-p (nucleus) sampling strategy that samples from the smallest set of tokens - with cumulative probability >= p. - URL: - type: object - properties: - uri: - type: string - description: The URL string pointing to the resource - additionalProperties: false - required: - - uri - title: URL - description: A URL reference to external content. - UserMessage: - type: object - properties: - role: - type: string - const: user - default: user - description: >- - Must be "user" to identify this as a user message - content: - $ref: '#/components/schemas/InterleavedContent' - description: >- - The content of the message, which can include text and other media - context: - $ref: '#/components/schemas/InterleavedContent' - description: >- - (Optional) This field is used internally by Llama Stack to pass RAG context. - This field may be removed in the API in the future. - additionalProperties: false - required: - - role - - content - title: UserMessage - description: >- - A message from the user in a chat conversation. - BatchChatCompletionRequest: - type: object - properties: - model_id: - type: string - description: >- - The identifier of the model to use. 
The model must be registered with - Llama Stack and available via the /models endpoint. - messages_batch: - type: array - items: - type: array - items: - $ref: '#/components/schemas/Message' - description: >- - The messages to generate completions for. - sampling_params: - $ref: '#/components/schemas/SamplingParams' - description: >- - (Optional) Parameters to control the sampling strategy. - tools: - type: array - items: - $ref: '#/components/schemas/ToolDefinition' - description: >- - (Optional) List of tool definitions available to the model. - tool_config: - $ref: '#/components/schemas/ToolConfig' - description: (Optional) Configuration for tool use. - response_format: - $ref: '#/components/schemas/ResponseFormat' - description: >- - (Optional) Grammar specification for guided (structured) decoding. - logprobs: - type: object - properties: - top_k: - type: integer - default: 0 - description: >- - How many tokens (for each position) to return log probabilities for. - additionalProperties: false - description: >- - (Optional) If specified, log probabilities for each token position will - be returned. - additionalProperties: false - required: - - model_id - - messages_batch - title: BatchChatCompletionRequest - BatchChatCompletionResponse: - type: object - properties: - batch: - type: array - items: - $ref: '#/components/schemas/ChatCompletionResponse' - description: >- - List of chat completion responses, one for each conversation in the batch - additionalProperties: false - required: - - batch - title: BatchChatCompletionResponse - description: >- - Response from a batch chat completion request. - ChatCompletionResponse: - type: object - properties: - metrics: - type: array - items: - $ref: '#/components/schemas/MetricInResponse' - description: >- - (Optional) List of metrics associated with the API response - completion_message: - $ref: '#/components/schemas/CompletionMessage' - description: The complete response message - logprobs: - type: array - items: - $ref: '#/components/schemas/TokenLogProbs' - description: >- - Optional log probabilities for generated tokens - additionalProperties: false - required: - - completion_message - title: ChatCompletionResponse - description: Response from a chat completion request. - MetricInResponse: - type: object - properties: - metric: - type: string - description: The name of the metric - value: - oneOf: - - type: integer - - type: number - description: The numeric value of the metric - unit: - type: string - description: >- - (Optional) The unit of measurement for the metric value - additionalProperties: false - required: - - metric - - value - title: MetricInResponse - description: >- - A metric value included in API responses. - TokenLogProbs: - type: object - properties: - logprobs_by_token: - type: object - additionalProperties: - type: number - description: >- - Dictionary mapping tokens to their log probabilities - additionalProperties: false - required: - - logprobs_by_token - title: TokenLogProbs - description: Log probabilities for generated tokens. - BatchCompletionRequest: - type: object - properties: - model_id: - type: string - description: >- - The identifier of the model to use. The model must be registered with - Llama Stack and available via the /models endpoint. - content_batch: - type: array - items: - $ref: '#/components/schemas/InterleavedContent' - description: The content to generate completions for. 
- sampling_params: - $ref: '#/components/schemas/SamplingParams' - description: >- - (Optional) Parameters to control the sampling strategy. - response_format: - $ref: '#/components/schemas/ResponseFormat' - description: >- - (Optional) Grammar specification for guided (structured) decoding. - logprobs: - type: object - properties: - top_k: - type: integer - default: 0 - description: >- - How many tokens (for each position) to return log probabilities for. - additionalProperties: false - description: >- - (Optional) If specified, log probabilities for each token position will - be returned. - additionalProperties: false - required: - - model_id - - content_batch - title: BatchCompletionRequest - BatchCompletionResponse: - type: object - properties: - batch: - type: array - items: - $ref: '#/components/schemas/CompletionResponse' - description: >- - List of completion responses, one for each input in the batch - additionalProperties: false - required: - - batch - title: BatchCompletionResponse - description: >- - Response from a batch completion request. - CompletionResponse: - type: object - properties: - metrics: - type: array - items: - $ref: '#/components/schemas/MetricInResponse' - description: >- - (Optional) List of metrics associated with the API response - content: - type: string - description: The generated completion text - stop_reason: - type: string - enum: - - end_of_turn - - end_of_message - - out_of_tokens - description: Reason why generation stopped - logprobs: - type: array - items: - $ref: '#/components/schemas/TokenLogProbs' - description: >- - Optional log probabilities for generated tokens - additionalProperties: false - required: - - content - - stop_reason - title: CompletionResponse - description: Response from a completion request. CancelTrainingJobRequest: type: object properties: @@ -4417,294 +3527,6 @@ components: required: - job_uuid title: CancelTrainingJobRequest - ChatCompletionRequest: - type: object - properties: - model_id: - type: string - description: >- - The identifier of the model to use. The model must be registered with - Llama Stack and available via the /models endpoint. - messages: - type: array - items: - $ref: '#/components/schemas/Message' - description: List of messages in the conversation. - sampling_params: - $ref: '#/components/schemas/SamplingParams' - description: >- - Parameters to control the sampling strategy. - tools: - type: array - items: - $ref: '#/components/schemas/ToolDefinition' - description: >- - (Optional) List of tool definitions available to the model. - tool_choice: - type: string - enum: - - auto - - required - - none - description: >- - (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. - .. deprecated:: Use tool_config instead. - tool_prompt_format: - type: string - enum: - - json - - function_tag - - python_list - description: >- - (Optional) Instructs the model how to format tool calls. By default, Llama - Stack will attempt to use a format that is best adapted to the model. - - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a - tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python - syntax -- a list of function calls. .. deprecated:: Use tool_config instead. - response_format: - $ref: '#/components/schemas/ResponseFormat' - description: >- - (Optional) Grammar specification for guided (structured) decoding. 
There - are two options: - `ResponseFormat.json_schema`: The grammar is a JSON - schema. Most providers support this format. - `ResponseFormat.grammar`: - The grammar is a BNF grammar. This format is more flexible, but not all - providers support it. - stream: - type: boolean - description: >- - (Optional) If True, generate an SSE event stream of the response. Defaults - to False. - logprobs: - type: object - properties: - top_k: - type: integer - default: 0 - description: >- - How many tokens (for each position) to return log probabilities for. - additionalProperties: false - description: >- - (Optional) If specified, log probabilities for each token position will - be returned. - tool_config: - $ref: '#/components/schemas/ToolConfig' - description: (Optional) Configuration for tool use. - additionalProperties: false - required: - - model_id - - messages - title: ChatCompletionRequest - ChatCompletionResponseEvent: - type: object - properties: - event_type: - type: string - enum: - - start - - complete - - progress - description: Type of the event - delta: - $ref: '#/components/schemas/ContentDelta' - description: >- - Content generated since last event. This can be one or more tokens, or - a tool call. - logprobs: - type: array - items: - $ref: '#/components/schemas/TokenLogProbs' - description: >- - Optional log probabilities for generated tokens - stop_reason: - type: string - enum: - - end_of_turn - - end_of_message - - out_of_tokens - description: >- - Optional reason why generation stopped, if complete - additionalProperties: false - required: - - event_type - - delta - title: ChatCompletionResponseEvent - description: >- - An event during chat completion generation. - ChatCompletionResponseStreamChunk: - type: object - properties: - metrics: - type: array - items: - $ref: '#/components/schemas/MetricInResponse' - description: >- - (Optional) List of metrics associated with the API response - event: - $ref: '#/components/schemas/ChatCompletionResponseEvent' - description: The event containing the new content - additionalProperties: false - required: - - event - title: ChatCompletionResponseStreamChunk - description: >- - A chunk of a streamed chat completion response. - ContentDelta: - oneOf: - - $ref: '#/components/schemas/TextDelta' - - $ref: '#/components/schemas/ImageDelta' - - $ref: '#/components/schemas/ToolCallDelta' - discriminator: - propertyName: type - mapping: - text: '#/components/schemas/TextDelta' - image: '#/components/schemas/ImageDelta' - tool_call: '#/components/schemas/ToolCallDelta' - ImageDelta: - type: object - properties: - type: - type: string - const: image - default: image - description: >- - Discriminator type of the delta. Always "image" - image: - type: string - contentEncoding: base64 - description: The incremental image data as bytes - additionalProperties: false - required: - - type - - image - title: ImageDelta - description: >- - An image content delta for streaming responses. - TextDelta: - type: object - properties: - type: - type: string - const: text - default: text - description: >- - Discriminator type of the delta. Always "text" - text: - type: string - description: The incremental text content - additionalProperties: false - required: - - type - - text - title: TextDelta - description: >- - A text content delta for streaming responses. - ToolCallDelta: - type: object - properties: - type: - type: string - const: tool_call - default: tool_call - description: >- - Discriminator type of the delta. 
Always "tool_call" - tool_call: - oneOf: - - type: string - - $ref: '#/components/schemas/ToolCall' - description: >- - Either an in-progress tool call string or the final parsed tool call - parse_status: - type: string - enum: - - started - - in_progress - - failed - - succeeded - description: Current parsing status of the tool call - additionalProperties: false - required: - - type - - tool_call - - parse_status - title: ToolCallDelta - description: >- - A tool call content delta for streaming responses. - CompletionRequest: - type: object - properties: - model_id: - type: string - description: >- - The identifier of the model to use. The model must be registered with - Llama Stack and available via the /models endpoint. - content: - $ref: '#/components/schemas/InterleavedContent' - description: >- - The content to generate a completion for. - sampling_params: - $ref: '#/components/schemas/SamplingParams' - description: >- - (Optional) Parameters to control the sampling strategy. - response_format: - $ref: '#/components/schemas/ResponseFormat' - description: >- - (Optional) Grammar specification for guided (structured) decoding. - stream: - type: boolean - description: >- - (Optional) If True, generate an SSE event stream of the response. Defaults - to False. - logprobs: - type: object - properties: - top_k: - type: integer - default: 0 - description: >- - How many tokens (for each position) to return log probabilities for. - additionalProperties: false - description: >- - (Optional) If specified, log probabilities for each token position will - be returned. - additionalProperties: false - required: - - model_id - - content - title: CompletionRequest - CompletionResponseStreamChunk: - type: object - properties: - metrics: - type: array - items: - $ref: '#/components/schemas/MetricInResponse' - description: >- - (Optional) List of metrics associated with the API response - delta: - type: string - description: >- - New content generated since last chunk. This can be one or more tokens. - stop_reason: - type: string - enum: - - end_of_turn - - end_of_message - - out_of_tokens - description: >- - Optional reason why generation stopped, if complete - logprobs: - type: array - items: - $ref: '#/components/schemas/TokenLogProbs' - description: >- - Optional log probabilities for generated tokens - additionalProperties: false - required: - - delta - title: CompletionResponseStreamChunk - description: >- - A chunk of a streamed completion response. AgentConfig: type: object properties: @@ -4800,6 +3622,185 @@ components: - name - args title: AgentToolGroupWithArgs + GrammarResponseFormat: + type: object + properties: + type: + type: string + enum: + - json_schema + - grammar + description: >- + Must be "grammar" to identify this format type + const: grammar + default: grammar + bnf: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + description: >- + The BNF grammar specification the response should conform to + additionalProperties: false + required: + - type + - bnf + title: GrammarResponseFormat + description: >- + Configuration for grammar-guided response generation. 
+ GreedySamplingStrategy: + type: object + properties: + type: + type: string + const: greedy + default: greedy + description: >- + Must be "greedy" to identify this sampling strategy + additionalProperties: false + required: + - type + title: GreedySamplingStrategy + description: >- + Greedy sampling strategy that selects the highest probability token at each + step. + JsonSchemaResponseFormat: + type: object + properties: + type: + type: string + enum: + - json_schema + - grammar + description: >- + Must be "json_schema" to identify this format type + const: json_schema + default: json_schema + json_schema: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + description: >- + The JSON schema the response should conform to. In a Python SDK, this + is often a `pydantic` model. + additionalProperties: false + required: + - type + - json_schema + title: JsonSchemaResponseFormat + description: >- + Configuration for JSON schema-guided response generation. + ResponseFormat: + oneOf: + - $ref: '#/components/schemas/JsonSchemaResponseFormat' + - $ref: '#/components/schemas/GrammarResponseFormat' + discriminator: + propertyName: type + mapping: + json_schema: '#/components/schemas/JsonSchemaResponseFormat' + grammar: '#/components/schemas/GrammarResponseFormat' + SamplingParams: + type: object + properties: + strategy: + $ref: '#/components/schemas/SamplingStrategy' + description: The sampling strategy. + max_tokens: + type: integer + default: 0 + description: >- + The maximum number of tokens that can be generated in the completion. + The token count of your prompt plus max_tokens cannot exceed the model's + context length. + repetition_penalty: + type: number + default: 1.0 + description: >- + Number between -2.0 and 2.0. Positive values penalize new tokens based + on whether they appear in the text so far, increasing the model's likelihood + to talk about new topics. + stop: + type: array + items: + type: string + description: >- + Up to 4 sequences where the API will stop generating further tokens. The + returned text will not contain the stop sequence. + additionalProperties: false + required: + - strategy + title: SamplingParams + description: Sampling parameters. + SamplingStrategy: + oneOf: + - $ref: '#/components/schemas/GreedySamplingStrategy' + - $ref: '#/components/schemas/TopPSamplingStrategy' + - $ref: '#/components/schemas/TopKSamplingStrategy' + discriminator: + propertyName: type + mapping: + greedy: '#/components/schemas/GreedySamplingStrategy' + top_p: '#/components/schemas/TopPSamplingStrategy' + top_k: '#/components/schemas/TopKSamplingStrategy' + ToolConfig: + type: object + properties: + tool_choice: + oneOf: + - type: string + enum: + - auto + - required + - none + title: ToolChoice + description: >- + Whether tool use is required or automatic. This is a hint to the model + which may not be followed. It depends on the Instruction Following + capabilities of the model. + - type: string + default: auto + description: >- + (Optional) Whether tool use is automatic, required, or none. Can also + specify a tool name to use a specific tool. Defaults to ToolChoice.auto. + tool_prompt_format: + type: string + enum: + - json + - function_tag + - python_list + description: >- + (Optional) Instructs the model how to format tool calls. By default, Llama + Stack will attempt to use a format that is best adapted to the model. 
+ - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. + - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a + tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python + syntax -- a list of function calls. + system_message_behavior: + type: string + enum: + - append + - replace + description: >- + (Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: + Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: + Replaces the default system prompt with the provided system message. The + system message can include the string '{{function_definitions}}' to indicate + where the function definitions should be inserted. + default: append + additionalProperties: false + title: ToolConfig + description: Configuration for tool use. ToolDef: type: object properties: @@ -4871,6 +3872,51 @@ components: - required title: ToolParameter description: Parameter definition for a tool. + TopKSamplingStrategy: + type: object + properties: + type: + type: string + const: top_k + default: top_k + description: >- + Must be "top_k" to identify this sampling strategy + top_k: + type: integer + description: >- + Number of top tokens to consider for sampling. Must be at least 1 + additionalProperties: false + required: + - type + - top_k + title: TopKSamplingStrategy + description: >- + Top-k sampling strategy that restricts sampling to the k most likely tokens. + TopPSamplingStrategy: + type: object + properties: + type: + type: string + const: top_p + default: top_p + description: >- + Must be "top_p" to identify this sampling strategy + temperature: + type: number + description: >- + Controls randomness in sampling. Higher values increase randomness + top_p: + type: number + default: 0.95 + description: >- + Cumulative probability threshold for nucleus sampling. Defaults to 0.95 + additionalProperties: false + required: + - type + title: TopPSamplingStrategy + description: >- + Top-p (nucleus) sampling strategy that samples from the smallest set of tokens + with cumulative probability >= p. CreateAgentRequest: type: object properties: @@ -4916,6 +3962,130 @@ components: title: AgentSessionCreateResponse description: >- Response returned when creating a new agent session. + ImageContentItem: + type: object + properties: + type: + type: string + const: image + default: image + description: >- + Discriminator type of the content item. Always "image" + image: + type: object + properties: + url: + $ref: '#/components/schemas/URL' + description: >- + A URL of the image or data URL in the format of data:image/{type};base64,{data}. + Note that URL could have length limits. 
+ data: + type: string + contentEncoding: base64 + description: base64 encoded image data as string + additionalProperties: false + description: >- + Image as a base64 encoded string or an URL + additionalProperties: false + required: + - type + - image + title: ImageContentItem + description: A image content item + InterleavedContent: + oneOf: + - type: string + - $ref: '#/components/schemas/InterleavedContentItem' + - type: array + items: + $ref: '#/components/schemas/InterleavedContentItem' + InterleavedContentItem: + oneOf: + - $ref: '#/components/schemas/ImageContentItem' + - $ref: '#/components/schemas/TextContentItem' + discriminator: + propertyName: type + mapping: + image: '#/components/schemas/ImageContentItem' + text: '#/components/schemas/TextContentItem' + TextContentItem: + type: object + properties: + type: + type: string + const: text + default: text + description: >- + Discriminator type of the content item. Always "text" + text: + type: string + description: Text content + additionalProperties: false + required: + - type + - text + title: TextContentItem + description: A text content item + ToolResponseMessage: + type: object + properties: + role: + type: string + const: tool + default: tool + description: >- + Must be "tool" to identify this as a tool response + call_id: + type: string + description: >- + Unique identifier for the tool call this response is for + content: + $ref: '#/components/schemas/InterleavedContent' + description: The response content from the tool + additionalProperties: false + required: + - role + - call_id + - content + title: ToolResponseMessage + description: >- + A message representing the result of a tool invocation. + URL: + type: object + properties: + uri: + type: string + description: The URL string pointing to the resource + additionalProperties: false + required: + - uri + title: URL + description: A URL reference to external content. + UserMessage: + type: object + properties: + role: + type: string + const: user + default: user + description: >- + Must be "user" to identify this as a user message + content: + $ref: '#/components/schemas/InterleavedContent' + description: >- + The content of the message, which can include text and other media + context: + $ref: '#/components/schemas/InterleavedContent' + description: >- + (Optional) This field is used internally by Llama Stack to pass RAG context. + This field may be removed in the API in the future. + additionalProperties: false + required: + - role + - content + title: UserMessage + description: >- + A message from the user in a chat conversation. CreateAgentTurnRequest: type: object properties: @@ -4972,6 +4142,45 @@ components: required: - messages title: CreateAgentTurnRequest + CompletionMessage: + type: object + properties: + role: + type: string + const: assistant + default: assistant + description: >- + Must be "assistant" to identify this as the model's response + content: + $ref: '#/components/schemas/InterleavedContent' + description: The content of the model's response + stop_reason: + type: string + enum: + - end_of_turn + - end_of_message + - out_of_tokens + description: >- + Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: + The model finished generating the entire response. - `StopReason.end_of_message`: + The model finished generating but generated a partial response -- usually, + a tool call. The user may call the tool and continue the conversation + with the tool's response. 
- `StopReason.out_of_tokens`: The model ran + out of token budget. + tool_calls: + type: array + items: + $ref: '#/components/schemas/ToolCall' + description: >- + List of tool calls. Each tool call is a ToolCall object. + additionalProperties: false + required: + - role + - content + - stop_reason + title: CompletionMessage + description: >- + A message containing the model's (assistant) response in a chat conversation. InferenceStep: type: object properties: @@ -5125,6 +4334,56 @@ components: - step_type title: ShieldCallStep description: A shield call step in an agent turn. + ToolCall: + type: object + properties: + call_id: + type: string + tool_name: + oneOf: + - type: string + enum: + - brave_search + - wolfram_alpha + - photogen + - code_interpreter + title: BuiltinTool + - type: string + arguments: + oneOf: + - type: string + - type: object + additionalProperties: + oneOf: + - type: string + - type: integer + - type: number + - type: boolean + - type: 'null' + - type: array + items: + oneOf: + - type: string + - type: integer + - type: number + - type: boolean + - type: 'null' + - type: object + additionalProperties: + oneOf: + - type: string + - type: integer + - type: number + - type: boolean + - type: 'null' + arguments_json: + type: string + additionalProperties: false + required: + - call_id + - tool_name + - arguments + title: ToolCall ToolExecutionStep: type: object properties: @@ -5563,6 +4822,87 @@ components: title: AgentTurnResponseTurnStartPayload description: >- Payload for turn start events in agent turn responses. + ContentDelta: + oneOf: + - $ref: '#/components/schemas/TextDelta' + - $ref: '#/components/schemas/ImageDelta' + - $ref: '#/components/schemas/ToolCallDelta' + discriminator: + propertyName: type + mapping: + text: '#/components/schemas/TextDelta' + image: '#/components/schemas/ImageDelta' + tool_call: '#/components/schemas/ToolCallDelta' + ImageDelta: + type: object + properties: + type: + type: string + const: image + default: image + description: >- + Discriminator type of the delta. Always "image" + image: + type: string + contentEncoding: base64 + description: The incremental image data as bytes + additionalProperties: false + required: + - type + - image + title: ImageDelta + description: >- + An image content delta for streaming responses. + TextDelta: + type: object + properties: + type: + type: string + const: text + default: text + description: >- + Discriminator type of the delta. Always "text" + text: + type: string + description: The incremental text content + additionalProperties: false + required: + - type + - text + title: TextDelta + description: >- + A text content delta for streaming responses. + ToolCallDelta: + type: object + properties: + type: + type: string + const: tool_call + default: tool_call + description: >- + Discriminator type of the delta. Always "tool_call" + tool_call: + oneOf: + - type: string + - $ref: '#/components/schemas/ToolCall' + description: >- + Either an in-progress tool call string or the final parsed tool call + parse_status: + type: string + enum: + - started + - in_progress + - failed + - succeeded + description: Current parsing status of the tool call + additionalProperties: false + required: + - type + - tool_call + - parse_status + title: ToolCallDelta + description: >- + A tool call content delta for streaming responses. 
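The ContentDelta union and its three variants, re-added above for the agent turn-response stream, are consumed by discriminating on `type`. A minimal sketch; `events` stands in for any iterable of stream chunks exposing a `.delta` shaped like the schemas above, and is an assumption rather than a concrete client type:

    # Hedged sketch: `events` is assumed to yield objects whose `.delta`
    # matches the ContentDelta union above.
    def consume_deltas(events):
        text_parts, tool_calls = [], []
        for event in events:
            delta = event.delta
            if delta.type == "text":
                text_parts.append(delta.text)
            elif delta.type == "tool_call" and delta.parse_status == "succeeded":
                # Keep only the final, fully parsed ToolCall.
                tool_calls.append(delta.tool_call)
            elif delta.type == "image":
                pass  # incremental base64 image data; ignored in this sketch
        return "".join(text_parts), tool_calls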
OpenAIResponseAnnotationCitation: type: object properties: @@ -7173,72 +6513,6 @@ components: title: OpenAIDeleteResponseObject description: >- Response object confirming deletion of an OpenAI response. - EmbeddingsRequest: - type: object - properties: - model_id: - type: string - description: >- - The identifier of the model to use. The model must be an embedding model - registered with Llama Stack and available via the /models endpoint. - contents: - oneOf: - - type: array - items: - type: string - - type: array - items: - $ref: '#/components/schemas/InterleavedContentItem' - description: >- - List of contents to generate embeddings for. Each content can be a string - or an InterleavedContentItem (and hence can be multimodal). The behavior - depends on the model and provider. Some models may only support text. - text_truncation: - type: string - enum: - - none - - start - - end - description: >- - (Optional) Config for how to truncate text for embedding when text is - longer than the model's max sequence length. - output_dimension: - type: integer - description: >- - (Optional) Output dimensionality for the embeddings. Only supported by - Matryoshka models. - task_type: - type: string - enum: - - query - - document - description: >- - (Optional) How is the embedding being used? This is only supported by - asymmetric embedding models. - additionalProperties: false - required: - - model_id - - contents - title: EmbeddingsRequest - EmbeddingsResponse: - type: object - properties: - embeddings: - type: array - items: - type: array - items: - type: number - description: >- - List of embedding vectors, one per input content. Each embedding is a - list of floats. The dimensionality of the embedding is model-specific; - you can check model metadata using /models/{model_id} - additionalProperties: false - required: - - embeddings - title: EmbeddingsResponse - description: >- - Response containing generated embeddings. AgentCandidate: type: object properties: @@ -7435,6 +6709,28 @@ components: title: ScoringFnParamsType description: >- Types of scoring function parameter configurations. + SystemMessage: + type: object + properties: + role: + type: string + const: system + default: system + description: >- + Must be "system" to identify this as a system message + content: + $ref: '#/components/schemas/InterleavedContent' + description: >- + The content of the "system prompt". If multiple system messages are provided, + they are concatenated. The underlying Llama Stack code may also add other + system messages (for example, for formatting tool definitions). + additionalProperties: false + required: + - role + - content + title: SystemMessage + description: >- + A system message providing instructions or context to the model. EvaluateRowsRequest: type: object properties: @@ -12571,6 +11867,19 @@ components: - metadata title: ModerationObjectResults description: A moderation object. + Message: + oneOf: + - $ref: '#/components/schemas/UserMessage' + - $ref: '#/components/schemas/SystemMessage' + - $ref: '#/components/schemas/ToolResponseMessage' + - $ref: '#/components/schemas/CompletionMessage' + discriminator: + propertyName: role + mapping: + user: '#/components/schemas/UserMessage' + system: '#/components/schemas/SystemMessage' + tool: '#/components/schemas/ToolResponseMessage' + assistant: '#/components/schemas/CompletionMessage' RunShieldRequest: type: object properties: @@ -12995,18 +12304,6 @@ tags: the RAG Tool and Vector IO APIs for more details. 
x-displayName: >- Agents API for creating and interacting with agentic systems. - - name: BatchInference (Coming Soon) - description: >- - This is an asynchronous API. If the request is successful, the response will - be a job which can be polled for completion. - - - NOTE: This API is not yet implemented and is subject to change in concert with - other asynchronous APIs - - including (post-training, evals, etc). - x-displayName: >- - Batch inference API for generating completions and chat completions. - name: Benchmarks - name: DatasetIO - name: Datasets @@ -13046,7 +12343,6 @@ x-tagGroups: - name: Operations tags: - Agents - - BatchInference (Coming Soon) - Benchmarks - DatasetIO - Datasets diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py index bd4737ca7..8e3b25b29 100644 --- a/llama_stack/apis/inference/inference.py +++ b/llama_stack/apis/inference/inference.py @@ -1026,7 +1026,6 @@ class InferenceProvider(Protocol): model_store: ModelStore | None = None - @webmethod(route="/inference/completion", method="POST") async def completion( self, model_id: str, @@ -1049,7 +1048,6 @@ class InferenceProvider(Protocol): """ ... - @webmethod(route="/inference/batch-completion", method="POST", experimental=True) async def batch_completion( self, model_id: str, @@ -1070,7 +1068,6 @@ class InferenceProvider(Protocol): raise NotImplementedError("Batch completion is not implemented") return # this is so mypy's safe-super rule will consider the method concrete - @webmethod(route="/inference/chat-completion", method="POST") async def chat_completion( self, model_id: str, @@ -1110,7 +1107,6 @@ class InferenceProvider(Protocol): """ ... - @webmethod(route="/inference/batch-chat-completion", method="POST", experimental=True) async def batch_chat_completion( self, model_id: str, @@ -1135,7 +1131,6 @@ class InferenceProvider(Protocol): raise NotImplementedError("Batch chat completion is not implemented") return # this is so mypy's safe-super rule will consider the method concrete - @webmethod(route="/inference/embeddings", method="POST") async def embeddings( self, model_id: str, diff --git a/tests/integration/inference/test_batch_inference.py b/tests/integration/inference/test_batch_inference.py deleted file mode 100644 index 9a1a62ce0..000000000 --- a/tests/integration/inference/test_batch_inference.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
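With the `@webmethod` decorators above removed, the legacy `/v1/inference/*` routes are no longer served, although the protocol methods remain for providers. Callers would move to the OpenAI-compatible surface instead; a hedged sketch using the stock `openai` client, assuming a locally running stack on the default port and that the OpenAI-compatible prefix stays published — the base URL and model id are assumptions, not part of this patch:

    from openai import OpenAI

    # Assumed base URL for a local Llama Stack server's OpenAI-compatible API.
    client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

    response = client.chat.completions.create(
        model="meta-llama/Llama-3.2-3B-Instruct",  # illustrative model id
        messages=[{"role": "user", "content": "Hello"}],
    )
    print(response.choices[0].message.content)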
- - -import pytest - -from ..test_cases.test_case import TestCase - - -def skip_if_provider_doesnt_support_batch_inference(client_with_models, model_id): - models = {m.identifier: m for m in client_with_models.models.list()} - models.update({m.provider_resource_id: m for m in client_with_models.models.list()}) - provider_id = models[model_id].provider_id - providers = {p.provider_id: p for p in client_with_models.providers.list()} - provider = providers[provider_id] - if provider.provider_type not in ("inline::meta-reference",): - pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support batch inference") - - -@pytest.mark.parametrize( - "test_case", - [ - "inference:completion:batch_completion", - ], -) -def test_batch_completion_non_streaming(client_with_models, text_model_id, test_case): - skip_if_provider_doesnt_support_batch_inference(client_with_models, text_model_id) - tc = TestCase(test_case) - - content_batch = tc["contents"] - response = client_with_models.inference.batch_completion( - content_batch=content_batch, - model_id=text_model_id, - sampling_params={ - "max_tokens": 50, - }, - ) - assert len(response.batch) == len(content_batch) - for i, r in enumerate(response.batch): - print(f"response {i}: {r.content}") - assert len(r.content) > 10 - - -@pytest.mark.parametrize( - "test_case", - [ - "inference:chat_completion:batch_completion", - ], -) -def test_batch_chat_completion_non_streaming(client_with_models, text_model_id, test_case): - skip_if_provider_doesnt_support_batch_inference(client_with_models, text_model_id) - tc = TestCase(test_case) - qa_pairs = tc["qa_pairs"] - - message_batch = [ - [ - { - "role": "user", - "content": qa["question"], - } - ] - for qa in qa_pairs - ] - - response = client_with_models.inference.batch_chat_completion( - messages_batch=message_batch, - model_id=text_model_id, - ) - assert len(response.batch) == len(qa_pairs) - for i, r in enumerate(response.batch): - print(f"response {i}: {r.completion_message.content}") - assert len(r.completion_message.content) > 0 - assert qa_pairs[i]["answer"].lower() in r.completion_message.content.lower() diff --git a/tests/integration/inference/test_embedding.py b/tests/integration/inference/test_embedding.py deleted file mode 100644 index e592a6b14..000000000 --- a/tests/integration/inference/test_embedding.py +++ /dev/null @@ -1,303 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
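The deleted embedding tests below exercised the now-unpublished `/v1/inference/embeddings` route. The closest equivalent on the OpenAI-compatible surface, sketched under the same assumptions as above (local server, published compat prefix, illustrative model id):

    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

    # One vector per input string, mirroring the removed EmbeddingsResponse shape.
    result = client.embeddings.create(
        model="nvidia/llama-3.2-nv-embedqa-1b-v2",  # illustrative embedding model id
        input=["hello", "world"],
    )
    assert len(result.data) == 2
    print(len(result.data[0].embedding))  # dimensionality is model-specific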
- - -# -# Test plan: -# -# Types of input: -# - array of a string -# - array of a image (ImageContentItem, either URL or base64 string) -# - array of a text (TextContentItem) -# Types of output: -# - list of list of floats -# Params: -# - text_truncation -# - absent w/ long text -> error -# - none w/ long text -> error -# - absent w/ short text -> ok -# - none w/ short text -> ok -# - end w/ long text -> ok -# - end w/ short text -> ok -# - start w/ long text -> ok -# - start w/ short text -> ok -# - output_dimension -# - response dimension matches -# - task_type, only for asymmetric models -# - query embedding != passage embedding -# Negative: -# - long string -# - long text -# -# Todo: -# - negative tests -# - empty -# - empty list -# - empty string -# - empty text -# - empty image -# - long -# - large image -# - appropriate combinations -# - batch size -# - many inputs -# - invalid -# - invalid URL -# - invalid base64 -# -# Notes: -# - use llama_stack_client fixture -# - use pytest.mark.parametrize when possible -# - no accuracy tests: only check the type of output, not the content -# - -import pytest -from llama_stack_client import BadRequestError as LlamaStackBadRequestError -from llama_stack_client.types import EmbeddingsResponse -from llama_stack_client.types.shared.interleaved_content import ( - ImageContentItem, - ImageContentItemImage, - ImageContentItemImageURL, - TextContentItem, -) -from openai import BadRequestError as OpenAIBadRequestError - -from llama_stack.core.library_client import LlamaStackAsLibraryClient - -DUMMY_STRING = "hello" -DUMMY_STRING2 = "world" -DUMMY_LONG_STRING = "NVDA " * 10240 -DUMMY_TEXT = TextContentItem(text=DUMMY_STRING, type="text") -DUMMY_TEXT2 = TextContentItem(text=DUMMY_STRING2, type="text") -DUMMY_LONG_TEXT = TextContentItem(text=DUMMY_LONG_STRING, type="text") -# TODO(mf): add a real image URL and base64 string -DUMMY_IMAGE_URL = ImageContentItem( - image=ImageContentItemImage(url=ImageContentItemImageURL(uri="https://example.com/image.jpg")), type="image" -) -DUMMY_IMAGE_BASE64 = ImageContentItem(image=ImageContentItemImage(data="base64string"), type="image") -SUPPORTED_PROVIDERS = {"remote::nvidia"} -MODELS_SUPPORTING_MEDIA = {} -MODELS_SUPPORTING_OUTPUT_DIMENSION = {"nvidia/llama-3.2-nv-embedqa-1b-v2"} -MODELS_REQUIRING_TASK_TYPE = { - "nvidia/llama-3.2-nv-embedqa-1b-v2", - "nvidia/nv-embedqa-e5-v5", - "nvidia/nv-embedqa-mistral-7b-v2", - "snowflake/arctic-embed-l", -} -MODELS_SUPPORTING_TASK_TYPE = MODELS_REQUIRING_TASK_TYPE - - -def default_task_type(model_id): - """ - Some models require a task type parameter. This provides a default value for - testing those models. 
- """ - if model_id in MODELS_REQUIRING_TASK_TYPE: - return {"task_type": "query"} - return {} - - -@pytest.mark.parametrize( - "contents", - [ - [DUMMY_STRING, DUMMY_STRING2], - [DUMMY_TEXT, DUMMY_TEXT2], - ], - ids=[ - "list[string]", - "list[text]", - ], -) -def test_embedding_text(llama_stack_client, embedding_model_id, contents, inference_provider_type): - if inference_provider_type not in SUPPORTED_PROVIDERS: - pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet") - response = llama_stack_client.inference.embeddings( - model_id=embedding_model_id, contents=contents, **default_task_type(embedding_model_id) - ) - assert isinstance(response, EmbeddingsResponse) - assert len(response.embeddings) == sum(len(content) if isinstance(content, list) else 1 for content in contents) - assert isinstance(response.embeddings[0], list) - assert isinstance(response.embeddings[0][0], float) - - -@pytest.mark.parametrize( - "contents", - [ - [DUMMY_IMAGE_URL, DUMMY_IMAGE_BASE64], - [DUMMY_IMAGE_URL, DUMMY_STRING, DUMMY_IMAGE_BASE64, DUMMY_TEXT], - ], - ids=[ - "list[url,base64]", - "list[url,string,base64,text]", - ], -) -def test_embedding_image(llama_stack_client, embedding_model_id, contents, inference_provider_type): - if inference_provider_type not in SUPPORTED_PROVIDERS: - pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet") - if embedding_model_id not in MODELS_SUPPORTING_MEDIA: - pytest.xfail(f"{embedding_model_id} doesn't support media") - response = llama_stack_client.inference.embeddings( - model_id=embedding_model_id, contents=contents, **default_task_type(embedding_model_id) - ) - assert isinstance(response, EmbeddingsResponse) - assert len(response.embeddings) == sum(len(content) if isinstance(content, list) else 1 for content in contents) - assert isinstance(response.embeddings[0], list) - assert isinstance(response.embeddings[0][0], float) - - -@pytest.mark.parametrize( - "text_truncation", - [ - "end", - "start", - ], -) -@pytest.mark.parametrize( - "contents", - [ - [DUMMY_LONG_TEXT], - [DUMMY_STRING], - ], - ids=[ - "long", - "short", - ], -) -def test_embedding_truncation( - llama_stack_client, embedding_model_id, text_truncation, contents, inference_provider_type -): - if inference_provider_type not in SUPPORTED_PROVIDERS: - pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet") - response = llama_stack_client.inference.embeddings( - model_id=embedding_model_id, - contents=contents, - text_truncation=text_truncation, - **default_task_type(embedding_model_id), - ) - assert isinstance(response, EmbeddingsResponse) - assert len(response.embeddings) == 1 - assert isinstance(response.embeddings[0], list) - assert isinstance(response.embeddings[0][0], float) - - -@pytest.mark.parametrize( - "text_truncation", - [ - None, - "none", - ], -) -@pytest.mark.parametrize( - "contents", - [ - [DUMMY_LONG_TEXT], - [DUMMY_LONG_STRING], - ], - ids=[ - "long-text", - "long-str", - ], -) -def test_embedding_truncation_error( - llama_stack_client, embedding_model_id, text_truncation, contents, inference_provider_type -): - if inference_provider_type not in SUPPORTED_PROVIDERS: - pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet") - # Using LlamaStackClient from llama_stack_client will raise llama_stack_client.BadRequestError - # While using LlamaStackAsLibraryClient from llama_stack.distribution.library_client will raise the error that the backend raises - error_type = ( - OpenAIBadRequestError - if 
-
-
-@pytest.mark.parametrize(
-    "text_truncation",
-    [
-        None,
-        "none",
-    ],
-)
-@pytest.mark.parametrize(
-    "contents",
-    [
-        [DUMMY_LONG_TEXT],
-        [DUMMY_LONG_STRING],
-    ],
-    ids=[
-        "long-text",
-        "long-str",
-    ],
-)
-def test_embedding_truncation_error(
-    llama_stack_client, embedding_model_id, text_truncation, contents, inference_provider_type
-):
-    if inference_provider_type not in SUPPORTED_PROVIDERS:
-        pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
-    # An HTTP LlamaStackClient raises llama_stack_client.BadRequestError, while
-    # LlamaStackAsLibraryClient (llama_stack.core.library_client) surfaces the
-    # error the backend raises, here openai.BadRequestError.
-    error_type = (
-        OpenAIBadRequestError
-        if isinstance(llama_stack_client, LlamaStackAsLibraryClient)
-        else LlamaStackBadRequestError
-    )
-    with pytest.raises(error_type):
-        llama_stack_client.inference.embeddings(
-            model_id=embedding_model_id,
-            contents=contents,
-            text_truncation=text_truncation,
-            **default_task_type(embedding_model_id),
-        )
-
-
-def test_embedding_output_dimension(llama_stack_client, embedding_model_id, inference_provider_type):
-    if inference_provider_type not in SUPPORTED_PROVIDERS:
-        pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
-    if embedding_model_id not in MODELS_SUPPORTING_OUTPUT_DIMENSION:
-        pytest.xfail(f"{embedding_model_id} doesn't support output_dimension")
-    base_response = llama_stack_client.inference.embeddings(
-        model_id=embedding_model_id, contents=[DUMMY_STRING], **default_task_type(embedding_model_id)
-    )
-    test_response = llama_stack_client.inference.embeddings(
-        model_id=embedding_model_id,
-        contents=[DUMMY_STRING],
-        **default_task_type(embedding_model_id),
-        output_dimension=32,
-    )
-    assert len(base_response.embeddings[0]) != len(test_response.embeddings[0])
-    assert len(test_response.embeddings[0]) == 32
-
-
-def test_embedding_task_type(llama_stack_client, embedding_model_id, inference_provider_type):
-    if inference_provider_type not in SUPPORTED_PROVIDERS:
-        pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
-    if embedding_model_id not in MODELS_SUPPORTING_TASK_TYPE:
-        pytest.xfail(f"{embedding_model_id} doesn't support task_type")
-    query_embedding = llama_stack_client.inference.embeddings(
-        model_id=embedding_model_id, contents=[DUMMY_STRING], task_type="query"
-    )
-    document_embedding = llama_stack_client.inference.embeddings(
-        model_id=embedding_model_id, contents=[DUMMY_STRING], task_type="document"
-    )
-    assert query_embedding.embeddings != document_embedding.embeddings
-
-
-@pytest.mark.parametrize(
-    "text_truncation",
-    [
-        None,
-        "none",
-        "end",
-        "start",
-    ],
-)
-def test_embedding_text_truncation(llama_stack_client, embedding_model_id, text_truncation, inference_provider_type):
-    if inference_provider_type not in SUPPORTED_PROVIDERS:
-        pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
-    response = llama_stack_client.inference.embeddings(
-        model_id=embedding_model_id,
-        contents=[DUMMY_STRING],
-        text_truncation=text_truncation,
-        **default_task_type(embedding_model_id),
-    )
-    assert isinstance(response, EmbeddingsResponse)
-    assert len(response.embeddings) == 1
-    assert isinstance(response.embeddings[0], list)
-    assert isinstance(response.embeddings[0][0], float)
-
-
-@pytest.mark.parametrize(
-    "text_truncation",
-    [
-        "NONE",
-        "END",
-        "START",
-        "left",
-        "right",
-    ],
-)
-def test_embedding_text_truncation_error(
-    llama_stack_client, embedding_model_id, text_truncation, inference_provider_type
-):
-    if inference_provider_type not in SUPPORTED_PROVIDERS:
-        pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
-    error_type = ValueError if isinstance(llama_stack_client, LlamaStackAsLibraryClient) else LlamaStackBadRequestError
-    with pytest.raises(error_type):
-        llama_stack_client.inference.embeddings(
-            model_id=embedding_model_id,
-            contents=[DUMMY_STRING],
-            text_truncation=text_truncation,
-            **default_task_type(embedding_model_id),
-        )
diff --git a/tests/integration/inference/test_text_inference.py b/tests/integration/inference/test_text_inference.py
deleted file mode 100644
index d7ffe5929..000000000
--- 
a/tests/integration/inference/test_text_inference.py +++ /dev/null @@ -1,543 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - - -from time import sleep - -import pytest -from pydantic import BaseModel - -from llama_stack.models.llama.sku_list import resolve_model - -from ..test_cases.test_case import TestCase - -PROVIDER_LOGPROBS_TOP_K = {"remote::together", "remote::fireworks", "remote::vllm"} - - -def skip_if_model_doesnt_support_completion(client_with_models, model_id): - models = {m.identifier: m for m in client_with_models.models.list()} - models.update({m.provider_resource_id: m for m in client_with_models.models.list()}) - provider_id = models[model_id].provider_id - providers = {p.provider_id: p for p in client_with_models.providers.list()} - provider = providers[provider_id] - if ( - provider.provider_type - in ( - "remote::openai", - "remote::anthropic", - "remote::gemini", - "remote::vertexai", - "remote::groq", - "remote::sambanova", - ) - or "openai-compat" in provider.provider_type - ): - pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support completion") - - -def skip_if_model_doesnt_support_json_schema_structured_output(client_with_models, model_id): - models = {m.identifier: m for m in client_with_models.models.list()} - models.update({m.provider_resource_id: m for m in client_with_models.models.list()}) - provider_id = models[model_id].provider_id - providers = {p.provider_id: p for p in client_with_models.providers.list()} - provider = providers[provider_id] - if provider.provider_type in ("remote::sambanova",): - pytest.skip( - f"Model {model_id} hosted by {provider.provider_type} doesn't support json_schema structured output" - ) - - -def get_llama_model(client_with_models, model_id): - models = {} - for m in client_with_models.models.list(): - models[m.identifier] = m - models[m.provider_resource_id] = m - - assert model_id in models, f"Model {model_id} not found" - - model = models[model_id] - ids = (model.identifier, model.provider_resource_id) - for mid in ids: - if resolve_model(mid): - return mid - - return model.metadata.get("llama_model", None) - - -@pytest.mark.parametrize( - "test_case", - [ - "inference:completion:sanity", - ], -) -def test_text_completion_non_streaming(client_with_models, text_model_id, test_case): - skip_if_model_doesnt_support_completion(client_with_models, text_model_id) - tc = TestCase(test_case) - - response = client_with_models.inference.completion( - content=tc["content"], - stream=False, - model_id=text_model_id, - sampling_params={ - "max_tokens": 50, - }, - ) - assert len(response.content) > 10 - # assert "blue" in response.content.lower().strip() - - -@pytest.mark.parametrize( - "test_case", - [ - "inference:completion:sanity", - ], -) -def test_text_completion_streaming(client_with_models, text_model_id, test_case): - skip_if_model_doesnt_support_completion(client_with_models, text_model_id) - tc = TestCase(test_case) - - response = client_with_models.inference.completion( - content=tc["content"], - stream=True, - model_id=text_model_id, - sampling_params={ - "max_tokens": 50, - }, - ) - streamed_content = [chunk.delta for chunk in response] - content_str = "".join(streamed_content).lower().strip() - # assert "blue" in content_str - assert len(content_str) > 10 - - -@pytest.mark.parametrize( - "test_case", - [ - 
"inference:completion:stop_sequence", - ], -) -def test_text_completion_stop_sequence(client_with_models, text_model_id, inference_provider_type, test_case): - skip_if_model_doesnt_support_completion(client_with_models, text_model_id) - # This is only supported/tested for remote vLLM: https://github.com/meta-llama/llama-stack/issues/1771 - if inference_provider_type != "remote::vllm": - pytest.xfail(f"{inference_provider_type} doesn't support 'stop' parameter yet") - tc = TestCase(test_case) - - response = client_with_models.inference.completion( - content=tc["content"], - stream=True, - model_id=text_model_id, - sampling_params={ - "max_tokens": 50, - "stop": ["1963"], - }, - ) - streamed_content = [chunk.delta for chunk in response] - content_str = "".join(streamed_content).lower().strip() - assert "1963" not in content_str - - -@pytest.mark.parametrize( - "test_case", - [ - "inference:completion:log_probs", - ], -) -def test_text_completion_log_probs_non_streaming(client_with_models, text_model_id, inference_provider_type, test_case): - skip_if_model_doesnt_support_completion(client_with_models, text_model_id) - if inference_provider_type not in PROVIDER_LOGPROBS_TOP_K: - pytest.xfail(f"{inference_provider_type} doesn't support log probs yet") - - tc = TestCase(test_case) - - response = client_with_models.inference.completion( - content=tc["content"], - stream=False, - model_id=text_model_id, - sampling_params={ - "max_tokens": 5, - }, - logprobs={ - "top_k": 1, - }, - ) - assert response.logprobs, "Logprobs should not be empty" - assert 1 <= len(response.logprobs) <= 5 # each token has 1 logprob and here max_tokens=5 - assert all(len(logprob.logprobs_by_token) == 1 for logprob in response.logprobs) - - -@pytest.mark.parametrize( - "test_case", - [ - "inference:completion:log_probs", - ], -) -def test_text_completion_log_probs_streaming(client_with_models, text_model_id, inference_provider_type, test_case): - skip_if_model_doesnt_support_completion(client_with_models, text_model_id) - if inference_provider_type not in PROVIDER_LOGPROBS_TOP_K: - pytest.xfail(f"{inference_provider_type} doesn't support log probs yet") - - tc = TestCase(test_case) - - response = client_with_models.inference.completion( - content=tc["content"], - stream=True, - model_id=text_model_id, - sampling_params={ - "max_tokens": 5, - }, - logprobs={ - "top_k": 1, - }, - ) - streamed_content = list(response) - for chunk in streamed_content: - if chunk.delta: # if there's a token, we expect logprobs - assert chunk.logprobs, "Logprobs should not be empty" - assert all(len(logprob.logprobs_by_token) == 1 for logprob in chunk.logprobs) - else: # no token, no logprobs - assert not chunk.logprobs, "Logprobs should be empty" - - -@pytest.mark.parametrize( - "test_case", - [ - "inference:completion:structured_output", - ], -) -def test_text_completion_structured_output(client_with_models, text_model_id, test_case): - skip_if_model_doesnt_support_completion(client_with_models, text_model_id) - - class AnswerFormat(BaseModel): - name: str - year_born: str - year_retired: str - - tc = TestCase(test_case) - - user_input = tc["user_input"] - response = client_with_models.inference.completion( - model_id=text_model_id, - content=user_input, - stream=False, - sampling_params={ - "max_tokens": 50, - }, - response_format={ - "type": "json_schema", - "json_schema": AnswerFormat.model_json_schema(), - }, - ) - answer = AnswerFormat.model_validate_json(response.content) - expected = tc["expected"] - assert answer.name == 
expected["name"]
-    assert answer.year_born == expected["year_born"]
-    assert answer.year_retired == expected["year_retired"]
-
-
-@pytest.mark.parametrize(
-    "test_case",
-    [
-        "inference:chat_completion:non_streaming_01",
-        "inference:chat_completion:non_streaming_02",
-    ],
-)
-def test_text_chat_completion_non_streaming(client_with_models, text_model_id, test_case):
-    tc = TestCase(test_case)
-    question = tc["question"]
-    expected = tc["expected"]
-
-    response = client_with_models.inference.chat_completion(
-        model_id=text_model_id,
-        messages=[
-            {
-                "role": "user",
-                "content": question,
-            }
-        ],
-        stream=False,
-    )
-    message_content = response.completion_message.content.lower().strip()
-    assert len(message_content) > 0
-    assert expected.lower() in message_content
-
-
-@pytest.mark.parametrize(
-    "test_case",
-    [
-        "inference:chat_completion:streaming_01",
-        "inference:chat_completion:streaming_02",
-    ],
-)
-def test_text_chat_completion_streaming(client_with_models, text_model_id, test_case):
-    tc = TestCase(test_case)
-    question = tc["question"]
-    expected = tc["expected"]
-
-    response = client_with_models.inference.chat_completion(
-        model_id=text_model_id,
-        messages=[{"role": "user", "content": question}],
-        stream=True,
-        timeout=120,  # increase timeout to 2 minutes for large conversation history
-    )
-    streamed_content = [str(chunk.event.delta.text.lower().strip()) for chunk in response]
-    assert len(streamed_content) > 0
-    assert expected.lower() in "".join(streamed_content)
-
-
-@pytest.mark.parametrize(
-    "test_case",
-    [
-        "inference:chat_completion:tool_calling",
-    ],
-)
-def test_text_chat_completion_with_tool_calling_and_non_streaming(client_with_models, text_model_id, test_case):
-    tc = TestCase(test_case)
-
-    response = client_with_models.inference.chat_completion(
-        model_id=text_model_id,
-        messages=tc["messages"],
-        tools=tc["tools"],
-        tool_choice="auto",
-        stream=False,
-    )
-    # some models can return content for the response in addition to the tool call
-    assert response.completion_message.role == "assistant"
-
-    assert len(response.completion_message.tool_calls) == 1
-    assert response.completion_message.tool_calls[0].tool_name == tc["tools"][0]["tool_name"]
-    assert response.completion_message.tool_calls[0].arguments == tc["expected"]
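For callers migrating off /v1/inference/chat-completion, the same tool-calling round trip can be expressed with the standard OpenAI client. A sketch under assumptions: the base_url path, the model id, and the get_weather tool definition are illustrative placeholders, not taken from this patch.

import json

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")  # assumed route

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",  # hypothetical tool, mirroring the test fixtures
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {"location": {"type": "string"}},
                "required": ["location"],
            },
        },
    }
]

resp = client.chat.completions.create(
    model="llama3.2:3b",  # placeholder model id
    messages=[{"role": "user", "content": "What's the weather in San Francisco, CA?"}],
    tools=tools,
    tool_choice="auto",
)
# Assuming the model chose the tool; arguments arrive as a JSON string.
call = resp.choices[0].message.tool_calls[0]
print(call.function.name, json.loads(call.function.arguments))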
"[get_weather, {'location': 'San Francisco, CA'}]" -def extract_tool_invocation_content(response): - tool_invocation_content: str = "" - for chunk in response: - delta = chunk.event.delta - if delta.type == "tool_call" and delta.parse_status == "succeeded": - call = delta.tool_call - tool_invocation_content += f"[{call.tool_name}, {call.arguments}]" - return tool_invocation_content - - -@pytest.mark.parametrize( - "test_case", - [ - "inference:chat_completion:tool_calling", - ], -) -def test_text_chat_completion_with_tool_calling_and_streaming(client_with_models, text_model_id, test_case): - tc = TestCase(test_case) - - response = client_with_models.inference.chat_completion( - model_id=text_model_id, - messages=tc["messages"], - tools=tc["tools"], - tool_choice="auto", - stream=True, - ) - tool_invocation_content = extract_tool_invocation_content(response) - expected_tool_name = tc["tools"][0]["tool_name"] - expected_argument = tc["expected"] - assert tool_invocation_content == f"[{expected_tool_name}, {expected_argument}]" - - -@pytest.mark.parametrize( - "test_case", - [ - "inference:chat_completion:tool_calling", - ], -) -def test_text_chat_completion_with_tool_choice_required(client_with_models, text_model_id, test_case): - tc = TestCase(test_case) - - response = client_with_models.inference.chat_completion( - model_id=text_model_id, - messages=tc["messages"], - tools=tc["tools"], - tool_config={ - "tool_choice": "required", - }, - stream=True, - ) - tool_invocation_content = extract_tool_invocation_content(response) - expected_tool_name = tc["tools"][0]["tool_name"] - expected_argument = tc["expected"] - assert tool_invocation_content == f"[{expected_tool_name}, {expected_argument}]" - - -@pytest.mark.parametrize( - "test_case", - [ - "inference:chat_completion:tool_calling", - ], -) -def test_text_chat_completion_with_tool_choice_none(client_with_models, text_model_id, test_case): - tc = TestCase(test_case) - - response = client_with_models.inference.chat_completion( - model_id=text_model_id, - messages=tc["messages"], - tools=tc["tools"], - tool_config={"tool_choice": "none"}, - stream=True, - ) - tool_invocation_content = extract_tool_invocation_content(response) - assert tool_invocation_content == "" - - -@pytest.mark.parametrize( - "test_case", - [ - "inference:chat_completion:structured_output", - ], -) -def test_text_chat_completion_structured_output(client_with_models, text_model_id, test_case): - skip_if_model_doesnt_support_json_schema_structured_output(client_with_models, text_model_id) - - class NBAStats(BaseModel): - year_for_draft: int - num_seasons_in_nba: int - - class AnswerFormat(BaseModel): - first_name: str - last_name: str - year_of_birth: int - nba_stats: NBAStats - - tc = TestCase(test_case) - - response = client_with_models.inference.chat_completion( - model_id=text_model_id, - messages=tc["messages"], - response_format={ - "type": "json_schema", - "json_schema": AnswerFormat.model_json_schema(), - }, - stream=False, - ) - answer = AnswerFormat.model_validate_json(response.completion_message.content) - expected = tc["expected"] - assert answer.first_name == expected["first_name"] - assert answer.last_name == expected["last_name"] - assert answer.year_of_birth == expected["year_of_birth"] - assert answer.nba_stats.num_seasons_in_nba == expected["num_seasons_in_nba"] - assert answer.nba_stats.year_for_draft == expected["year_for_draft"] - - -@pytest.mark.parametrize("streaming", [True, False]) -@pytest.mark.parametrize( - "test_case", - [ - 
"inference:chat_completion:tool_calling_tools_absent", - ], -) -def test_text_chat_completion_tool_calling_tools_not_in_request( - client_with_models, text_model_id, test_case, streaming -): - tc = TestCase(test_case) - - # TODO: more dynamic lookup on tool_prompt_format for model family - tool_prompt_format = "json" if "3.1" in text_model_id else "python_list" - request = { - "model_id": text_model_id, - "messages": tc["messages"], - "tools": tc["tools"], - "tool_choice": "auto", - "tool_prompt_format": tool_prompt_format, - "stream": streaming, - } - - response = client_with_models.inference.chat_completion(**request) - - if streaming: - for chunk in response: - delta = chunk.event.delta - if delta.type == "tool_call" and delta.parse_status == "succeeded": - assert delta.tool_call.tool_name == "get_object_namespace_list" - if delta.type == "tool_call" and delta.parse_status == "failed": - # expect raw message that failed to parse in tool_call - assert isinstance(delta.tool_call, str) - assert len(delta.tool_call) > 0 - else: - for tc in response.completion_message.tool_calls: - assert tc.tool_name == "get_object_namespace_list" - - -@pytest.mark.parametrize( - "test_case", - [ - # Tests if the model can handle simple messages like "Hi" or - # a message unrelated to one of the tool calls - "inference:chat_completion:text_then_tool", - # Tests if the model can do full tool call with responses correctly - "inference:chat_completion:tool_then_answer", - # Tests if model can generate multiple params and - # read outputs correctly - "inference:chat_completion:array_parameter", - ], -) -def test_text_chat_completion_with_multi_turn_tool_calling(client_with_models, text_model_id, test_case): - """This test tests the model's tool calling loop in various scenarios""" - if "llama-4" not in text_model_id.lower() and "llama4" not in text_model_id.lower(): - pytest.xfail("Not tested for non-llama4 models yet") - - tc = TestCase(test_case) - messages = [] - - # keep going until either - # 1. we have messages to test in multi-turn - # 2. no messages bust last message is tool response - while len(tc["messages"]) > 0 or (len(messages) > 0 and messages[-1]["role"] == "tool"): - # do not take new messages if last message is tool response - if len(messages) == 0 or messages[-1]["role"] != "tool": - new_messages = tc["messages"].pop(0) - messages += new_messages - - # pprint(messages) - response = client_with_models.inference.chat_completion( - model_id=text_model_id, - messages=messages, - tools=tc["tools"], - stream=False, - sampling_params={ - "strategy": { - "type": "top_p", - "top_p": 0.9, - "temperature": 0.6, - } - }, - ) - op_msg = response.completion_message - messages.append(op_msg.model_dump()) - # print(op_msg) - - assert op_msg.role == "assistant" - expected = tc["expected"].pop(0) - assert len(op_msg.tool_calls) == expected["num_tool_calls"] - - if expected["num_tool_calls"] > 0: - assert op_msg.tool_calls[0].tool_name == expected["tool_name"] - assert op_msg.tool_calls[0].arguments == expected["tool_arguments"] - - tool_response = tc["tool_responses"].pop(0) - messages.append( - # Tool Response Message - { - "role": "tool", - "call_id": op_msg.tool_calls[0].call_id, - "content": tool_response["response"], - } - ) - else: - actual_answer = op_msg.content.lower() - # pprint(actual_answer) - assert expected["answer"] in actual_answer - - # sleep to avoid rate limit - sleep(1)