Merge b1cbfe99f9 into sapling-pr-archive-ehhuang

ehhuang 2025-09-29 15:52:57 -07:00 committed by GitHub
commit 91898e6598
81 changed files with 51742 additions and 2402 deletions

View file

@ -43,7 +43,7 @@ jobs:
# Cache oasdiff to avoid checksum failures and speed up builds
- name: Cache oasdiff
id: cache-oasdiff
uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809
uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830
with:
path: ~/oasdiff
key: oasdiff-${{ runner.os }}

View file

@ -4,6 +4,8 @@ include llama_stack/models/llama/llama4/tokenizer.model
include llama_stack/core/*.sh
include llama_stack/cli/scripts/*.sh
include llama_stack/distributions/*/*.yaml
include llama_stack/providers/tests/test_cases/inference/*.json
exclude llama_stack/distributions/ci-tests
include tests/integration/test_cases/inference/*.json
include llama_stack/models/llama/*/*.md
include llama_stack/tests/integration/*.jpg
prune llama_stack/distributions/ci-tests

View file

@ -139,18 +139,7 @@ Methods:
- <code title="post /v1/agents/{agent_id}/session/{session_id}/turn">client.agents.turn.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/agents/turn.py">create</a>(session_id, \*, agent_id, \*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/agents/turn_create_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/agents/turn_create_response.py">TurnCreateResponse</a></code>
- <code title="get /v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}">client.agents.turn.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/agents/turn.py">retrieve</a>(turn_id, \*, agent_id, session_id) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/agents/turn.py">Turn</a></code>
## BatchInference
Types:
```python
from llama_stack_client.types import BatchInferenceChatCompletionResponse
```
Methods:
- <code title="post /v1/batch-inference/chat-completion">client.batch_inference.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/batch_inference.py">chat_completion</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/batch_inference_chat_completion_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/batch_inference_chat_completion_response.py">BatchInferenceChatCompletionResponse</a></code>
- <code title="post /v1/batch-inference/completion">client.batch_inference.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/batch_inference.py">completion</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/batch_inference_completion_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/shared/batch_completion.py">BatchCompletion</a></code>
## Datasets

View file

@ -548,7 +548,6 @@ class Generator:
if op.defining_class.__name__ in [
"SyntheticDataGeneration",
"PostTraining",
"BatchInference",
]:
op.defining_class.__name__ = f"{op.defining_class.__name__} (Coming Soon)"
print(op.defining_class.__name__)

View file

@ -87,94 +87,6 @@
}
}
},
"/v1/inference/batch-chat-completion": {
"post": {
"responses": {
"200": {
"description": "A BatchChatCompletionResponse with the full completions.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/BatchChatCompletionResponse"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Inference"
],
"summary": "Generate chat completions for a batch of messages using the specified model.",
"description": "Generate chat completions for a batch of messages using the specified model.",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/BatchChatCompletionRequest"
}
}
},
"required": true
}
}
},
"/v1/inference/batch-completion": {
"post": {
"responses": {
"200": {
"description": "A BatchCompletionResponse with the full completions.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/BatchCompletionResponse"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Inference"
],
"summary": "Generate completions for a batch of content using the specified model.",
"description": "Generate completions for a batch of content using the specified model.",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/BatchCompletionRequest"
}
}
},
"required": true
}
}
},
"/v1alpha/post-training/job/cancel": {
"post": {
"responses": {
@ -281,7 +193,7 @@
}
},
"tags": [
"BatchInference (Coming Soon)"
"Inference"
],
"summary": "Generate a chat completion for the given messages using the specified model.",
"description": "Generate a chat completion for the given messages using the specified model.",
@ -298,55 +210,6 @@
}
}
},
"/v1/inference/completion": {
"post": {
"responses": {
"200": {
"description": "If stream=False, returns a CompletionResponse with the full completion. If stream=True, returns an SSE event stream of CompletionResponseStreamChunk.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/CompletionResponse"
}
},
"text/event-stream": {
"schema": {
"$ref": "#/components/schemas/CompletionResponseStreamChunk"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"BatchInference (Coming Soon)"
],
"summary": "Generate a completion for the given content using the specified model.",
"description": "Generate a completion for the given content using the specified model.",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/CompletionRequest"
}
}
},
"required": true
}
}
},
"/v1/agents": {
"get": {
"responses": {
@ -6346,6 +6209,20 @@
],
"title": "AppendRowsRequest"
},
"CancelTrainingJobRequest": {
"type": "object",
"properties": {
"job_uuid": {
"type": "string",
"description": "The UUID of the job to cancel."
}
},
"additionalProperties": false,
"required": [
"job_uuid"
],
"title": "CancelTrainingJobRequest"
},
"CompletionMessage": {
"type": "object",
"properties": {
@ -6906,6 +6783,31 @@
"type": "boolean",
"default": true
},
"items": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
},
"title": {
"type": "string"
},
"default": {
"oneOf": [
{
@ -7051,26 +6953,23 @@
"title": "UserMessage",
"description": "A message from the user in a chat conversation."
},
"BatchChatCompletionRequest": {
"ChatCompletionRequest": {
"type": "object",
"properties": {
"model_id": {
"type": "string",
"description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
},
"messages_batch": {
"messages": {
"type": "array",
"items": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Message"
}
"$ref": "#/components/schemas/Message"
},
"description": "The messages to generate completions for."
"description": "List of messages in the conversation."
},
"sampling_params": {
"$ref": "#/components/schemas/SamplingParams",
"description": "(Optional) Parameters to control the sampling strategy."
"description": "Parameters to control the sampling strategy."
},
"tools": {
"type": "array",
@ -7079,13 +6978,31 @@
},
"description": "(Optional) List of tool definitions available to the model."
},
"tool_config": {
"$ref": "#/components/schemas/ToolConfig",
"description": "(Optional) Configuration for tool use."
"tool_choice": {
"type": "string",
"enum": [
"auto",
"required",
"none"
],
"description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. .. deprecated:: Use tool_config instead."
},
"tool_prompt_format": {
"type": "string",
"enum": [
"json",
"function_tag",
"python_list"
],
"description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls. .. deprecated:: Use tool_config instead."
},
"response_format": {
"$ref": "#/components/schemas/ResponseFormat",
"description": "(Optional) Grammar specification for guided (structured) decoding."
"description": "(Optional) Grammar specification for guided (structured) decoding. There are two options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it."
},
"stream": {
"type": "boolean",
"description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
},
"logprobs": {
"type": "object",
@ -7098,32 +7015,18 @@
},
"additionalProperties": false,
"description": "(Optional) If specified, log probabilities for each token position will be returned."
},
"tool_config": {
"$ref": "#/components/schemas/ToolConfig",
"description": "(Optional) Configuration for tool use."
}
},
"additionalProperties": false,
"required": [
"model_id",
"messages_batch"
"messages"
],
"title": "BatchChatCompletionRequest"
},
"BatchChatCompletionResponse": {
"type": "object",
"properties": {
"batch": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ChatCompletionResponse"
},
"description": "List of chat completion responses, one for each conversation in the batch"
}
},
"additionalProperties": false,
"required": [
"batch"
],
"title": "BatchChatCompletionResponse",
"description": "Response from a batch chat completion request."
"title": "ChatCompletionRequest"
},
"ChatCompletionResponse": {
"type": "object",
@ -7203,194 +7106,6 @@
"title": "TokenLogProbs",
"description": "Log probabilities for generated tokens."
},
"BatchCompletionRequest": {
"type": "object",
"properties": {
"model_id": {
"type": "string",
"description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
},
"content_batch": {
"type": "array",
"items": {
"$ref": "#/components/schemas/InterleavedContent"
},
"description": "The content to generate completions for."
},
"sampling_params": {
"$ref": "#/components/schemas/SamplingParams",
"description": "(Optional) Parameters to control the sampling strategy."
},
"response_format": {
"$ref": "#/components/schemas/ResponseFormat",
"description": "(Optional) Grammar specification for guided (structured) decoding."
},
"logprobs": {
"type": "object",
"properties": {
"top_k": {
"type": "integer",
"default": 0,
"description": "How many tokens (for each position) to return log probabilities for."
}
},
"additionalProperties": false,
"description": "(Optional) If specified, log probabilities for each token position will be returned."
}
},
"additionalProperties": false,
"required": [
"model_id",
"content_batch"
],
"title": "BatchCompletionRequest"
},
"BatchCompletionResponse": {
"type": "object",
"properties": {
"batch": {
"type": "array",
"items": {
"$ref": "#/components/schemas/CompletionResponse"
},
"description": "List of completion responses, one for each input in the batch"
}
},
"additionalProperties": false,
"required": [
"batch"
],
"title": "BatchCompletionResponse",
"description": "Response from a batch completion request."
},
"CompletionResponse": {
"type": "object",
"properties": {
"metrics": {
"type": "array",
"items": {
"$ref": "#/components/schemas/MetricInResponse"
},
"description": "(Optional) List of metrics associated with the API response"
},
"content": {
"type": "string",
"description": "The generated completion text"
},
"stop_reason": {
"type": "string",
"enum": [
"end_of_turn",
"end_of_message",
"out_of_tokens"
],
"description": "Reason why generation stopped"
},
"logprobs": {
"type": "array",
"items": {
"$ref": "#/components/schemas/TokenLogProbs"
},
"description": "Optional log probabilities for generated tokens"
}
},
"additionalProperties": false,
"required": [
"content",
"stop_reason"
],
"title": "CompletionResponse",
"description": "Response from a completion request."
},
"CancelTrainingJobRequest": {
"type": "object",
"properties": {
"job_uuid": {
"type": "string",
"description": "The UUID of the job to cancel."
}
},
"additionalProperties": false,
"required": [
"job_uuid"
],
"title": "CancelTrainingJobRequest"
},
"ChatCompletionRequest": {
"type": "object",
"properties": {
"model_id": {
"type": "string",
"description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
},
"messages": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Message"
},
"description": "List of messages in the conversation."
},
"sampling_params": {
"$ref": "#/components/schemas/SamplingParams",
"description": "Parameters to control the sampling strategy."
},
"tools": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ToolDefinition"
},
"description": "(Optional) List of tool definitions available to the model."
},
"tool_choice": {
"type": "string",
"enum": [
"auto",
"required",
"none"
],
"description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. .. deprecated:: Use tool_config instead."
},
"tool_prompt_format": {
"type": "string",
"enum": [
"json",
"function_tag",
"python_list"
],
"description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls. .. deprecated:: Use tool_config instead."
},
"response_format": {
"$ref": "#/components/schemas/ResponseFormat",
"description": "(Optional) Grammar specification for guided (structured) decoding. There are two options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it."
},
"stream": {
"type": "boolean",
"description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
},
"logprobs": {
"type": "object",
"properties": {
"top_k": {
"type": "integer",
"default": 0,
"description": "How many tokens (for each position) to return log probabilities for."
}
},
"additionalProperties": false,
"description": "(Optional) If specified, log probabilities for each token position will be returned."
},
"tool_config": {
"$ref": "#/components/schemas/ToolConfig",
"description": "(Optional) Configuration for tool use."
}
},
"additionalProperties": false,
"required": [
"model_id",
"messages"
],
"title": "ChatCompletionRequest"
},
"ChatCompletionResponseEvent": {
"type": "object",
"properties": {
@ -7560,87 +7275,6 @@
"title": "ToolCallDelta",
"description": "A tool call content delta for streaming responses."
},
"CompletionRequest": {
"type": "object",
"properties": {
"model_id": {
"type": "string",
"description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
},
"content": {
"$ref": "#/components/schemas/InterleavedContent",
"description": "The content to generate a completion for."
},
"sampling_params": {
"$ref": "#/components/schemas/SamplingParams",
"description": "(Optional) Parameters to control the sampling strategy."
},
"response_format": {
"$ref": "#/components/schemas/ResponseFormat",
"description": "(Optional) Grammar specification for guided (structured) decoding."
},
"stream": {
"type": "boolean",
"description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
},
"logprobs": {
"type": "object",
"properties": {
"top_k": {
"type": "integer",
"default": 0,
"description": "How many tokens (for each position) to return log probabilities for."
}
},
"additionalProperties": false,
"description": "(Optional) If specified, log probabilities for each token position will be returned."
}
},
"additionalProperties": false,
"required": [
"model_id",
"content"
],
"title": "CompletionRequest"
},
"CompletionResponseStreamChunk": {
"type": "object",
"properties": {
"metrics": {
"type": "array",
"items": {
"$ref": "#/components/schemas/MetricInResponse"
},
"description": "(Optional) List of metrics associated with the API response"
},
"delta": {
"type": "string",
"description": "New content generated since last chunk. This can be one or more tokens."
},
"stop_reason": {
"type": "string",
"enum": [
"end_of_turn",
"end_of_message",
"out_of_tokens"
],
"description": "Optional reason why generation stopped, if complete"
},
"logprobs": {
"type": "array",
"items": {
"$ref": "#/components/schemas/TokenLogProbs"
},
"description": "Optional log probabilities for generated tokens"
}
},
"additionalProperties": false,
"required": [
"delta"
],
"title": "CompletionResponseStreamChunk",
"description": "A chunk of a streamed completion response."
},
"AgentConfig": {
"type": "object",
"properties": {
@ -7848,6 +7482,14 @@
"default": true,
"description": "Whether this parameter is required for tool invocation"
},
"items": {
"type": "object",
"description": "Type of the elements when parameter_type is array"
},
"title": {
"type": "string",
"description": "(Optional) Title of the parameter"
},
"default": {
"oneOf": [
{
@ -18779,11 +18421,6 @@
"description": "Main functionalities provided by this API:\n- Create agents with specific instructions and ability to use tools.\n- Interactions with agents are grouped into sessions (\"threads\"), and each interaction is called a \"turn\".\n- Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).\n- Agents can be provided with various shields (see the Safety API for more details).\n- Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details.",
"x-displayName": "Agents API for creating and interacting with agentic systems."
},
{
"name": "BatchInference (Coming Soon)",
"description": "This is an asynchronous API. If the request is successful, the response will be a job which can be polled for completion.\n\nNOTE: This API is not yet implemented and is subject to change in concert with other asynchronous APIs\nincluding (post-training, evals, etc).",
"x-displayName": "Batch inference API for generating completions and chat completions."
},
{
"name": "Benchmarks"
},
@ -18858,7 +18495,6 @@
"name": "Operations",
"tags": [
"Agents",
"BatchInference (Coming Soon)",
"Benchmarks",
"DatasetIO",
"Datasets",

View file

@ -43,72 +43,6 @@ paths:
schema:
$ref: '#/components/schemas/AppendRowsRequest'
required: true
/v1/inference/batch-chat-completion:
post:
responses:
'200':
description: >-
A BatchChatCompletionResponse with the full completions.
content:
application/json:
schema:
$ref: '#/components/schemas/BatchChatCompletionResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: >-
Generate chat completions for a batch of messages using the specified model.
description: >-
Generate chat completions for a batch of messages using the specified model.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/BatchChatCompletionRequest'
required: true
/v1/inference/batch-completion:
post:
responses:
'200':
description: >-
A BatchCompletionResponse with the full completions.
content:
application/json:
schema:
$ref: '#/components/schemas/BatchCompletionResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: >-
Generate completions for a batch of content using the specified model.
description: >-
Generate completions for a batch of content using the specified model.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/BatchCompletionRequest'
required: true
/v1alpha/post-training/job/cancel:
post:
responses:
@ -186,7 +120,7 @@ paths:
default:
$ref: '#/components/responses/DefaultError'
tags:
- BatchInference (Coming Soon)
- Inference
summary: >-
Generate a chat completion for the given messages using the specified model.
description: >-
@ -198,43 +132,6 @@ paths:
schema:
$ref: '#/components/schemas/ChatCompletionRequest'
required: true
/v1/inference/completion:
post:
responses:
'200':
description: >-
If stream=False, returns a CompletionResponse with the full completion.
If stream=True, returns an SSE event stream of CompletionResponseStreamChunk.
content:
application/json:
schema:
$ref: '#/components/schemas/CompletionResponse'
text/event-stream:
schema:
$ref: '#/components/schemas/CompletionResponseStreamChunk'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- BatchInference (Coming Soon)
summary: >-
Generate a completion for the given content using the specified model.
description: >-
Generate a completion for the given content using the specified model.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/CompletionRequest'
required: true
/v1/agents:
get:
responses:
@ -4559,6 +4456,16 @@ components:
required:
- rows
title: AppendRowsRequest
CancelTrainingJobRequest:
type: object
properties:
job_uuid:
type: string
description: The UUID of the job to cancel.
additionalProperties: false
required:
- job_uuid
title: CancelTrainingJobRequest
CompletionMessage:
type: object
properties:
@ -4959,6 +4866,16 @@ components:
required:
type: boolean
default: true
items:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
title:
type: string
default:
oneOf:
- type: 'null'
@ -5076,224 +4993,6 @@ components:
title: UserMessage
description: >-
A message from the user in a chat conversation.
BatchChatCompletionRequest:
type: object
properties:
model_id:
type: string
description: >-
The identifier of the model to use. The model must be registered with
Llama Stack and available via the /models endpoint.
messages_batch:
type: array
items:
type: array
items:
$ref: '#/components/schemas/Message'
description: >-
The messages to generate completions for.
sampling_params:
$ref: '#/components/schemas/SamplingParams'
description: >-
(Optional) Parameters to control the sampling strategy.
tools:
type: array
items:
$ref: '#/components/schemas/ToolDefinition'
description: >-
(Optional) List of tool definitions available to the model.
tool_config:
$ref: '#/components/schemas/ToolConfig'
description: (Optional) Configuration for tool use.
response_format:
$ref: '#/components/schemas/ResponseFormat'
description: >-
(Optional) Grammar specification for guided (structured) decoding.
logprobs:
type: object
properties:
top_k:
type: integer
default: 0
description: >-
How many tokens (for each position) to return log probabilities for.
additionalProperties: false
description: >-
(Optional) If specified, log probabilities for each token position will
be returned.
additionalProperties: false
required:
- model_id
- messages_batch
title: BatchChatCompletionRequest
BatchChatCompletionResponse:
type: object
properties:
batch:
type: array
items:
$ref: '#/components/schemas/ChatCompletionResponse'
description: >-
List of chat completion responses, one for each conversation in the batch
additionalProperties: false
required:
- batch
title: BatchChatCompletionResponse
description: >-
Response from a batch chat completion request.
ChatCompletionResponse:
type: object
properties:
metrics:
type: array
items:
$ref: '#/components/schemas/MetricInResponse'
description: >-
(Optional) List of metrics associated with the API response
completion_message:
$ref: '#/components/schemas/CompletionMessage'
description: The complete response message
logprobs:
type: array
items:
$ref: '#/components/schemas/TokenLogProbs'
description: >-
Optional log probabilities for generated tokens
additionalProperties: false
required:
- completion_message
title: ChatCompletionResponse
description: Response from a chat completion request.
MetricInResponse:
type: object
properties:
metric:
type: string
description: The name of the metric
value:
oneOf:
- type: integer
- type: number
description: The numeric value of the metric
unit:
type: string
description: >-
(Optional) The unit of measurement for the metric value
additionalProperties: false
required:
- metric
- value
title: MetricInResponse
description: >-
A metric value included in API responses.
TokenLogProbs:
type: object
properties:
logprobs_by_token:
type: object
additionalProperties:
type: number
description: >-
Dictionary mapping tokens to their log probabilities
additionalProperties: false
required:
- logprobs_by_token
title: TokenLogProbs
description: Log probabilities for generated tokens.
BatchCompletionRequest:
type: object
properties:
model_id:
type: string
description: >-
The identifier of the model to use. The model must be registered with
Llama Stack and available via the /models endpoint.
content_batch:
type: array
items:
$ref: '#/components/schemas/InterleavedContent'
description: The content to generate completions for.
sampling_params:
$ref: '#/components/schemas/SamplingParams'
description: >-
(Optional) Parameters to control the sampling strategy.
response_format:
$ref: '#/components/schemas/ResponseFormat'
description: >-
(Optional) Grammar specification for guided (structured) decoding.
logprobs:
type: object
properties:
top_k:
type: integer
default: 0
description: >-
How many tokens (for each position) to return log probabilities for.
additionalProperties: false
description: >-
(Optional) If specified, log probabilities for each token position will
be returned.
additionalProperties: false
required:
- model_id
- content_batch
title: BatchCompletionRequest
BatchCompletionResponse:
type: object
properties:
batch:
type: array
items:
$ref: '#/components/schemas/CompletionResponse'
description: >-
List of completion responses, one for each input in the batch
additionalProperties: false
required:
- batch
title: BatchCompletionResponse
description: >-
Response from a batch completion request.
CompletionResponse:
type: object
properties:
metrics:
type: array
items:
$ref: '#/components/schemas/MetricInResponse'
description: >-
(Optional) List of metrics associated with the API response
content:
type: string
description: The generated completion text
stop_reason:
type: string
enum:
- end_of_turn
- end_of_message
- out_of_tokens
description: Reason why generation stopped
logprobs:
type: array
items:
$ref: '#/components/schemas/TokenLogProbs'
description: >-
Optional log probabilities for generated tokens
additionalProperties: false
required:
- content
- stop_reason
title: CompletionResponse
description: Response from a completion request.
CancelTrainingJobRequest:
type: object
properties:
job_uuid:
type: string
description: The UUID of the job to cancel.
additionalProperties: false
required:
- job_uuid
title: CancelTrainingJobRequest
ChatCompletionRequest:
type: object
properties:
@ -5372,6 +5071,65 @@ components:
- model_id
- messages
title: ChatCompletionRequest
ChatCompletionResponse:
type: object
properties:
metrics:
type: array
items:
$ref: '#/components/schemas/MetricInResponse'
description: >-
(Optional) List of metrics associated with the API response
completion_message:
$ref: '#/components/schemas/CompletionMessage'
description: The complete response message
logprobs:
type: array
items:
$ref: '#/components/schemas/TokenLogProbs'
description: >-
Optional log probabilities for generated tokens
additionalProperties: false
required:
- completion_message
title: ChatCompletionResponse
description: Response from a chat completion request.
MetricInResponse:
type: object
properties:
metric:
type: string
description: The name of the metric
value:
oneOf:
- type: integer
- type: number
description: The numeric value of the metric
unit:
type: string
description: >-
(Optional) The unit of measurement for the metric value
additionalProperties: false
required:
- metric
- value
title: MetricInResponse
description: >-
A metric value included in API responses.
TokenLogProbs:
type: object
properties:
logprobs_by_token:
type: object
additionalProperties:
type: number
description: >-
Dictionary mapping tokens to their log probabilities
additionalProperties: false
required:
- logprobs_by_token
title: TokenLogProbs
description: Log probabilities for generated tokens.
ChatCompletionResponseEvent:
type: object
properties:
@ -5507,81 +5265,6 @@ components:
title: ToolCallDelta
description: >-
A tool call content delta for streaming responses.
CompletionRequest:
type: object
properties:
model_id:
type: string
description: >-
The identifier of the model to use. The model must be registered with
Llama Stack and available via the /models endpoint.
content:
$ref: '#/components/schemas/InterleavedContent'
description: >-
The content to generate a completion for.
sampling_params:
$ref: '#/components/schemas/SamplingParams'
description: >-
(Optional) Parameters to control the sampling strategy.
response_format:
$ref: '#/components/schemas/ResponseFormat'
description: >-
(Optional) Grammar specification for guided (structured) decoding.
stream:
type: boolean
description: >-
(Optional) If True, generate an SSE event stream of the response. Defaults
to False.
logprobs:
type: object
properties:
top_k:
type: integer
default: 0
description: >-
How many tokens (for each position) to return log probabilities for.
additionalProperties: false
description: >-
(Optional) If specified, log probabilities for each token position will
be returned.
additionalProperties: false
required:
- model_id
- content
title: CompletionRequest
CompletionResponseStreamChunk:
type: object
properties:
metrics:
type: array
items:
$ref: '#/components/schemas/MetricInResponse'
description: >-
(Optional) List of metrics associated with the API response
delta:
type: string
description: >-
New content generated since last chunk. This can be one or more tokens.
stop_reason:
type: string
enum:
- end_of_turn
- end_of_message
- out_of_tokens
description: >-
Optional reason why generation stopped, if complete
logprobs:
type: array
items:
$ref: '#/components/schemas/TokenLogProbs'
description: >-
Optional log probabilities for generated tokens
additionalProperties: false
required:
- delta
title: CompletionResponseStreamChunk
description: >-
A chunk of a streamed completion response.
AgentConfig:
type: object
properties:
@ -5730,6 +5413,13 @@ components:
default: true
description: >-
Whether this parameter is required for tool invocation
items:
type: object
description: >-
Type of the elements when parameter_type is array
title:
type: string
description: (Optional) Title of the parameter
default:
oneOf:
- type: 'null'
@ -13983,18 +13673,6 @@ tags:
the RAG Tool and Vector IO APIs for more details.
x-displayName: >-
Agents API for creating and interacting with agentic systems.
- name: BatchInference (Coming Soon)
description: >-
This is an asynchronous API. If the request is successful, the response will
be a job which can be polled for completion.
NOTE: This API is not yet implemented and is subject to change in concert with
other asynchronous APIs
including (post-training, evals, etc).
x-displayName: >-
Batch inference API for generating completions and chat completions.
- name: Benchmarks
- name: DatasetIO
- name: Datasets
@ -14037,7 +13715,6 @@ x-tagGroups:
- name: Operations
tags:
- Agents
- BatchInference (Coming Soon)
- Benchmarks
- DatasetIO
- Datasets

View file

@ -1,79 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Protocol, runtime_checkable
from llama_stack.apis.common.job_types import Job
from llama_stack.apis.inference import (
InterleavedContent,
LogProbConfig,
Message,
ResponseFormat,
SamplingParams,
ToolChoice,
ToolDefinition,
ToolPromptFormat,
)
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.schema_utils import webmethod
@runtime_checkable
class BatchInference(Protocol):
"""Batch inference API for generating completions and chat completions.
This is an asynchronous API. If the request is successful, the response will be a job which can be polled for completion.
NOTE: This API is not yet implemented and is subject to change in concert with other asynchronous APIs
including (post-training, evals, etc).
"""
@webmethod(route="/batch-inference/completion", method="POST", level=LLAMA_STACK_API_V1)
async def completion(
self,
model: str,
content_batch: list[InterleavedContent],
sampling_params: SamplingParams | None = None,
response_format: ResponseFormat | None = None,
logprobs: LogProbConfig | None = None,
) -> Job:
"""Generate completions for a batch of content.
:param model: The model to use for the completion.
:param content_batch: The content to complete.
:param sampling_params: The sampling parameters to use for the completion.
:param response_format: The response format to use for the completion.
:param logprobs: The logprobs to use for the completion.
:returns: A job for the completion.
"""
...
@webmethod(route="/batch-inference/chat-completion", method="POST", level=LLAMA_STACK_API_V1)
async def chat_completion(
self,
model: str,
messages_batch: list[list[Message]],
sampling_params: SamplingParams | None = None,
# zero-shot tool definitions as input to the model
tools: list[ToolDefinition] | None = None,
tool_choice: ToolChoice | None = ToolChoice.auto,
tool_prompt_format: ToolPromptFormat | None = None,
response_format: ResponseFormat | None = None,
logprobs: LogProbConfig | None = None,
) -> Job:
"""Generate chat completions for a batch of messages.
:param model: The model to use for the chat completion.
:param messages_batch: The messages to complete.
:param sampling_params: The sampling parameters to use for the completion.
:param tools: The tools to use for the chat completion.
:param tool_choice: The tool choice to use for the chat completion.
:param tool_prompt_format: The tool prompt format to use for the chat completion.
:param response_format: The response format to use for the chat completion.
:param logprobs: The logprobs to use for the chat completion.
:returns: A job for the chat completion.
"""
...

View file

@ -975,26 +975,6 @@ class EmbeddingTaskType(Enum):
document = "document"
@json_schema_type
class BatchCompletionResponse(BaseModel):
"""Response from a batch completion request.
:param batch: List of completion responses, one for each input in the batch
"""
batch: list[CompletionResponse]
@json_schema_type
class BatchChatCompletionResponse(BaseModel):
"""Response from a batch chat completion request.
:param batch: List of chat completion responses, one for each conversation in the batch
"""
batch: list[ChatCompletionResponse]
class OpenAICompletionWithInputMessages(OpenAIChatCompletion):
input_messages: list[OpenAIMessageParam]
@ -1028,7 +1008,6 @@ class InferenceProvider(Protocol):
model_store: ModelStore | None = None
@webmethod(route="/inference/completion", method="POST", level=LLAMA_STACK_API_V1)
async def completion(
self,
model_id: str,
@ -1051,27 +1030,6 @@ class InferenceProvider(Protocol):
"""
...
@webmethod(route="/inference/batch-completion", method="POST", experimental=True, level=LLAMA_STACK_API_V1)
async def batch_completion(
self,
model_id: str,
content_batch: list[InterleavedContent],
sampling_params: SamplingParams | None = None,
response_format: ResponseFormat | None = None,
logprobs: LogProbConfig | None = None,
) -> BatchCompletionResponse:
"""Generate completions for a batch of content using the specified model.
:param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
:param content_batch: The content to generate completions for.
:param sampling_params: (Optional) Parameters to control the sampling strategy.
:param response_format: (Optional) Grammar specification for guided (structured) decoding.
:param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
:returns: A BatchCompletionResponse with the full completions.
"""
raise NotImplementedError("Batch completion is not implemented")
return # this is so mypy's safe-super rule will consider the method concrete
@webmethod(route="/inference/chat-completion", method="POST", level=LLAMA_STACK_API_V1)
async def chat_completion(
self,
@ -1112,31 +1070,6 @@ class InferenceProvider(Protocol):
"""
...
@webmethod(route="/inference/batch-chat-completion", method="POST", experimental=True, level=LLAMA_STACK_API_V1)
async def batch_chat_completion(
self,
model_id: str,
messages_batch: list[list[Message]],
sampling_params: SamplingParams | None = None,
tools: list[ToolDefinition] | None = None,
tool_config: ToolConfig | None = None,
response_format: ResponseFormat | None = None,
logprobs: LogProbConfig | None = None,
) -> BatchChatCompletionResponse:
"""Generate chat completions for a batch of messages using the specified model.
:param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
:param messages_batch: The messages to generate completions for.
:param sampling_params: (Optional) Parameters to control the sampling strategy.
:param tools: (Optional) List of tool definitions available to the model.
:param tool_config: (Optional) Configuration for tool use.
:param response_format: (Optional) Grammar specification for guided (structured) decoding.
:param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
:returns: A BatchChatCompletionResponse with the full completions.
"""
raise NotImplementedError("Batch chat completion is not implemented")
return # this is so mypy's safe-super rule will consider the method concrete
@webmethod(route="/inference/embeddings", method="POST", level=LLAMA_STACK_API_V1)
async def embeddings(
self,

View file

@ -27,6 +27,8 @@ class ToolParameter(BaseModel):
:param parameter_type: Type of the parameter (e.g., string, integer)
:param description: Human-readable description of what the parameter does
:param required: Whether this parameter is required for tool invocation
:param items: Type of the elements when parameter_type is array
:param title: (Optional) Title of the parameter
:param default: (Optional) Default value for the parameter if not provided
"""
@ -34,6 +36,8 @@ class ToolParameter(BaseModel):
parameter_type: str
description: str
required: bool = Field(default=True)
items: dict | None = None
title: str | None = None
default: Any | None = None
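
The new `items` and `title` fields let a tool parameter declare an element schema and a display title, which matters for array-typed arguments. A minimal sketch of constructing such a parameter, using only the field names shown in the diff above (the import path is an assumption):

```python
# Hedged sketch: an array-typed ToolParameter using the new `items` and `title` fields.
# Field names come from the diff above; the import location is assumed.
from llama_stack.apis.tools import ToolParameter  # assumed import path

tags_param = ToolParameter(
    name="tags",
    parameter_type="array",
    description="Labels to attach to the resource",
    required=False,
    items={"type": "string"},  # element schema used when parameter_type is "array"
    title="Tags",
)
```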

View file

@ -20,8 +20,6 @@ from llama_stack.apis.common.content_types import (
)
from llama_stack.apis.common.errors import ModelNotFoundError, ModelTypeError
from llama_stack.apis.inference import (
BatchChatCompletionResponse,
BatchCompletionResponse,
ChatCompletionResponse,
ChatCompletionResponseEventType,
ChatCompletionResponseStreamChunk,
@ -273,30 +271,6 @@ class InferenceRouter(Inference):
)
return response
async def batch_chat_completion(
self,
model_id: str,
messages_batch: list[list[Message]],
tools: list[ToolDefinition] | None = None,
tool_config: ToolConfig | None = None,
sampling_params: SamplingParams | None = None,
response_format: ResponseFormat | None = None,
logprobs: LogProbConfig | None = None,
) -> BatchChatCompletionResponse:
logger.debug(
f"InferenceRouter.batch_chat_completion: {model_id=}, {len(messages_batch)=}, {sampling_params=}, {response_format=}, {logprobs=}",
)
provider = await self.routing_table.get_provider_impl(model_id)
return await provider.batch_chat_completion(
model_id=model_id,
messages_batch=messages_batch,
tools=tools,
tool_config=tool_config,
sampling_params=sampling_params,
response_format=response_format,
logprobs=logprobs,
)
async def completion(
self,
model_id: str,
@ -338,20 +312,6 @@ class InferenceRouter(Inference):
return response
async def batch_completion(
self,
model_id: str,
content_batch: list[InterleavedContent],
sampling_params: SamplingParams | None = None,
response_format: ResponseFormat | None = None,
logprobs: LogProbConfig | None = None,
) -> BatchCompletionResponse:
logger.debug(
f"InferenceRouter.batch_completion: {model_id=}, {len(content_batch)=}, {sampling_params=}, {response_format=}, {logprobs=}",
)
provider = await self.routing_table.get_provider_impl(model_id)
return await provider.batch_completion(model_id, content_batch, sampling_params, response_format, logprobs)
async def embeddings(
self,
model_id: str,

View file

@ -14,7 +14,6 @@ from typing import Any
import yaml
from llama_stack.apis.agents import Agents
from llama_stack.apis.batch_inference import BatchInference
from llama_stack.apis.benchmarks import Benchmarks
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
@ -54,7 +53,6 @@ class LlamaStack(
Providers,
VectorDBs,
Inference,
BatchInference,
Agents,
Safety,
SyntheticDataGeneration,

View file

@ -92,6 +92,8 @@ class ToolParamDefinition(BaseModel):
param_type: str
description: str | None = None
required: bool | None = True
items: Any | None = None
title: str | None = None
default: Any | None = None

View file

@ -798,6 +798,8 @@ class ChatAgent(ShieldRunnerMixin):
param_type=param.parameter_type,
description=param.description,
required=param.required,
items=param.items,
title=param.title,
default=param.default,
)
for param in tool_def.parameters
@ -841,6 +843,8 @@ class ChatAgent(ShieldRunnerMixin):
param_type=param.parameter_type,
description=param.description,
required=param.required,
items=param.items,
title=param.title,
default=param.default,
)
for param in tool_def.parameters
@ -920,7 +924,7 @@ async def get_raw_document_text(document: Document) -> str:
DeprecationWarning,
stacklevel=2,
)
elif not (document.mime_type.startswith("text/") or document.mime_type == "application/yaml"):
elif not (document.mime_type.startswith("text/") or document.mime_type in ("application/yaml", "application/json")):
raise ValueError(f"Unexpected document mime type: {document.mime_type}")
if isinstance(document.content, URL):

View file

@ -568,6 +568,7 @@ class StreamingResponseOrchestrator:
description=param.description,
required=param.required,
default=param.default,
items=param.items,
)
for param in t.parameters
},

View file

@ -18,8 +18,6 @@ from llama_stack.apis.common.content_types import (
ToolCallParseStatus,
)
from llama_stack.apis.inference import (
BatchChatCompletionResponse,
BatchCompletionResponse,
ChatCompletionRequest,
ChatCompletionResponse,
ChatCompletionResponseEvent,
@ -219,41 +217,6 @@ class MetaReferenceInferenceImpl(
results = await self._nonstream_completion([request])
return results[0]
async def batch_completion(
self,
model_id: str,
content_batch: list[InterleavedContent],
sampling_params: SamplingParams | None = None,
response_format: ResponseFormat | None = None,
stream: bool | None = False,
logprobs: LogProbConfig | None = None,
) -> BatchCompletionResponse:
if sampling_params is None:
sampling_params = SamplingParams()
if logprobs:
assert logprobs.top_k == 1, f"Unexpected top_k={logprobs.top_k}"
content_batch = [
augment_content_with_response_format_prompt(response_format, content) for content in content_batch
]
request_batch = []
for content in content_batch:
request = CompletionRequest(
model=model_id,
content=content,
sampling_params=sampling_params,
response_format=response_format,
stream=stream,
logprobs=logprobs,
)
self.check_model(request)
request = await convert_request_to_raw(request)
request_batch.append(request)
results = await self._nonstream_completion(request_batch)
return BatchCompletionResponse(batch=results)
async def _stream_completion(self, request: CompletionRequest) -> AsyncGenerator:
tokenizer = self.generator.formatter.tokenizer
@ -399,49 +362,6 @@ class MetaReferenceInferenceImpl(
results = await self._nonstream_chat_completion([request])
return results[0]
async def batch_chat_completion(
self,
model_id: str,
messages_batch: list[list[Message]],
sampling_params: SamplingParams | None = None,
response_format: ResponseFormat | None = None,
tools: list[ToolDefinition] | None = None,
stream: bool | None = False,
logprobs: LogProbConfig | None = None,
tool_config: ToolConfig | None = None,
) -> BatchChatCompletionResponse:
if sampling_params is None:
sampling_params = SamplingParams()
if logprobs:
assert logprobs.top_k == 1, f"Unexpected top_k={logprobs.top_k}"
# wrapper request to make it easier to pass around (internal only, not exposed to API)
request_batch = []
for messages in messages_batch:
request = ChatCompletionRequest(
model=model_id,
messages=messages,
sampling_params=sampling_params,
tools=tools or [],
response_format=response_format,
logprobs=logprobs,
tool_config=tool_config or ToolConfig(),
)
self.check_model(request)
# augment and rewrite messages depending on the model
request.messages = chat_completion_request_to_messages(request, self.llama_model.core_model_id.value)
# download media and convert to raw content so we can send it to the model
request = await convert_request_to_raw(request)
request_batch.append(request)
if self.config.create_distributed_process_group:
if SEMAPHORE.locked():
raise RuntimeError("Only one concurrent request is supported")
results = await self._nonstream_chat_completion(request_batch)
return BatchChatCompletionResponse(batch=results)
async def _nonstream_chat_completion(
self, request_batch: list[ChatCompletionRequest]
) -> list[ChatCompletionResponse]:

View file

@ -61,6 +61,7 @@ logger = get_logger(name=__name__, category="inference::fireworks")
class FireworksInferenceAdapter(OpenAIMixin, ModelRegistryHelper, Inference, NeedsRequestProviderData):
embedding_model_metadata = {
"nomic-ai/nomic-embed-text-v1.5": {"embedding_dimension": 768, "context_length": 8192},
"accounts/fireworks/models/qwen3-embedding-8b": {"embedding_dimension": 4096, "context_length": 40960},
}
def __init__(self, config: FireworksImplConfig) -> None:

View file

@ -6,8 +6,7 @@
import asyncio
import base64
from collections.abc import AsyncGenerator, AsyncIterator
from collections.abc import AsyncGenerator
from typing import Any
from ollama import AsyncClient as AsyncOllamaClient
@ -33,10 +32,6 @@ from llama_stack.apis.inference import (
JsonSchemaResponseFormat,
LogProbConfig,
Message,
OpenAIChatCompletion,
OpenAIChatCompletionChunk,
OpenAIMessageParam,
OpenAIResponseFormatParam,
ResponseFormat,
SamplingParams,
TextTruncation,
@ -62,7 +57,6 @@ from llama_stack.providers.utils.inference.openai_compat import (
OpenAICompatCompletionChoice,
OpenAICompatCompletionResponse,
get_sampling_options,
prepare_openai_completion_params,
process_chat_completion_response,
process_chat_completion_stream_response,
process_completion_response,
@ -75,7 +69,6 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
content_has_media,
convert_image_content_to_url,
interleaved_content_as_str,
localize_image_content,
request_has_media,
)
@ -84,6 +77,7 @@ logger = get_logger(name=__name__, category="inference::ollama")
class OllamaInferenceAdapter(
OpenAIMixin,
ModelRegistryHelper,
InferenceProvider,
ModelsProtocolPrivate,
):
@ -129,6 +123,8 @@ class OllamaInferenceAdapter(
],
)
self.config = config
# Ollama does not support image urls, so we need to download the image and convert it to base64
self.download_images = True
self._clients: dict[asyncio.AbstractEventLoop, AsyncOllamaClient] = {}
@property
@ -173,9 +169,6 @@ class OllamaInferenceAdapter(
async def shutdown(self) -> None:
self._clients.clear()
async def unregister_model(self, model_id: str) -> None:
pass
async def _get_model(self, model_id: str) -> Model:
if not self.model_store:
raise ValueError("Model store not set")
@ -403,75 +396,6 @@ class OllamaInferenceAdapter(
raise UnsupportedModelError(model.provider_model_id, list(self._model_cache.keys()))
async def openai_chat_completion(
self,
model: str,
messages: list[OpenAIMessageParam],
frequency_penalty: float | None = None,
function_call: str | dict[str, Any] | None = None,
functions: list[dict[str, Any]] | None = None,
logit_bias: dict[str, float] | None = None,
logprobs: bool | None = None,
max_completion_tokens: int | None = None,
max_tokens: int | None = None,
n: int | None = None,
parallel_tool_calls: bool | None = None,
presence_penalty: float | None = None,
response_format: OpenAIResponseFormatParam | None = None,
seed: int | None = None,
stop: str | list[str] | None = None,
stream: bool | None = None,
stream_options: dict[str, Any] | None = None,
temperature: float | None = None,
tool_choice: str | dict[str, Any] | None = None,
tools: list[dict[str, Any]] | None = None,
top_logprobs: int | None = None,
top_p: float | None = None,
user: str | None = None,
) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
model_obj = await self._get_model(model)
# Ollama does not support image urls, so we need to download the image and convert it to base64
async def _convert_message(m: OpenAIMessageParam) -> OpenAIMessageParam:
if isinstance(m.content, list):
for c in m.content:
if c.type == "image_url" and c.image_url and c.image_url.url:
localize_result = await localize_image_content(c.image_url.url)
if localize_result is None:
raise ValueError(f"Failed to localize image content from {c.image_url.url}")
content, format = localize_result
c.image_url.url = f"data:image/{format};base64,{base64.b64encode(content).decode('utf-8')}"
return m
messages = [await _convert_message(m) for m in messages]
params = await prepare_openai_completion_params(
model=model_obj.provider_resource_id,
messages=messages,
frequency_penalty=frequency_penalty,
function_call=function_call,
functions=functions,
logit_bias=logit_bias,
logprobs=logprobs,
max_completion_tokens=max_completion_tokens,
max_tokens=max_tokens,
n=n,
parallel_tool_calls=parallel_tool_calls,
presence_penalty=presence_penalty,
response_format=response_format,
seed=seed,
stop=stop,
stream=stream,
stream_options=stream_options,
temperature=temperature,
tool_choice=tool_choice,
tools=tools,
top_logprobs=top_logprobs,
top_p=top_p,
user=user,
)
return await OpenAIMixin.openai_chat_completion(self, **params)
async def convert_message_to_openai_dict_for_ollama(message: Message) -> list[dict]:
async def _convert_content(content) -> dict:

View file

@ -21,8 +21,6 @@ logger = get_logger(name=__name__, category="inference::openai")
# | completion | LiteLLMOpenAIMixin |
# | chat_completion | LiteLLMOpenAIMixin |
# | embedding | LiteLLMOpenAIMixin |
# | batch_completion | LiteLLMOpenAIMixin |
# | batch_chat_completion | LiteLLMOpenAIMixin |
# | openai_completion | OpenAIMixin |
# | openai_chat_completion | OpenAIMixin |
# | openai_embeddings | OpenAIMixin |

View file

@ -805,6 +805,10 @@ def convert_tooldef_to_openai_tool(tool: ToolDefinition) -> dict:
properties[param_name].update(description=param.description)
if param.default:
properties[param_name].update(default=param.default)
if param.items:
properties[param_name].update(items=param.items)
if param.title:
properties[param_name].update(title=param.title)
if param.required:
required.append(param_name)
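
With `items` and `title` now forwarded by `convert_tooldef_to_openai_tool`, an array parameter round-trips into the OpenAI tool schema. A hedged sketch of the resulting `properties` entry for the `tags` parameter from the earlier example, assuming the property dict already carries the parameter's JSON-schema type:

```python
# Expected shape of one `properties` entry after convert_tooldef_to_openai_tool,
# given ToolParameter(name="tags", parameter_type="array", items={"type": "string"},
# title="Tags", description="Labels to attach to the resource"). Illustrative only.
expected_tags_property = {
    "type": "array",
    "description": "Labels to attach to the resource",
    "items": {"type": "string"},
    "title": "Tags",
}
```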

View file

@ -4,6 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import base64
import uuid
from abc import ABC, abstractmethod
from collections.abc import AsyncIterator
@ -26,6 +27,7 @@ from llama_stack.apis.models import ModelType
from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
from llama_stack.providers.utils.inference.openai_compat import prepare_openai_completion_params
from llama_stack.providers.utils.inference.prompt_adapter import localize_image_content
logger = get_logger(name=__name__, category="providers::utils")
@ -51,6 +53,10 @@ class OpenAIMixin(ModelRegistryHelper, ABC):
# This is useful for providers that do not return a unique id in the response.
overwrite_completion_id: bool = False
# Allow subclasses to control whether to download images and convert to base64
# for providers that require base64 encoded images instead of URLs.
download_images: bool = False
# Embedding model metadata for this provider
# Can be set by subclasses or instances to provide embedding models
# Format: {"model_id": {"embedding_dimension": 1536, "context_length": 8192}}
@ -239,6 +245,24 @@ class OpenAIMixin(ModelRegistryHelper, ABC):
"""
Direct OpenAI chat completion API call.
"""
if self.download_images:
async def _localize_image_url(m: OpenAIMessageParam) -> OpenAIMessageParam:
if isinstance(m.content, list):
for c in m.content:
if c.type == "image_url" and c.image_url and c.image_url.url and "http" in c.image_url.url:
localize_result = await localize_image_content(c.image_url.url)
if localize_result is None:
raise ValueError(
f"Failed to localize image content from {c.image_url.url[:42]}{'...' if len(c.image_url.url) > 42 else ''}"
)
content, format = localize_result
c.image_url.url = f"data:image/{format};base64,{base64.b64encode(content).decode('utf-8')}"
# else it's a string and we don't need to modify it
return m
messages = [await _localize_image_url(m) for m in messages]
resp = await self.client.chat.completions.create(
**await prepare_openai_completion_params(
model=await self._get_provider_model_id(model),
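
The new `download_images` class attribute lets adapters whose backends cannot fetch remote image URLs (the Ollama adapter sets it in its `__init__` above) opt into this localization step: http(s) `image_url` content is downloaded and rewritten as a base64 data URL before the request reaches the OpenAI-compatible endpoint. A hedged sketch of how a subclass opts in; the adapter name and module path are illustrative assumptions:

```python
# Illustrative only: a provider adapter opting into base64 image localization.
# OpenAIMixin and the flag come from the diff above; "ExampleAdapter" is hypothetical.
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin  # assumed module path

class ExampleAdapter(OpenAIMixin):
    # When True, openai_chat_completion() downloads any http(s) image_url content
    # and rewrites it as a data:image/<fmt>;base64,... URL before calling the backend.
    download_images = True
```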

View file

@ -192,6 +192,14 @@ async def localize_image_content(uri: str) -> tuple[bytes, str] | None:
format = "png"
return content, format
elif uri.startswith("data"):
# data:image/{format};base64,{data}
match = re.match(r"data:image/(\w+);base64,(.+)", uri)
if not match:
raise ValueError(f"Invalid data URL format, {uri[:40]}...")
fmt, image_data = match.groups()
content = base64.b64decode(image_data)
return content, fmt
else:
return None
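
`localize_image_content` now also handles `data:` URLs, returning the decoded bytes and the image format. A small sketch of the parsing behavior the new branch implements (standalone re-implementation for illustration):

```python
import base64
import re

# Illustrative re-implementation of the new data-URL branch in localize_image_content.
def parse_image_data_url(uri: str) -> tuple[bytes, str]:
    match = re.match(r"data:image/(\w+);base64,(.+)", uri)
    if not match:
        raise ValueError(f"Invalid data URL format, {uri[:40]}...")
    fmt, image_data = match.groups()
    return base64.b64decode(image_data), fmt

content, fmt = parse_image_data_url("data:image/png;base64," + base64.b64encode(b"\x89PNG").decode())
assert fmt == "png" and content.startswith(b"\x89PNG")
```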

View file

@ -120,6 +120,10 @@ async def list_mcp_tools(endpoint: str, headers: dict[str, str]) -> ListToolDefs
name=param_name,
parameter_type=param_schema.get("type", "string"),
description=param_schema.get("description", ""),
required="default" not in param_schema,
items=param_schema.get("items", None),
title=param_schema.get("title", None),
default=param_schema.get("default", None),
)
)
tools.append(
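
The MCP tool listing now carries `required`, `items`, `title`, and `default` through from each input-schema property. A hedged sketch of the mapping for a single property, following the construction in the diff above (the `ToolParameter` import path is an assumption):

```python
# Illustrative mapping from one MCP input-schema property to a ToolParameter.
from llama_stack.apis.tools import ToolParameter  # assumed import path

param_name = "limit"
param_schema = {
    "type": "integer",
    "description": "Maximum number of results",
    "title": "Limit",
    "default": 10,
}

tool_param = ToolParameter(
    name=param_name,
    parameter_type=param_schema.get("type", "string"),
    description=param_schema.get("description", ""),
    required="default" not in param_schema,  # False here, since a default is present
    items=param_schema.get("items", None),
    title=param_schema.get("title", None),
    default=param_schema.get("default", None),
)
assert tool_param.required is False
```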

View file

@ -28,7 +28,7 @@
"react-markdown": "^10.1.0",
"remark-gfm": "^4.0.1",
"remeda": "^2.32.0",
"shiki": "^1.29.2",
"shiki": "^3.13.0",
"sonner": "^2.0.7",
"tailwind-merge": "^3.3.1"
},
@ -51,7 +51,7 @@
"prettier": "3.6.2",
"tailwindcss": "^4",
"ts-node": "^10.9.2",
"tw-animate-css": "^1.2.9",
"tw-animate-css": "^1.4.0",
"typescript": "^5"
}
},
@ -3250,65 +3250,63 @@
"license": "MIT"
},
"node_modules/@shikijs/core": {
"version": "1.29.2",
"resolved": "https://registry.npmjs.org/@shikijs/core/-/core-1.29.2.tgz",
"integrity": "sha512-vju0lY9r27jJfOY4Z7+Rt/nIOjzJpZ3y+nYpqtUZInVoXQ/TJZcfGnNOGnKjFdVZb8qexiCuSlZRKcGfhhTTZQ==",
"version": "3.13.0",
"resolved": "https://registry.npmjs.org/@shikijs/core/-/core-3.13.0.tgz",
"integrity": "sha512-3P8rGsg2Eh2qIHekwuQjzWhKI4jV97PhvYjYUzGqjvJfqdQPz+nMlfWahU24GZAyW1FxFI1sYjyhfh5CoLmIUA==",
"license": "MIT",
"dependencies": {
"@shikijs/engine-javascript": "1.29.2",
"@shikijs/engine-oniguruma": "1.29.2",
"@shikijs/types": "1.29.2",
"@shikijs/vscode-textmate": "^10.0.1",
"@shikijs/types": "3.13.0",
"@shikijs/vscode-textmate": "^10.0.2",
"@types/hast": "^3.0.4",
"hast-util-to-html": "^9.0.4"
"hast-util-to-html": "^9.0.5"
}
},
"node_modules/@shikijs/engine-javascript": {
"version": "1.29.2",
"resolved": "https://registry.npmjs.org/@shikijs/engine-javascript/-/engine-javascript-1.29.2.tgz",
"integrity": "sha512-iNEZv4IrLYPv64Q6k7EPpOCE/nuvGiKl7zxdq0WFuRPF5PAE9PRo2JGq/d8crLusM59BRemJ4eOqrFrC4wiQ+A==",
"version": "3.13.0",
"resolved": "https://registry.npmjs.org/@shikijs/engine-javascript/-/engine-javascript-3.13.0.tgz",
"integrity": "sha512-Ty7xv32XCp8u0eQt8rItpMs6rU9Ki6LJ1dQOW3V/56PKDcpvfHPnYFbsx5FFUP2Yim34m/UkazidamMNVR4vKg==",
"license": "MIT",
"dependencies": {
"@shikijs/types": "1.29.2",
"@shikijs/vscode-textmate": "^10.0.1",
"oniguruma-to-es": "^2.2.0"
"@shikijs/types": "3.13.0",
"@shikijs/vscode-textmate": "^10.0.2",
"oniguruma-to-es": "^4.3.3"
}
},
"node_modules/@shikijs/engine-oniguruma": {
"version": "1.29.2",
"resolved": "https://registry.npmjs.org/@shikijs/engine-oniguruma/-/engine-oniguruma-1.29.2.tgz",
"integrity": "sha512-7iiOx3SG8+g1MnlzZVDYiaeHe7Ez2Kf2HrJzdmGwkRisT7r4rak0e655AcM/tF9JG/kg5fMNYlLLKglbN7gBqA==",
"version": "3.13.0",
"resolved": "https://registry.npmjs.org/@shikijs/engine-oniguruma/-/engine-oniguruma-3.13.0.tgz",
"integrity": "sha512-O42rBGr4UDSlhT2ZFMxqM7QzIU+IcpoTMzb3W7AlziI1ZF7R8eS2M0yt5Ry35nnnTX/LTLXFPUjRFCIW+Operg==",
"license": "MIT",
"dependencies": {
"@shikijs/types": "1.29.2",
"@shikijs/vscode-textmate": "^10.0.1"
"@shikijs/types": "3.13.0",
"@shikijs/vscode-textmate": "^10.0.2"
}
},
"node_modules/@shikijs/langs": {
"version": "1.29.2",
"resolved": "https://registry.npmjs.org/@shikijs/langs/-/langs-1.29.2.tgz",
"integrity": "sha512-FIBA7N3LZ+223U7cJDUYd5shmciFQlYkFXlkKVaHsCPgfVLiO+e12FmQE6Tf9vuyEsFe3dIl8qGWKXgEHL9wmQ==",
"version": "3.13.0",
"resolved": "https://registry.npmjs.org/@shikijs/langs/-/langs-3.13.0.tgz",
"integrity": "sha512-672c3WAETDYHwrRP0yLy3W1QYB89Hbpj+pO4KhxK6FzIrDI2FoEXNiNCut6BQmEApYLfuYfpgOZaqbY+E9b8wQ==",
"license": "MIT",
"dependencies": {
"@shikijs/types": "1.29.2"
"@shikijs/types": "3.13.0"
}
},
"node_modules/@shikijs/themes": {
"version": "1.29.2",
"resolved": "https://registry.npmjs.org/@shikijs/themes/-/themes-1.29.2.tgz",
"integrity": "sha512-i9TNZlsq4uoyqSbluIcZkmPL9Bfi3djVxRnofUHwvx/h6SRW3cwgBC5SML7vsDcWyukY0eCzVN980rqP6qNl9g==",
"version": "3.13.0",
"resolved": "https://registry.npmjs.org/@shikijs/themes/-/themes-3.13.0.tgz",
"integrity": "sha512-Vxw1Nm1/Od8jyA7QuAenaV78BG2nSr3/gCGdBkLpfLscddCkzkL36Q5b67SrLLfvAJTOUzW39x4FHVCFriPVgg==",
"license": "MIT",
"dependencies": {
"@shikijs/types": "1.29.2"
"@shikijs/types": "3.13.0"
}
},
"node_modules/@shikijs/types": {
"version": "1.29.2",
"resolved": "https://registry.npmjs.org/@shikijs/types/-/types-1.29.2.tgz",
"integrity": "sha512-VJjK0eIijTZf0QSTODEXCqinjBn0joAHQ+aPSBzrv4O2d/QSbsMw+ZeSRx03kV34Hy7NzUvV/7NqfYGRLrASmw==",
"version": "3.13.0",
"resolved": "https://registry.npmjs.org/@shikijs/types/-/types-3.13.0.tgz",
"integrity": "sha512-oM9P+NCFri/mmQ8LoFGVfVyemm5Hi27330zuOBp0annwJdKH1kOLndw3zCtAVDehPLg9fKqoEx3Ht/wNZxolfw==",
"license": "MIT",
"dependencies": {
"@shikijs/vscode-textmate": "^10.0.1",
"@shikijs/vscode-textmate": "^10.0.2",
"@types/hast": "^3.0.4"
}
},
@ -6084,12 +6082,6 @@
"dev": true,
"license": "MIT"
},
"node_modules/emoji-regex-xs": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/emoji-regex-xs/-/emoji-regex-xs-1.0.0.tgz",
"integrity": "sha512-LRlerrMYoIDrT6jgpeZ2YYl/L8EulRTt5hQcYjy5AInh7HWXKimpqx68aknBFpGL2+/IcogTcaydJEgaTmOpDg==",
"license": "MIT"
},
"node_modules/encodeurl": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-2.0.0.tgz",
@ -11813,15 +11805,21 @@
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/oniguruma-parser": {
"version": "0.12.1",
"resolved": "https://registry.npmjs.org/oniguruma-parser/-/oniguruma-parser-0.12.1.tgz",
"integrity": "sha512-8Unqkvk1RYc6yq2WBYRj4hdnsAxVze8i7iPfQr8e4uSP3tRv0rpZcbGUDvxfQQcdwHt/e9PrMvGCsa8OqG9X3w==",
"license": "MIT"
},
"node_modules/oniguruma-to-es": {
"version": "2.3.0",
"resolved": "https://registry.npmjs.org/oniguruma-to-es/-/oniguruma-to-es-2.3.0.tgz",
"integrity": "sha512-bwALDxriqfKGfUufKGGepCzu9x7nJQuoRoAFp4AnwehhC2crqrDIAP/uN2qdlsAvSMpeRC3+Yzhqc7hLmle5+g==",
"version": "4.3.3",
"resolved": "https://registry.npmjs.org/oniguruma-to-es/-/oniguruma-to-es-4.3.3.tgz",
"integrity": "sha512-rPiZhzC3wXwE59YQMRDodUwwT9FZ9nNBwQQfsd1wfdtlKEyCdRV0avrTcSZ5xlIvGRVPd/cx6ZN45ECmS39xvg==",
"license": "MIT",
"dependencies": {
"emoji-regex-xs": "^1.0.0",
"regex": "^5.1.1",
"regex-recursion": "^5.1.1"
"oniguruma-parser": "^0.12.1",
"regex": "^6.0.1",
"regex-recursion": "^6.0.2"
}
},
"node_modules/openid-client": {
@ -12613,21 +12611,20 @@
}
},
"node_modules/regex": {
"version": "5.1.1",
"resolved": "https://registry.npmjs.org/regex/-/regex-5.1.1.tgz",
"integrity": "sha512-dN5I359AVGPnwzJm2jN1k0W9LPZ+ePvoOeVMMfqIMFz53sSwXkxaJoxr50ptnsC771lK95BnTrVSZxq0b9yCGw==",
"version": "6.0.1",
"resolved": "https://registry.npmjs.org/regex/-/regex-6.0.1.tgz",
"integrity": "sha512-uorlqlzAKjKQZ5P+kTJr3eeJGSVroLKoHmquUj4zHWuR+hEyNqlXsSKlYYF5F4NI6nl7tWCs0apKJ0lmfsXAPA==",
"license": "MIT",
"dependencies": {
"regex-utilities": "^2.3.0"
}
},
"node_modules/regex-recursion": {
"version": "5.1.1",
"resolved": "https://registry.npmjs.org/regex-recursion/-/regex-recursion-5.1.1.tgz",
"integrity": "sha512-ae7SBCbzVNrIjgSbh7wMznPcQel1DNlDtzensnFxpiNpXt1U2ju/bHugH422r+4LAVS1FpW1YCwilmnNsjum9w==",
"version": "6.0.2",
"resolved": "https://registry.npmjs.org/regex-recursion/-/regex-recursion-6.0.2.tgz",
"integrity": "sha512-0YCaSCq2VRIebiaUviZNs0cBz1kg5kVS2UKUfNIx8YVs1cN3AV7NTctO5FOKBA+UT2BPJIWZauYHPqJODG50cg==",
"license": "MIT",
"dependencies": {
"regex": "^5.1.1",
"regex-utilities": "^2.3.0"
}
},
@ -13165,18 +13162,18 @@
}
},
"node_modules/shiki": {
"version": "1.29.2",
"resolved": "https://registry.npmjs.org/shiki/-/shiki-1.29.2.tgz",
"integrity": "sha512-njXuliz/cP+67jU2hukkxCNuH1yUi4QfdZZY+sMr5PPrIyXSu5iTb/qYC4BiWWB0vZ+7TbdvYUCeL23zpwCfbg==",
"version": "3.13.0",
"resolved": "https://registry.npmjs.org/shiki/-/shiki-3.13.0.tgz",
"integrity": "sha512-aZW4l8Og16CokuCLf8CF8kq+KK2yOygapU5m3+hoGw0Mdosc6fPitjM+ujYarppj5ZIKGyPDPP1vqmQhr+5/0g==",
"license": "MIT",
"dependencies": {
"@shikijs/core": "1.29.2",
"@shikijs/engine-javascript": "1.29.2",
"@shikijs/engine-oniguruma": "1.29.2",
"@shikijs/langs": "1.29.2",
"@shikijs/themes": "1.29.2",
"@shikijs/types": "1.29.2",
"@shikijs/vscode-textmate": "^10.0.1",
"@shikijs/core": "3.13.0",
"@shikijs/engine-javascript": "3.13.0",
"@shikijs/engine-oniguruma": "3.13.0",
"@shikijs/langs": "3.13.0",
"@shikijs/themes": "3.13.0",
"@shikijs/types": "3.13.0",
"@shikijs/vscode-textmate": "^10.0.2",
"@types/hast": "^3.0.4"
}
},
@ -13970,9 +13967,9 @@
"license": "0BSD"
},
"node_modules/tw-animate-css": {
"version": "1.2.9",
"resolved": "https://registry.npmjs.org/tw-animate-css/-/tw-animate-css-1.2.9.tgz",
"integrity": "sha512-9O4k1at9pMQff9EAcCEuy1UNO43JmaPQvq+0lwza9Y0BQ6LB38NiMj+qHqjoQf40355MX+gs6wtlR6H9WsSXFg==",
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/tw-animate-css/-/tw-animate-css-1.4.0.tgz",
"integrity": "sha512-7bziOlRqH0hJx80h/3mbicLW7o8qLsH5+RaLR2t+OHM3D0JlWGODQKQ4cxbK7WlvmUxpcj6Kgu6EKqjrGFe3QQ==",
"dev": true,
"license": "MIT",
"funding": {

View file

@ -33,7 +33,7 @@
"react-markdown": "^10.1.0",
"remark-gfm": "^4.0.1",
"remeda": "^2.32.0",
"shiki": "^1.29.2",
"shiki": "^3.13.0",
"sonner": "^2.0.7",
"tailwind-merge": "^3.3.1"
},
@ -56,7 +56,7 @@
"prettier": "3.6.2",
"tailwindcss": "^4",
"ts-node": "^10.9.2",
"tw-animate-css": "^1.2.9",
"tw-animate-css": "^1.4.0",
"typescript": "^5"
}
}

View file

@ -167,6 +167,8 @@ def make_mcp_server(required_auth_token: str | None = None, tools: dict[str, Cal
from starlette.responses import Response
from starlette.routing import Mount, Route
from llama_stack.log import get_logger
server = FastMCP("FastMCP Test Server", log_level="WARNING")
tools = tools or default_tools()
@ -211,6 +213,7 @@ def make_mcp_server(required_auth_token: str | None = None, tools: dict[str, Cal
return sock.getsockname()[1]
port = get_open_port()
logger = get_logger(__name__, category="tests::mcp")
# make uvicorn logs less verbose
config = uvicorn.Config(app, host="0.0.0.0", port=port, log_level="warning")
@ -218,10 +221,17 @@ def make_mcp_server(required_auth_token: str | None = None, tools: dict[str, Cal
app.state.uvicorn_server = server_instance
def run_server():
server_instance.run()
try:
logger.info(f"Starting MCP server on port {port}")
server_instance.run()
logger.info(f"MCP server on port {port} has stopped")
except Exception as e:
logger.error(f"MCP server failed to start on port {port}: {e}")
raise
# Start the server in a new thread
server_thread = threading.Thread(target=run_server, daemon=True)
logger.info(f"Starting MCP server thread on port {port}")
server_thread.start()
# Polling until the server is ready
@ -229,24 +239,36 @@ def make_mcp_server(required_auth_token: str | None = None, tools: dict[str, Cal
start_time = time.time()
server_url = f"http://localhost:{port}/sse"
logger.info(f"Waiting for MCP server to be ready at {server_url}")
while time.time() - start_time < timeout:
try:
response = httpx.get(server_url)
if response.status_code in [200, 401]:
logger.info(f"MCP server is ready on port {port} (status: {response.status_code})")
break
except httpx.RequestError:
except httpx.RequestError as e:
logger.debug(f"Server not ready yet, retrying... ({e})")
pass
time.sleep(0.1)
else:
# If we exit the loop due to timeout
logger.error(f"MCP server failed to start within {timeout} seconds on port {port}")
logger.error(f"Thread alive: {server_thread.is_alive()}")
if server_thread.is_alive():
logger.error("Server thread is still running but not responding to HTTP requests")
try:
yield {"server_url": server_url}
finally:
logger.info(f"Shutting down MCP server on port {port}")
server_instance.should_exit = True
time.sleep(0.5)
# Force shutdown if still running
if server_thread.is_alive():
try:
logger.info("Force shutting down server thread")
if hasattr(server_instance, "servers") and server_instance.servers:
for srv in server_instance.servers:
srv.close()
@ -254,9 +276,9 @@ def make_mcp_server(required_auth_token: str | None = None, tools: dict[str, Cal
# Wait for graceful shutdown
server_thread.join(timeout=3)
if server_thread.is_alive():
print("Warning: Server thread still alive after shutdown attempt")
logger.warning("Server thread still alive after shutdown attempt")
except Exception as e:
print(f"Error during server shutdown: {e}")
logger.error(f"Error during server shutdown: {e}")
# CRITICAL: Reset SSE global state to prevent event loop contamination
# Reset the SSE AppStatus singleton that stores anyio.Event objects
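The readiness check added to this fixture is a generic poll-until-ready loop: hit the SSE endpoint until it answers with 200 (or 401 when an auth token is required), and give up with a clear error once the timeout elapses. A condensed sketch of that loop, assuming only httpx and the standard library:

import time

import httpx


def wait_for_server(url: str, timeout: float = 30.0, ok_statuses: tuple[int, ...] = (200, 401)) -> bool:
    """Poll `url` until it responds with an acceptable status or the timeout elapses."""
    start = time.time()
    while time.time() - start < timeout:
        try:
            if httpx.get(url).status_code in ok_statuses:
                return True
        except httpx.RequestError:
            pass  # server not accepting connections yet
        time.sleep(0.1)
    return False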

View file

@ -1,76 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import pytest
from ..test_cases.test_case import TestCase
def skip_if_provider_doesnt_support_batch_inference(client_with_models, model_id):
models = {m.identifier: m for m in client_with_models.models.list()}
models.update({m.provider_resource_id: m for m in client_with_models.models.list()})
provider_id = models[model_id].provider_id
providers = {p.provider_id: p for p in client_with_models.providers.list()}
provider = providers[provider_id]
if provider.provider_type not in ("inline::meta-reference",):
pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support batch inference")
@pytest.mark.parametrize(
"test_case",
[
"inference:completion:batch_completion",
],
)
def test_batch_completion_non_streaming(client_with_models, text_model_id, test_case):
skip_if_provider_doesnt_support_batch_inference(client_with_models, text_model_id)
tc = TestCase(test_case)
content_batch = tc["contents"]
response = client_with_models.inference.batch_completion(
content_batch=content_batch,
model_id=text_model_id,
sampling_params={
"max_tokens": 50,
},
)
assert len(response.batch) == len(content_batch)
for i, r in enumerate(response.batch):
print(f"response {i}: {r.content}")
assert len(r.content) > 10
@pytest.mark.parametrize(
"test_case",
[
"inference:chat_completion:batch_completion",
],
)
def test_batch_chat_completion_non_streaming(client_with_models, text_model_id, test_case):
skip_if_provider_doesnt_support_batch_inference(client_with_models, text_model_id)
tc = TestCase(test_case)
qa_pairs = tc["qa_pairs"]
message_batch = [
[
{
"role": "user",
"content": qa["question"],
}
]
for qa in qa_pairs
]
response = client_with_models.inference.batch_chat_completion(
messages_batch=message_batch,
model_id=text_model_id,
)
assert len(response.batch) == len(qa_pairs)
for i, r in enumerate(response.batch):
print(f"response {i}: {r.completion_message.content}")
assert len(r.completion_message.content) > 0
assert qa_pairs[i]["answer"].lower() in r.completion_message.content.lower()

View file

@ -1,303 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
#
# Test plan:
#
# Types of input:
# - array of a string
# - array of a image (ImageContentItem, either URL or base64 string)
# - array of a text (TextContentItem)
# Types of output:
# - list of list of floats
# Params:
# - text_truncation
# - absent w/ long text -> error
# - none w/ long text -> error
# - absent w/ short text -> ok
# - none w/ short text -> ok
# - end w/ long text -> ok
# - end w/ short text -> ok
# - start w/ long text -> ok
# - start w/ short text -> ok
# - output_dimension
# - response dimension matches
# - task_type, only for asymmetric models
# - query embedding != passage embedding
# Negative:
# - long string
# - long text
#
# Todo:
# - negative tests
# - empty
# - empty list
# - empty string
# - empty text
# - empty image
# - long
# - large image
# - appropriate combinations
# - batch size
# - many inputs
# - invalid
# - invalid URL
# - invalid base64
#
# Notes:
# - use llama_stack_client fixture
# - use pytest.mark.parametrize when possible
# - no accuracy tests: only check the type of output, not the content
#
import pytest
from llama_stack_client import BadRequestError as LlamaStackBadRequestError
from llama_stack_client.types import EmbeddingsResponse
from llama_stack_client.types.shared.interleaved_content import (
ImageContentItem,
ImageContentItemImage,
ImageContentItemImageURL,
TextContentItem,
)
from openai import BadRequestError as OpenAIBadRequestError
from llama_stack.core.library_client import LlamaStackAsLibraryClient
DUMMY_STRING = "hello"
DUMMY_STRING2 = "world"
DUMMY_LONG_STRING = "NVDA " * 10240
DUMMY_TEXT = TextContentItem(text=DUMMY_STRING, type="text")
DUMMY_TEXT2 = TextContentItem(text=DUMMY_STRING2, type="text")
DUMMY_LONG_TEXT = TextContentItem(text=DUMMY_LONG_STRING, type="text")
# TODO(mf): add a real image URL and base64 string
DUMMY_IMAGE_URL = ImageContentItem(
image=ImageContentItemImage(url=ImageContentItemImageURL(uri="https://example.com/image.jpg")), type="image"
)
DUMMY_IMAGE_BASE64 = ImageContentItem(image=ImageContentItemImage(data="base64string"), type="image")
SUPPORTED_PROVIDERS = {"remote::nvidia"}
MODELS_SUPPORTING_MEDIA = {}
MODELS_SUPPORTING_OUTPUT_DIMENSION = {"nvidia/llama-3.2-nv-embedqa-1b-v2"}
MODELS_REQUIRING_TASK_TYPE = {
"nvidia/llama-3.2-nv-embedqa-1b-v2",
"nvidia/nv-embedqa-e5-v5",
"nvidia/nv-embedqa-mistral-7b-v2",
"snowflake/arctic-embed-l",
}
MODELS_SUPPORTING_TASK_TYPE = MODELS_REQUIRING_TASK_TYPE
def default_task_type(model_id):
"""
Some models require a task type parameter. This provides a default value for
testing those models.
"""
if model_id in MODELS_REQUIRING_TASK_TYPE:
return {"task_type": "query"}
return {}
@pytest.mark.parametrize(
"contents",
[
[DUMMY_STRING, DUMMY_STRING2],
[DUMMY_TEXT, DUMMY_TEXT2],
],
ids=[
"list[string]",
"list[text]",
],
)
def test_embedding_text(llama_stack_client, embedding_model_id, contents, inference_provider_type):
if inference_provider_type not in SUPPORTED_PROVIDERS:
pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
response = llama_stack_client.inference.embeddings(
model_id=embedding_model_id, contents=contents, **default_task_type(embedding_model_id)
)
assert isinstance(response, EmbeddingsResponse)
assert len(response.embeddings) == sum(len(content) if isinstance(content, list) else 1 for content in contents)
assert isinstance(response.embeddings[0], list)
assert isinstance(response.embeddings[0][0], float)
@pytest.mark.parametrize(
"contents",
[
[DUMMY_IMAGE_URL, DUMMY_IMAGE_BASE64],
[DUMMY_IMAGE_URL, DUMMY_STRING, DUMMY_IMAGE_BASE64, DUMMY_TEXT],
],
ids=[
"list[url,base64]",
"list[url,string,base64,text]",
],
)
def test_embedding_image(llama_stack_client, embedding_model_id, contents, inference_provider_type):
if inference_provider_type not in SUPPORTED_PROVIDERS:
pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
if embedding_model_id not in MODELS_SUPPORTING_MEDIA:
pytest.xfail(f"{embedding_model_id} doesn't support media")
response = llama_stack_client.inference.embeddings(
model_id=embedding_model_id, contents=contents, **default_task_type(embedding_model_id)
)
assert isinstance(response, EmbeddingsResponse)
assert len(response.embeddings) == sum(len(content) if isinstance(content, list) else 1 for content in contents)
assert isinstance(response.embeddings[0], list)
assert isinstance(response.embeddings[0][0], float)
@pytest.mark.parametrize(
"text_truncation",
[
"end",
"start",
],
)
@pytest.mark.parametrize(
"contents",
[
[DUMMY_LONG_TEXT],
[DUMMY_STRING],
],
ids=[
"long",
"short",
],
)
def test_embedding_truncation(
llama_stack_client, embedding_model_id, text_truncation, contents, inference_provider_type
):
if inference_provider_type not in SUPPORTED_PROVIDERS:
pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
response = llama_stack_client.inference.embeddings(
model_id=embedding_model_id,
contents=contents,
text_truncation=text_truncation,
**default_task_type(embedding_model_id),
)
assert isinstance(response, EmbeddingsResponse)
assert len(response.embeddings) == 1
assert isinstance(response.embeddings[0], list)
assert isinstance(response.embeddings[0][0], float)
@pytest.mark.parametrize(
"text_truncation",
[
None,
"none",
],
)
@pytest.mark.parametrize(
"contents",
[
[DUMMY_LONG_TEXT],
[DUMMY_LONG_STRING],
],
ids=[
"long-text",
"long-str",
],
)
def test_embedding_truncation_error(
llama_stack_client, embedding_model_id, text_truncation, contents, inference_provider_type
):
if inference_provider_type not in SUPPORTED_PROVIDERS:
pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
# Using LlamaStackClient from llama_stack_client will raise llama_stack_client.BadRequestError
# While using LlamaStackAsLibraryClient from llama_stack.distribution.library_client will raise the error that the backend raises
error_type = (
OpenAIBadRequestError
if isinstance(llama_stack_client, LlamaStackAsLibraryClient)
else LlamaStackBadRequestError
)
with pytest.raises(error_type):
llama_stack_client.inference.embeddings(
model_id=embedding_model_id,
contents=[DUMMY_LONG_TEXT],
text_truncation=text_truncation,
**default_task_type(embedding_model_id),
)
def test_embedding_output_dimension(llama_stack_client, embedding_model_id, inference_provider_type):
if inference_provider_type not in SUPPORTED_PROVIDERS:
pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
if embedding_model_id not in MODELS_SUPPORTING_OUTPUT_DIMENSION:
pytest.xfail(f"{embedding_model_id} doesn't support output_dimension")
base_response = llama_stack_client.inference.embeddings(
model_id=embedding_model_id, contents=[DUMMY_STRING], **default_task_type(embedding_model_id)
)
test_response = llama_stack_client.inference.embeddings(
model_id=embedding_model_id,
contents=[DUMMY_STRING],
**default_task_type(embedding_model_id),
output_dimension=32,
)
assert len(base_response.embeddings[0]) != len(test_response.embeddings[0])
assert len(test_response.embeddings[0]) == 32
def test_embedding_task_type(llama_stack_client, embedding_model_id, inference_provider_type):
if inference_provider_type not in SUPPORTED_PROVIDERS:
pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
if embedding_model_id not in MODELS_SUPPORTING_TASK_TYPE:
pytest.xfail(f"{embedding_model_id} doesn't support task_type")
query_embedding = llama_stack_client.inference.embeddings(
model_id=embedding_model_id, contents=[DUMMY_STRING], task_type="query"
)
document_embedding = llama_stack_client.inference.embeddings(
model_id=embedding_model_id, contents=[DUMMY_STRING], task_type="document"
)
assert query_embedding.embeddings != document_embedding.embeddings
@pytest.mark.parametrize(
"text_truncation",
[
None,
"none",
"end",
"start",
],
)
def test_embedding_text_truncation(llama_stack_client, embedding_model_id, text_truncation, inference_provider_type):
if inference_provider_type not in SUPPORTED_PROVIDERS:
pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
response = llama_stack_client.inference.embeddings(
model_id=embedding_model_id,
contents=[DUMMY_STRING],
text_truncation=text_truncation,
**default_task_type(embedding_model_id),
)
assert isinstance(response, EmbeddingsResponse)
assert len(response.embeddings) == 1
assert isinstance(response.embeddings[0], list)
assert isinstance(response.embeddings[0][0], float)
@pytest.mark.parametrize(
"text_truncation",
[
"NONE",
"END",
"START",
"left",
"right",
],
)
def test_embedding_text_truncation_error(
llama_stack_client, embedding_model_id, text_truncation, inference_provider_type
):
if inference_provider_type not in SUPPORTED_PROVIDERS:
pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
error_type = ValueError if isinstance(llama_stack_client, LlamaStackAsLibraryClient) else LlamaStackBadRequestError
with pytest.raises(error_type):
llama_stack_client.inference.embeddings(
model_id=embedding_model_id,
contents=[DUMMY_STRING],
text_truncation=text_truncation,
**default_task_type(embedding_model_id),
)

View file

@ -9,6 +9,7 @@ import time
import unicodedata
import pytest
from pydantic import BaseModel
from ..test_cases.test_case import TestCase
@ -62,6 +63,14 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id)
pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.")
def skip_if_doesnt_support_completions_logprobs(client_with_models, model_id):
provider_type = provider_from_model(client_with_models, model_id).provider_type
if provider_type in (
"remote::ollama", # logprobs is ignored
):
pytest.skip(f"Model {model_id} hosted by {provider_type} doesn't support /v1/completions logprobs.")
def skip_if_model_doesnt_support_suffix(client_with_models, model_id):
# To test `fim` (fill-in-the-middle) completion, we need to use a model that supports suffix.
# Use this to specifically test this API functionality.
@ -205,28 +214,6 @@ def test_openai_completion_streaming(llama_stack_client, client_with_models, tex
assert len(content_str) > 10
@pytest.mark.parametrize(
"prompt_logprobs",
[
1,
0,
],
)
def test_openai_completion_prompt_logprobs(llama_stack_client, client_with_models, text_model_id, prompt_logprobs):
skip_if_provider_isnt_vllm(client_with_models, text_model_id)
prompt = "Hello, world!"
response = llama_stack_client.completions.create(
model=text_model_id,
prompt=prompt,
stream=False,
prompt_logprobs=prompt_logprobs,
)
assert len(response.choices) > 0
choice = response.choices[0]
assert len(choice.prompt_logprobs) > 0
def test_openai_completion_guided_choice(llama_stack_client, client_with_models, text_model_id):
skip_if_provider_isnt_vllm(client_with_models, text_model_id)
@ -518,3 +505,214 @@ def test_openai_chat_completion_non_streaming_with_file(openai_client, client_wi
message_content = response.choices[0].message.content.lower().strip()
normalized_content = _normalize_text(message_content)
assert "hello world" in normalized_content
@pytest.mark.parametrize(
"test_case",
[
"inference:completion:stop_sequence",
],
)
def test_openai_completion_stop_sequence(client_with_models, openai_client, text_model_id, test_case):
skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
tc = TestCase(test_case)
response = openai_client.completions.create(
model=text_model_id,
prompt=tc["content"],
stop="1963",
stream=False,
)
assert len(response.choices) > 0
choice = response.choices[0]
assert "1963" not in choice.text
response = openai_client.completions.create(
model=text_model_id,
prompt=tc["content"],
stop=["blathering", "1963"],
stream=False,
)
assert len(response.choices) > 0
choice = response.choices[0]
assert "1963" not in choice.text
@pytest.mark.parametrize(
"test_case",
[
"inference:completion:log_probs",
],
)
def test_openai_completion_logprobs(client_with_models, openai_client, text_model_id, test_case):
skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
skip_if_doesnt_support_completions_logprobs(client_with_models, text_model_id)
tc = TestCase(test_case)
response = openai_client.completions.create(
model=text_model_id,
prompt=tc["content"],
logprobs=5,
)
assert len(response.choices) > 0
choice = response.choices[0]
assert choice.text, "Response text should not be empty"
assert choice.logprobs, "Logprobs should not be empty"
logprobs = choice.logprobs
assert logprobs.token_logprobs, "Response tokens should not be empty"
assert len(logprobs.tokens) == len(logprobs.token_logprobs)
assert len(logprobs.token_logprobs) == len(logprobs.top_logprobs)
for i, (token, prob) in enumerate(zip(logprobs.tokens, logprobs.token_logprobs, strict=True)):
assert logprobs.top_logprobs[i][token] == prob
assert len(logprobs.top_logprobs[i]) == 5
@pytest.mark.parametrize(
"test_case",
[
"inference:completion:log_probs",
],
)
def test_openai_completion_logprobs_streaming(client_with_models, openai_client, text_model_id, test_case):
skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
skip_if_doesnt_support_completions_logprobs(client_with_models, text_model_id)
tc = TestCase(test_case)
response = openai_client.completions.create(
model=text_model_id,
prompt=tc["content"],
logprobs=3,
stream=True,
max_tokens=5,
)
for chunk in response:
choice = chunk.choices[0]
if choice.text: # if there's a token, we expect logprobs
assert choice.logprobs, "Logprobs should not be empty"
logprobs = choice.logprobs
assert logprobs.token_logprobs, "Response tokens should not be empty"
assert len(logprobs.tokens) == len(logprobs.token_logprobs)
assert len(logprobs.token_logprobs) == len(logprobs.top_logprobs)
for i, (token, prob) in enumerate(zip(logprobs.tokens, logprobs.token_logprobs, strict=True)):
assert logprobs.top_logprobs[i][token] == prob
assert len(logprobs.top_logprobs[i]) == 3
else: # no token, no logprobs
assert not choice.logprobs, "Logprobs should be empty"
@pytest.mark.parametrize(
"test_case",
[
"inference:chat_completion:tool_calling",
],
)
def test_openai_chat_completion_with_tools(openai_client, text_model_id, test_case):
tc = TestCase(test_case)
response = openai_client.chat.completions.create(
model=text_model_id,
messages=tc["messages"],
tools=tc["tools"],
tool_choice="auto",
stream=False,
)
assert len(response.choices) == 1
assert len(response.choices[0].message.tool_calls) == 1
tool_call = response.choices[0].message.tool_calls[0]
assert tool_call.function.name == tc["tools"][0]["function"]["name"]
assert "location" in tool_call.function.arguments
assert tc["expected"]["location"] in tool_call.function.arguments
@pytest.mark.parametrize(
"test_case",
[
"inference:chat_completion:tool_calling",
],
)
def test_openai_chat_completion_with_tools_and_streaming(openai_client, text_model_id, test_case):
tc = TestCase(test_case)
response = openai_client.chat.completions.create(
model=text_model_id,
messages=tc["messages"],
tools=tc["tools"],
tool_choice="auto",
stream=True,
)
# Accumulate tool calls from streaming chunks
tool_calls = []
for chunk in response:
if chunk.choices and chunk.choices[0].delta.tool_calls:
for i, tc_delta in enumerate(chunk.choices[0].delta.tool_calls):
while len(tool_calls) <= i:
tool_calls.append({"function": {"name": "", "arguments": ""}})
if tc_delta.function and tc_delta.function.name:
tool_calls[i]["function"]["name"] = tc_delta.function.name
if tc_delta.function and tc_delta.function.arguments:
tool_calls[i]["function"]["arguments"] += tc_delta.function.arguments
assert len(tool_calls) == 1
tool_call = tool_calls[0]
assert tool_call["function"]["name"] == tc["tools"][0]["function"]["name"]
assert "location" in tool_call["function"]["arguments"]
assert tc["expected"]["location"] in tool_call["function"]["arguments"]
@pytest.mark.parametrize(
"test_case",
[
"inference:chat_completion:tool_calling",
],
)
def test_openai_chat_completion_with_tool_choice_none(openai_client, text_model_id, test_case):
tc = TestCase(test_case)
response = openai_client.chat.completions.create(
model=text_model_id,
messages=tc["messages"],
tools=tc["tools"],
tool_choice="none",
stream=False,
)
assert len(response.choices) == 1
tool_calls = response.choices[0].message.tool_calls
assert tool_calls is None or len(tool_calls) == 0
@pytest.mark.parametrize(
"test_case",
[
"inference:chat_completion:structured_output",
],
)
def test_openai_chat_completion_structured_output(openai_client, text_model_id, test_case):
# Note: Skip condition may need adjustment for OpenAI client
class AnswerFormat(BaseModel):
first_name: str
last_name: str
year_of_birth: int
tc = TestCase(test_case)
response = openai_client.chat.completions.create(
model=text_model_id,
messages=tc["messages"],
response_format={
"type": "json_schema",
"json_schema": {
"name": "AnswerFormat",
"schema": AnswerFormat.model_json_schema(),
},
},
stream=False,
)
print(response.choices[0].message.content)
answer = AnswerFormat.model_validate_json(response.choices[0].message.content)
expected = tc["expected"]
assert answer.first_name == expected["first_name"]
assert answer.last_name == expected["last_name"]
assert answer.year_of_birth == expected["year_of_birth"]
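The structured-output test above hinges on a round trip between a Pydantic model and the json_schema response format: the schema generated from the model constrains the completion, and the returned JSON is validated back into the model. A minimal sketch of that round trip with no inference call; the JSON string stands in for a model response and the values are illustrative.

from pydantic import BaseModel


class AnswerFormat(BaseModel):
    first_name: str
    last_name: str
    year_of_birth: int


response_format = {
    "type": "json_schema",
    "json_schema": {"name": "AnswerFormat", "schema": AnswerFormat.model_json_schema()},
}

raw = '{"first_name": "Michael", "last_name": "Jordan", "year_of_birth": 1963}'
answer = AnswerFormat.model_validate_json(raw)
assert answer.year_of_birth == 1963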

View file

@ -0,0 +1,77 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import base64
import pathlib
import pytest
@pytest.fixture
def image_path():
return pathlib.Path(__file__).parent / "dog.png"
@pytest.fixture
def base64_image_data(image_path):
return base64.b64encode(image_path.read_bytes()).decode("utf-8")
async def test_openai_chat_completion_image_url(openai_client, vision_model_id):
message = {
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/integration/inference/dog.png"
},
},
{
"type": "text",
"text": "Describe what is in this image.",
},
],
}
response = openai_client.chat.completions.create(
model=vision_model_id,
messages=[message],
stream=False,
)
message_content = response.choices[0].message.content.lower().strip()
assert len(message_content) > 0
assert any(expected in message_content for expected in {"dog", "puppy", "pup"})
async def test_openai_chat_completion_image_data(openai_client, vision_model_id, base64_image_data):
message = {
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": f"data:image/png;base64,{base64_image_data}",
},
},
{
"type": "text",
"text": "Describe what is in this image.",
},
],
}
response = openai_client.chat.completions.create(
model=vision_model_id,
messages=[message],
stream=False,
)
message_content = response.choices[0].message.content.lower().strip()
assert len(message_content) > 0
assert any(expected in message_content for expected in {"dog", "puppy", "pup"})

View file

@ -1,545 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from time import sleep
import pytest
from pydantic import BaseModel
from llama_stack.models.llama.sku_list import resolve_model
from ..test_cases.test_case import TestCase
PROVIDER_LOGPROBS_TOP_K = {"remote::together", "remote::fireworks", "remote::vllm"}
def skip_if_model_doesnt_support_completion(client_with_models, model_id):
models = {m.identifier: m for m in client_with_models.models.list()}
models.update({m.provider_resource_id: m for m in client_with_models.models.list()})
provider_id = models[model_id].provider_id
providers = {p.provider_id: p for p in client_with_models.providers.list()}
provider = providers[provider_id]
if (
provider.provider_type
in (
"remote::openai",
"remote::anthropic",
"remote::gemini",
"remote::vertexai",
"remote::groq",
"remote::sambanova",
"remote::azure",
)
or "openai-compat" in provider.provider_type
):
pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support completion")
def skip_if_model_doesnt_support_json_schema_structured_output(client_with_models, model_id):
models = {m.identifier: m for m in client_with_models.models.list()}
models.update({m.provider_resource_id: m for m in client_with_models.models.list()})
provider_id = models[model_id].provider_id
providers = {p.provider_id: p for p in client_with_models.providers.list()}
provider = providers[provider_id]
if provider.provider_type in ("remote::sambanova", "remote::azure", "remote::watsonx"):
pytest.skip(
f"Model {model_id} hosted by {provider.provider_type} doesn't support json_schema structured output"
)
def get_llama_model(client_with_models, model_id):
models = {}
for m in client_with_models.models.list():
models[m.identifier] = m
models[m.provider_resource_id] = m
assert model_id in models, f"Model {model_id} not found"
model = models[model_id]
ids = (model.identifier, model.provider_resource_id)
for mid in ids:
if resolve_model(mid):
return mid
return model.metadata.get("llama_model", None)
@pytest.mark.parametrize(
"test_case",
[
"inference:completion:sanity",
],
)
def test_text_completion_non_streaming(client_with_models, text_model_id, test_case):
skip_if_model_doesnt_support_completion(client_with_models, text_model_id)
tc = TestCase(test_case)
response = client_with_models.inference.completion(
content=tc["content"],
stream=False,
model_id=text_model_id,
sampling_params={
"max_tokens": 50,
},
)
assert len(response.content) > 10
# assert "blue" in response.content.lower().strip()
@pytest.mark.parametrize(
"test_case",
[
"inference:completion:sanity",
],
)
def test_text_completion_streaming(client_with_models, text_model_id, test_case):
skip_if_model_doesnt_support_completion(client_with_models, text_model_id)
tc = TestCase(test_case)
response = client_with_models.inference.completion(
content=tc["content"],
stream=True,
model_id=text_model_id,
sampling_params={
"max_tokens": 50,
},
)
streamed_content = [chunk.delta for chunk in response]
content_str = "".join(streamed_content).lower().strip()
# assert "blue" in content_str
assert len(content_str) > 10
@pytest.mark.parametrize(
"test_case",
[
"inference:completion:stop_sequence",
],
)
def test_text_completion_stop_sequence(client_with_models, text_model_id, inference_provider_type, test_case):
skip_if_model_doesnt_support_completion(client_with_models, text_model_id)
# This is only supported/tested for remote vLLM: https://github.com/meta-llama/llama-stack/issues/1771
if inference_provider_type != "remote::vllm":
pytest.xfail(f"{inference_provider_type} doesn't support 'stop' parameter yet")
tc = TestCase(test_case)
response = client_with_models.inference.completion(
content=tc["content"],
stream=True,
model_id=text_model_id,
sampling_params={
"max_tokens": 50,
"stop": ["1963"],
},
)
streamed_content = [chunk.delta for chunk in response]
content_str = "".join(streamed_content).lower().strip()
assert "1963" not in content_str
@pytest.mark.parametrize(
"test_case",
[
"inference:completion:log_probs",
],
)
def test_text_completion_log_probs_non_streaming(client_with_models, text_model_id, inference_provider_type, test_case):
skip_if_model_doesnt_support_completion(client_with_models, text_model_id)
if inference_provider_type not in PROVIDER_LOGPROBS_TOP_K:
pytest.xfail(f"{inference_provider_type} doesn't support log probs yet")
tc = TestCase(test_case)
response = client_with_models.inference.completion(
content=tc["content"],
stream=False,
model_id=text_model_id,
sampling_params={
"max_tokens": 5,
},
logprobs={
"top_k": 1,
},
)
assert response.logprobs, "Logprobs should not be empty"
assert 1 <= len(response.logprobs) <= 5 # each token has 1 logprob and here max_tokens=5
assert all(len(logprob.logprobs_by_token) == 1 for logprob in response.logprobs)
@pytest.mark.parametrize(
"test_case",
[
"inference:completion:log_probs",
],
)
def test_text_completion_log_probs_streaming(client_with_models, text_model_id, inference_provider_type, test_case):
skip_if_model_doesnt_support_completion(client_with_models, text_model_id)
if inference_provider_type not in PROVIDER_LOGPROBS_TOP_K:
pytest.xfail(f"{inference_provider_type} doesn't support log probs yet")
tc = TestCase(test_case)
response = client_with_models.inference.completion(
content=tc["content"],
stream=True,
model_id=text_model_id,
sampling_params={
"max_tokens": 5,
},
logprobs={
"top_k": 1,
},
)
streamed_content = list(response)
for chunk in streamed_content:
if chunk.delta: # if there's a token, we expect logprobs
assert chunk.logprobs, "Logprobs should not be empty"
assert all(len(logprob.logprobs_by_token) == 1 for logprob in chunk.logprobs)
else: # no token, no logprobs
assert not chunk.logprobs, "Logprobs should be empty"
@pytest.mark.parametrize(
"test_case",
[
"inference:completion:structured_output",
],
)
def test_text_completion_structured_output(client_with_models, text_model_id, test_case):
skip_if_model_doesnt_support_completion(client_with_models, text_model_id)
skip_if_model_doesnt_support_json_schema_structured_output(client_with_models, text_model_id)
class AnswerFormat(BaseModel):
name: str
year_born: str
year_retired: str
tc = TestCase(test_case)
user_input = tc["user_input"]
response = client_with_models.inference.completion(
model_id=text_model_id,
content=user_input,
stream=False,
sampling_params={
"max_tokens": 50,
},
response_format={
"type": "json_schema",
"json_schema": AnswerFormat.model_json_schema(),
},
)
answer = AnswerFormat.model_validate_json(response.content)
expected = tc["expected"]
assert answer.name == expected["name"]
assert answer.year_born == expected["year_born"]
assert answer.year_retired == expected["year_retired"]
@pytest.mark.parametrize(
"test_case",
[
"inference:chat_completion:non_streaming_01",
"inference:chat_completion:non_streaming_02",
],
)
def test_text_chat_completion_non_streaming(client_with_models, text_model_id, test_case):
tc = TestCase(test_case)
question = tc["question"]
expected = tc["expected"]
response = client_with_models.inference.chat_completion(
model_id=text_model_id,
messages=[
{
"role": "user",
"content": question,
}
],
stream=False,
)
message_content = response.completion_message.content.lower().strip()
assert len(message_content) > 0
assert expected.lower() in message_content
@pytest.mark.parametrize(
"test_case",
[
"inference:chat_completion:streaming_01",
"inference:chat_completion:streaming_02",
],
)
def test_text_chat_completion_streaming(client_with_models, text_model_id, test_case):
tc = TestCase(test_case)
question = tc["question"]
expected = tc["expected"]
response = client_with_models.inference.chat_completion(
model_id=text_model_id,
messages=[{"role": "user", "content": question}],
stream=True,
timeout=120, # Increase timeout to 2 minutes for large conversation history
)
streamed_content = [str(chunk.event.delta.text.lower().strip()) for chunk in response]
assert len(streamed_content) > 0
assert expected.lower() in "".join(streamed_content)
@pytest.mark.parametrize(
"test_case",
[
"inference:chat_completion:tool_calling",
],
)
def test_text_chat_completion_with_tool_calling_and_non_streaming(client_with_models, text_model_id, test_case):
tc = TestCase(test_case)
response = client_with_models.inference.chat_completion(
model_id=text_model_id,
messages=tc["messages"],
tools=tc["tools"],
tool_choice="auto",
stream=False,
)
# some models can return content for the response in addition to the tool call
assert response.completion_message.role == "assistant"
assert len(response.completion_message.tool_calls) == 1
assert response.completion_message.tool_calls[0].tool_name == tc["tools"][0]["tool_name"]
assert response.completion_message.tool_calls[0].arguments == tc["expected"]
# Will extract streamed text and separate it from tool invocation content
# The returned tool invocation content will be a string so it's easy to compare with the expected value
# e.g. "[get_weather, {'location': 'San Francisco, CA'}]"
def extract_tool_invocation_content(response):
tool_invocation_content: str = ""
for chunk in response:
delta = chunk.event.delta
if delta.type == "tool_call" and delta.parse_status == "succeeded":
call = delta.tool_call
tool_invocation_content += f"[{call.tool_name}, {call.arguments}]"
return tool_invocation_content
@pytest.mark.parametrize(
"test_case",
[
"inference:chat_completion:tool_calling",
],
)
def test_text_chat_completion_with_tool_calling_and_streaming(client_with_models, text_model_id, test_case):
tc = TestCase(test_case)
response = client_with_models.inference.chat_completion(
model_id=text_model_id,
messages=tc["messages"],
tools=tc["tools"],
tool_choice="auto",
stream=True,
)
tool_invocation_content = extract_tool_invocation_content(response)
expected_tool_name = tc["tools"][0]["tool_name"]
expected_argument = tc["expected"]
assert tool_invocation_content == f"[{expected_tool_name}, {expected_argument}]"
@pytest.mark.parametrize(
"test_case",
[
"inference:chat_completion:tool_calling",
],
)
def test_text_chat_completion_with_tool_choice_required(client_with_models, text_model_id, test_case):
tc = TestCase(test_case)
response = client_with_models.inference.chat_completion(
model_id=text_model_id,
messages=tc["messages"],
tools=tc["tools"],
tool_config={
"tool_choice": "required",
},
stream=True,
)
tool_invocation_content = extract_tool_invocation_content(response)
expected_tool_name = tc["tools"][0]["tool_name"]
expected_argument = tc["expected"]
assert tool_invocation_content == f"[{expected_tool_name}, {expected_argument}]"
@pytest.mark.parametrize(
"test_case",
[
"inference:chat_completion:tool_calling",
],
)
def test_text_chat_completion_with_tool_choice_none(client_with_models, text_model_id, test_case):
tc = TestCase(test_case)
response = client_with_models.inference.chat_completion(
model_id=text_model_id,
messages=tc["messages"],
tools=tc["tools"],
tool_config={"tool_choice": "none"},
stream=True,
)
tool_invocation_content = extract_tool_invocation_content(response)
assert tool_invocation_content == ""
@pytest.mark.parametrize(
"test_case",
[
"inference:chat_completion:structured_output",
],
)
def test_text_chat_completion_structured_output(client_with_models, text_model_id, test_case):
skip_if_model_doesnt_support_json_schema_structured_output(client_with_models, text_model_id)
class NBAStats(BaseModel):
year_for_draft: int
num_seasons_in_nba: int
class AnswerFormat(BaseModel):
first_name: str
last_name: str
year_of_birth: int
nba_stats: NBAStats
tc = TestCase(test_case)
response = client_with_models.inference.chat_completion(
model_id=text_model_id,
messages=tc["messages"],
response_format={
"type": "json_schema",
"json_schema": AnswerFormat.model_json_schema(),
},
stream=False,
)
answer = AnswerFormat.model_validate_json(response.completion_message.content)
expected = tc["expected"]
assert answer.first_name == expected["first_name"]
assert answer.last_name == expected["last_name"]
assert answer.year_of_birth == expected["year_of_birth"]
assert answer.nba_stats.num_seasons_in_nba == expected["num_seasons_in_nba"]
assert answer.nba_stats.year_for_draft == expected["year_for_draft"]
@pytest.mark.parametrize("streaming", [True, False])
@pytest.mark.parametrize(
"test_case",
[
"inference:chat_completion:tool_calling_tools_absent",
],
)
def test_text_chat_completion_tool_calling_tools_not_in_request(
client_with_models, text_model_id, test_case, streaming
):
tc = TestCase(test_case)
# TODO: more dynamic lookup on tool_prompt_format for model family
tool_prompt_format = "json" if "3.1" in text_model_id else "python_list"
request = {
"model_id": text_model_id,
"messages": tc["messages"],
"tools": tc["tools"],
"tool_choice": "auto",
"tool_prompt_format": tool_prompt_format,
"stream": streaming,
}
response = client_with_models.inference.chat_completion(**request)
if streaming:
for chunk in response:
delta = chunk.event.delta
if delta.type == "tool_call" and delta.parse_status == "succeeded":
assert delta.tool_call.tool_name == "get_object_namespace_list"
if delta.type == "tool_call" and delta.parse_status == "failed":
# expect raw message that failed to parse in tool_call
assert isinstance(delta.tool_call, str)
assert len(delta.tool_call) > 0
else:
for tc in response.completion_message.tool_calls:
assert tc.tool_name == "get_object_namespace_list"
@pytest.mark.parametrize(
"test_case",
[
# Tests if the model can handle simple messages like "Hi" or
# a message unrelated to one of the tool calls
"inference:chat_completion:text_then_tool",
# Tests if the model can do full tool call with responses correctly
"inference:chat_completion:tool_then_answer",
# Tests if model can generate multiple params and
# read outputs correctly
"inference:chat_completion:array_parameter",
],
)
def test_text_chat_completion_with_multi_turn_tool_calling(client_with_models, text_model_id, test_case):
"""This test tests the model's tool calling loop in various scenarios"""
if "llama-4" not in text_model_id.lower() and "llama4" not in text_model_id.lower():
pytest.xfail("Not tested for non-llama4 models yet")
tc = TestCase(test_case)
messages = []
# keep going until either
# 1. we have messages to test in multi-turn
# 2. no messages left but the last message is a tool response
while len(tc["messages"]) > 0 or (len(messages) > 0 and messages[-1]["role"] == "tool"):
# do not take new messages if last message is tool response
if len(messages) == 0 or messages[-1]["role"] != "tool":
new_messages = tc["messages"].pop(0)
messages += new_messages
# pprint(messages)
response = client_with_models.inference.chat_completion(
model_id=text_model_id,
messages=messages,
tools=tc["tools"],
stream=False,
sampling_params={
"strategy": {
"type": "top_p",
"top_p": 0.9,
"temperature": 0.6,
}
},
)
op_msg = response.completion_message
messages.append(op_msg.model_dump())
# print(op_msg)
assert op_msg.role == "assistant"
expected = tc["expected"].pop(0)
assert len(op_msg.tool_calls) == expected["num_tool_calls"]
if expected["num_tool_calls"] > 0:
assert op_msg.tool_calls[0].tool_name == expected["tool_name"]
assert op_msg.tool_calls[0].arguments == expected["tool_arguments"]
tool_response = tc["tool_responses"].pop(0)
messages.append(
# Tool Response Message
{
"role": "tool",
"call_id": op_msg.tool_calls[0].call_id,
"content": tool_response["response"],
}
)
else:
actual_answer = op_msg.content.lower()
# pprint(actual_answer)
assert expected["answer"] in actual_answer
# sleep to avoid rate limit
sleep(1)

View file

@ -25,16 +25,19 @@ def base64_image_data(image_path):
return base64.b64encode(image_path.read_bytes()).decode("utf-8")
@pytest.fixture
def base64_image_url(base64_image_data):
return f"data:image/png;base64,{base64_image_data}"
def test_image_chat_completion_non_streaming(client_with_models, vision_model_id):
message = {
"role": "user",
"content": [
{
"type": "image",
"image": {
"url": {
"uri": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/integration/inference/dog.png"
},
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/integration/inference/dog.png"
},
},
{
@ -43,12 +46,12 @@ def test_image_chat_completion_non_streaming(client_with_models, vision_model_id
},
],
}
response = client_with_models.inference.chat_completion(
model_id=vision_model_id,
response = client_with_models.chat.completions.create(
model=vision_model_id,
messages=[message],
stream=False,
)
message_content = response.completion_message.content.lower().strip()
message_content = response.choices[0].message.content.lower().strip()
assert len(message_content) > 0
assert any(expected in message_content for expected in {"dog", "puppy", "pup"})
@ -68,8 +71,13 @@ def multi_image_data():
return encoded_files
@pytest.fixture
def multi_image_url(multi_image_data):
return [f"data:image/jpeg;base64,{data}" for data in multi_image_data]
@pytest.mark.parametrize("stream", [True, False])
def test_image_chat_completion_multiple_images(client_with_models, vision_model_id, multi_image_data, stream):
def test_image_chat_completion_multiple_images(client_with_models, vision_model_id, multi_image_url, stream):
supported_models = ["llama-4", "gpt-4o", "llama4"]
if not any(model in vision_model_id.lower() for model in supported_models):
pytest.skip(
@ -81,15 +89,15 @@ def test_image_chat_completion_multiple_images(client_with_models, vision_model_
"role": "user",
"content": [
{
"type": "image",
"image": {
"data": multi_image_data[0],
"type": "image_url",
"image_url": {
"url": multi_image_url[0],
},
},
{
"type": "image",
"image": {
"data": multi_image_data[1],
"type": "image_url",
"image_url": {
"url": multi_image_url[1],
},
},
{
@ -99,17 +107,17 @@ def test_image_chat_completion_multiple_images(client_with_models, vision_model_
],
},
]
response = client_with_models.inference.chat_completion(
model_id=vision_model_id,
response = client_with_models.chat.completions.create(
model=vision_model_id,
messages=messages,
stream=stream,
)
if stream:
message_content = ""
for chunk in response:
message_content += chunk.event.delta.text
message_content += chunk.choices[0].delta.content
else:
message_content = response.completion_message.content
message_content = response.choices[0].message.content
assert len(message_content) > 0
assert any(expected in message_content.lower().strip() for expected in {"bedroom"}), message_content
@ -125,17 +133,17 @@ def test_image_chat_completion_multiple_images(client_with_models, vision_model_
"role": "user",
"content": [
{
"type": "image",
"image": {
"data": multi_image_data[2],
"type": "image_url",
"image_url": {
"url": multi_image_data[2],
},
},
{"type": "text", "text": "How about this one?"},
],
},
)
response = client_with_models.inference.chat_completion(
model_id=vision_model_id,
response = client_with_models.chat.completions.create(
model=vision_model_id,
messages=messages,
stream=stream,
)
@ -144,7 +152,7 @@ def test_image_chat_completion_multiple_images(client_with_models, vision_model_
for chunk in response:
message_content += chunk.event.delta.text
else:
message_content = response.completion_message.content
message_content = response.choices[0].message.content
assert len(message_content) > 0
assert any(expected in message_content.lower().strip() for expected in {"sword", "shield"}), message_content
@ -154,11 +162,9 @@ def test_image_chat_completion_streaming(client_with_models, vision_model_id):
"role": "user",
"content": [
{
"type": "image",
"image": {
"url": {
"uri": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/integration/inference/dog.png"
},
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/integration/inference/dog.png"
},
},
{
@ -167,23 +173,23 @@ def test_image_chat_completion_streaming(client_with_models, vision_model_id):
},
],
}
response = client_with_models.inference.chat_completion(
model_id=vision_model_id,
response = client_with_models.chat.completions.create(
model=vision_model_id,
messages=[message],
stream=True,
)
streamed_content = ""
for chunk in response:
streamed_content += chunk.event.delta.text.lower()
streamed_content += chunk.choices[0].delta.content.lower()
assert len(streamed_content) > 0
assert any(expected in streamed_content for expected in {"dog", "puppy", "pup"})
def test_image_chat_completion_base64(client_with_models, vision_model_id, base64_image_data):
def test_image_chat_completion_base64(client_with_models, vision_model_id, base64_image_url):
image_spec = {
"type": "image",
"image": {
"data": base64_image_data,
"type": "image_url",
"image_url": {
"url": base64_image_url,
},
}
@ -197,10 +203,10 @@ def test_image_chat_completion_base64(client_with_models, vision_model_id, base6
},
],
}
response = client_with_models.inference.chat_completion(
model_id=vision_model_id,
response = client_with_models.chat.completions.create(
model=vision_model_id,
messages=[message],
stream=False,
)
message_content = response.completion_message.content.lower().strip()
message_content = response.choices[0].message.content.lower().strip()
assert len(message_content) > 0
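After this migration every vision test builds OpenAI-style content parts: type "image_url" carrying either a remote URL or a base64 data URL, sent through chat.completions.create. A compact sketch of both forms outside the test harness; the client endpoint, API key, and model id are placeholders, not values from this commit.

import base64
import pathlib

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")  # placeholder endpoint
image_b64 = base64.b64encode(pathlib.Path("dog.png").read_bytes()).decode("utf-8")

message = {
    "role": "user",
    "content": [
        {
            "type": "image_url",
            "image_url": {
                "url": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/integration/inference/dog.png"
            },
        },
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
        {"type": "text", "text": "Describe what is in these images."},
    ],
}

response = client.chat.completions.create(model="<vision-model-id>", messages=[message], stream=False)
print(response.choices[0].message.content)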

View file

@ -14,6 +14,13 @@ from . import skip_in_github_actions
# LLAMA_STACK_CONFIG="nvidia" pytest -v tests/integration/providers/nvidia/test_datastore.py
@pytest.fixture(autouse=True)
def skip_if_no_nvidia_provider(llama_stack_client):
provider_types = {p.provider_type for p in llama_stack_client.providers.list() if p.api == "datasetio"}
if "remote::nvidia" not in provider_types:
pytest.skip("datasetio=remote::nvidia provider not configured, skipping")
# nvidia provider only
@skip_in_github_actions
@pytest.mark.parametrize(

View file

@ -0,0 +1,167 @@
{
"request": {
"method": "POST",
"url": "http://localhost:11434/api/generate",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"raw": true,
"prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant. You have access to functions, but you should only use them if they are required.\nYou are an expert in composing functions. You are given a question and a set of possible functions.\nBased on the question, you may or may not need to make one function/tool call to achieve the purpose.\n\nIf you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]\nIf you decide to invoke a function, you SHOULD NOT include any other text in the response. besides the function call in the above format.\nFor a boolean parameter, be sure to use `True` or `False` (capitalized) for the value.\n\n\nHere is a list of functions in JSON format that you can invoke.\n\n[\n {\n \"name\": \"greet_everyone\",\n \"description\": \"\",\n \"parameters\": {\n \"type\": \"dict\",\n \"required\": [\"url\"],\n \"properties\": {\n \"url\": {\n \"type\": \"string\",\n \"description\": \"\"\n }\n }\n }\n },\n {\n \"name\": \"get_boiling_point\",\n \"description\": \"\n Returns the boiling point of a liquid in Celsius or Fahrenheit.\n\n :param liquid_name: The name of the liquid\n :param celsius: Whether to return the boiling point in Celsius\n :return: The boiling point of the liquid in Celcius or Fahrenheit\n \",\n \"parameters\": {\n \"type\": \"dict\",\n \"required\": [\"liquid_name\"],\n \"properties\": {\n \"liquid_name\": {\n \"type\": \"string\",\n \"description\": \"\"\n },\n \"celsius\": {\n \"type\": \"boolean\",\n \"description\": \"\",\n \"default\": \"True\"\n }\n }\n }\n }\n]\n\nYou can answer general questions or invoke tools when necessary.\nIn addition to tool calls, you should also augment your responses by using the tool outputs.\nYou are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSay hi to the world. Use tools to do so.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n[greet_everyone(url=\"world\")]<|eot_id|><|start_header_id|>ipython<|end_header_id|>\n\nHello, world!<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
"options": {
"temperature": 0.0
},
"stream": true
},
"endpoint": "/api/generate",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": [
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.663224Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "How",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.706706Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": " can",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.751075Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": " I",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.794187Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": " assist",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.837831Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": " you",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.879926Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": " further",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.92182Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "?",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.963339Z",
"done": true,
"done_reason": "stop",
"total_duration": 492973041,
"load_duration": 103979375,
"prompt_eval_count": 482,
"prompt_eval_duration": 87032041,
"eval_count": 8,
"eval_duration": 300586375,
"response": "",
"thinking": null,
"context": null
}
}
],
"is_streaming": true
}
}

View file

@@ -0,0 +1,31 @@
{
"request": {
"method": "POST",
"url": "https://api.fireworks.ai/inference/v1/v1/embeddings",
"headers": {},
"body": {
"model": "accounts/fireworks/models/qwen3-embedding-8b",
"input": [],
"encoding_format": "float"
},
"endpoint": "/v1/embeddings",
"model": "accounts/fireworks/models/qwen3-embedding-8b"
},
"response": {
"body": {
"__type__": "openai.types.create_embedding_response.CreateEmbeddingResponse",
"__data__": {
"data": [],
"model": "accounts/fireworks/models/qwen3-embedding-8b",
"object": "list",
"usage": {
"prompt_tokens": 0,
"total_tokens": 0,
"completion_tokens": 0
},
"perf_metrics": null
}
},
"is_streaming": false
}
}

File diff suppressed because it is too large

View file

@@ -0,0 +1,89 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant. Michael Jordan was born in 1963. He played basketball for the Chicago Bulls for 15 seasons."
},
{
"role": "user",
"content": "Please give me information about Michael Jordan."
}
],
"response_format": {
"type": "json_schema",
"json_schema": {
"name": "AnswerFormat",
"schema": {
"properties": {
"first_name": {
"title": "First Name",
"type": "string"
},
"last_name": {
"title": "Last Name",
"type": "string"
},
"year_of_birth": {
"title": "Year Of Birth",
"type": "integer"
}
},
"required": [
"first_name",
"last_name",
"year_of_birth"
],
"title": "AnswerFormat",
"type": "object"
}
}
},
"stream": false
},
"endpoint": "/v1/chat/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "chatcmpl-433",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "{\"first_name\": \"Michael\", \"last_name\": \"Jordan\", \"year_of_birth\": 1963}\n\n \t\t\t\t\t\t\t\t\t\t\t \t\t ",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 1758979490,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 31,
"prompt_tokens": 60,
"total_tokens": 91,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}
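The recording above captures a structured-output request against the local Ollama OpenAI-compatible endpoint. A minimal sketch of the kind of client call that produces it (assuming the standard openai Python package; the endpoint, dummy API key, and variable names here are illustrative, not the exact test code):

from openai import OpenAI

client = OpenAI(base_url="http://0.0.0.0:11434/v1", api_key="none")  # assumed local Ollama endpoint
response = client.chat.completions.create(
    model="llama3.2:3b-instruct-fp16",
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant. Michael Jordan was born in 1963. He played basketball for the Chicago Bulls for 15 seasons.",
        },
        {"role": "user", "content": "Please give me information about Michael Jordan."},
    ],
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "AnswerFormat",
            "schema": {
                "type": "object",
                "title": "AnswerFormat",
                "properties": {
                    "first_name": {"type": "string", "title": "First Name"},
                    "last_name": {"type": "string", "title": "Last Name"},
                    "year_of_birth": {"type": "integer", "title": "Year Of Birth"},
                },
                "required": ["first_name", "last_name", "year_of_birth"],
            },
        },
    },
    stream=False,
)
# The model is constrained to emit JSON matching the schema, as seen in the recorded reply.
print(response.choices[0].message.content)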

View file

@@ -0,0 +1,31 @@
{
"request": {
"method": "POST",
"url": "https://api.fireworks.ai/inference/v1/v1/embeddings",
"headers": {},
"body": {
"model": "accounts/fireworks/models/qwen3-embedding-8b",
"input": [],
"encoding_format": "base64"
},
"endpoint": "/v1/embeddings",
"model": "accounts/fireworks/models/qwen3-embedding-8b"
},
"response": {
"body": {
"__type__": "openai.types.create_embedding_response.CreateEmbeddingResponse",
"__data__": {
"data": [],
"model": "accounts/fireworks/models/qwen3-embedding-8b",
"object": "list",
"usage": {
"prompt_tokens": 0,
"total_tokens": 0,
"completion_tokens": 0
},
"perf_metrics": null
}
},
"is_streaming": false
}
}

View file

@@ -0,0 +1,316 @@
{
"request": {
"method": "POST",
"url": "https://api.fireworks.ai/inference/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"messages": [
{
"role": "user",
"content": "What's the name of the Sun in latin?"
}
],
"stream": true
},
"endpoint": "/v1/chat/completions",
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct"
},
"response": {
"body": [
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "3745da23-2db2-45a1-8ea5-2a09bbdb6a33",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920389,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "3745da23-2db2-45a1-8ea5-2a09bbdb6a33",
"choices": [
{
"delta": {
"content": "The Latin",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920389,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "3745da23-2db2-45a1-8ea5-2a09bbdb6a33",
"choices": [
{
"delta": {
"content": " name",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920389,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "3745da23-2db2-45a1-8ea5-2a09bbdb6a33",
"choices": [
{
"delta": {
"content": " for",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920389,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "3745da23-2db2-45a1-8ea5-2a09bbdb6a33",
"choices": [
{
"delta": {
"content": " the",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920389,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "3745da23-2db2-45a1-8ea5-2a09bbdb6a33",
"choices": [
{
"delta": {
"content": " Sun",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920389,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "3745da23-2db2-45a1-8ea5-2a09bbdb6a33",
"choices": [
{
"delta": {
"content": " is",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920389,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "3745da23-2db2-45a1-8ea5-2a09bbdb6a33",
"choices": [
{
"delta": {
"content": " \"",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920389,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "3745da23-2db2-45a1-8ea5-2a09bbdb6a33",
"choices": [
{
"delta": {
"content": "Sol",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920389,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "3745da23-2db2-45a1-8ea5-2a09bbdb6a33",
"choices": [
{
"delta": {
"content": "\".",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920389,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "3745da23-2db2-45a1-8ea5-2a09bbdb6a33",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": "stop",
"index": 0,
"logprobs": null
}
],
"created": 1758920389,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 11,
"prompt_tokens": 20,
"total_tokens": 31,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
}
],
"is_streaming": true
}
}
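The streamed recording above is a sequence of chat.completion.chunk objects; the answer is recovered by concatenating each chunk's delta.content, and the closing chunk carries the usage totals. A minimal consumption sketch (assuming the standard openai Python client pointed at Fireworks, with an API key taken from an assumed environment variable; illustrative only):

import os
from openai import OpenAI

client = OpenAI(
    base_url="https://api.fireworks.ai/inference/v1",
    api_key=os.environ["FIREWORKS_API_KEY"],  # assumed env var
)
stream = client.chat.completions.create(
    model="accounts/fireworks/models/llama-v3p1-8b-instruct",
    messages=[{"role": "user", "content": "What's the name of the Sun in latin?"}],
    stream=True,
)
parts = []
for chunk in stream:
    # Each chunk may carry a partial delta; the final chunk has finish_reason "stop".
    if chunk.choices and chunk.choices[0].delta.content:
        parts.append(chunk.choices[0].delta.content)
print("".join(parts))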

View file

@@ -0,0 +1,44 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"prompt": "Hello, world!",
"logprobs": false,
"stream": false,
"extra_body": {}
},
"endpoint": "/v1/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": {
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "cmpl-74",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"text": "Hello! How can I assist you today?"
}
],
"created": 1758975636,
"model": "llama3.2:3b-instruct-fp16",
"object": "text_completion",
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 10,
"prompt_tokens": 29,
"total_tokens": 39,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

File diff suppressed because one or more lines are too long

View file

@@ -0,0 +1,92 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"messages": [
{
"role": "system",
"content": "Pretend you are a weather assistant."
},
{
"role": "user",
"content": "What's the weather like in San Francisco, CA?"
}
],
"stream": false,
"tool_choice": "auto",
"tools": [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state (both required), e.g. San Francisco, CA."
}
},
"required": [
"location"
]
}
}
}
]
},
"endpoint": "/v1/chat/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "chatcmpl-761",
"choices": [
{
"finish_reason": "tool_calls",
"index": 0,
"logprobs": null,
"message": {
"content": "",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": [
{
"id": "call_cj8ownwc",
"function": {
"arguments": "{\"location\":\"San Francisco, CA\"}",
"name": "get_weather"
},
"type": "function",
"index": 0
}
]
}
}
],
"created": 1758975113,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 18,
"prompt_tokens": 185,
"total_tokens": 203,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}
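This recording exercises OpenAI-style tool calling against the local Ollama endpoint: the model responds with a get_weather tool call instead of text. A minimal sketch of an equivalent request (assuming the standard openai Python client; the endpoint and dummy API key are illustrative, not the exact test setup):

from openai import OpenAI

client = OpenAI(base_url="http://0.0.0.0:11434/v1", api_key="none")  # assumed local Ollama endpoint
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state (both required), e.g. San Francisco, CA.",
                    }
                },
                "required": ["location"],
            },
        },
    }
]
response = client.chat.completions.create(
    model="llama3.2:3b-instruct-fp16",
    messages=[
        {"role": "system", "content": "Pretend you are a weather assistant."},
        {"role": "user", "content": "What's the weather like in San Francisco, CA?"},
    ],
    tools=tools,
    tool_choice="auto",
    stream=False,
)
# Any tool calls show up on the first choice's message, as in the recorded reply.
for call in response.choices[0].message.tool_calls or []:
    print(call.function.name, call.function.arguments)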

View file

@@ -0,0 +1,44 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"prompt": "Hello, world!",
"logprobs": true,
"stream": false,
"extra_body": {}
},
"endpoint": "/v1/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": {
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "cmpl-809",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"text": "Hello! It's nice to meet you. Is there anything I can help you with or would you like to chat?"
}
],
"created": 1758975633,
"model": "llama3.2:3b-instruct-fp16",
"object": "text_completion",
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 25,
"prompt_tokens": 29,
"total_tokens": 54,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

View file

@@ -0,0 +1,550 @@
{
"request": {
"method": "POST",
"url": "https://api.fireworks.ai/inference/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"messages": [
{
"role": "user",
"content": "What is the name of the US captial?"
}
],
"stream": true
},
"endpoint": "/v1/chat/completions",
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct"
},
"response": {
"body": [
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": "The name",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": " of",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": " the",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": " US",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": " capital",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": " is",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": " Washington",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": ",",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": " D",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": ".C",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": ".",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": " (",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": "short",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": " for",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": " District",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": " of",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": " Columbia",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": ").",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": "stop",
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 20,
"prompt_tokens": 20,
"total_tokens": 40,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
}
],
"is_streaming": true
}
}

View file

@@ -0,0 +1,60 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"messages": [
{
"role": "system",
"content": "Pretend you are a weather assistant."
},
{
"role": "user",
"content": "What's the weather like in San Francisco, CA?"
}
],
"stream": false
},
"endpoint": "/v1/chat/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "chatcmpl-123",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "Hello! As of my knowledge cutoff on December 15th, I have the latest information for you. However, please note that my data may not be entirely up-to-date.\n\nCurrently, and based on historical climate patterns, it appears to be a partly cloudy day with mild temperatures in San Francisco, CA. Expect a temperature range of around 48\u00b0F (9\u00b0C) to 54\u00b0F (12\u00b0C). It's likely to be a breezy day, with winds blowing at about 13 mph (21 km/h).\n\nHowever, if I were to look into more recent weather patterns or forecasts, I would recommend checking the latest conditions directly from reliable sources such as the National Weather Service or local news outlets for more accurate and up-to-date information.\n\nPlease let me know how I can further assist you.",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 1758978071,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 163,
"prompt_tokens": 45,
"total_tokens": 208,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

File diff suppressed because it is too large

View file

@@ -0,0 +1,56 @@
{
"request": {
"method": "POST",
"url": "https://api.fireworks.ai/inference/v1/v1/embeddings",
"headers": {},
"body": {
"model": "accounts/fireworks/models/qwen3-embedding-8b",
"input": "Test dimensions parameter",
"encoding_format": "float",
"dimensions": 16
},
"endpoint": "/v1/embeddings",
"model": "accounts/fireworks/models/qwen3-embedding-8b"
},
"response": {
"body": {
"__type__": "openai.types.create_embedding_response.CreateEmbeddingResponse",
"__data__": {
"data": [
{
"embedding": [
-0.9296875,
5.1875,
-2.140625,
0.171875,
-2.25,
-0.8359375,
-0.828125,
1.15625,
2.328125,
-1.0078125,
-3.0,
4.09375,
0.8359375,
0.1015625,
2.015625,
-1.0859375
],
"index": 0,
"object": "embedding",
"raw_output": null
}
],
"model": "accounts/fireworks/models/qwen3-embedding-8b",
"object": "list",
"usage": {
"prompt_tokens": 5,
"total_tokens": 5,
"completion_tokens": 0
},
"perf_metrics": null
}
},
"is_streaming": false
}
}

View file

@@ -0,0 +1,39 @@
{
"request": {
"method": "POST",
"url": "https://api.fireworks.ai/inference/v1/v1/embeddings",
"headers": {},
"body": {
"model": "accounts/fireworks/models/qwen3-embedding-8b",
"input": "Test dimensions parameter",
"encoding_format": "base64",
"dimensions": 16
},
"endpoint": "/v1/embeddings",
"model": "accounts/fireworks/models/qwen3-embedding-8b"
},
"response": {
"body": {
"__type__": "openai.types.create_embedding_response.CreateEmbeddingResponse",
"__data__": {
"data": [
{
"embedding": "AABuvwAApkAAAAnAAAAwPgAAEMAAAFa/AABUvwAAlD8AABVAAACBvwAAQMAAAINAAABWPwAA0D0AAAFAAACLvw==",
"index": 0,
"object": "embedding",
"raw_output": null
}
],
"model": "accounts/fireworks/models/qwen3-embedding-8b",
"object": "list",
"usage": {
"prompt_tokens": 5,
"total_tokens": 5,
"completion_tokens": 0
},
"perf_metrics": null
}
},
"is_streaming": false
}
}
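The two embedding recordings above exercise the dimensions parameter (16-dimensional output) in float and base64 encodings. A minimal sketch of the corresponding request (assuming the standard openai Python client with the public Fireworks base URL and an API key from an assumed environment variable; illustrative only):

import os
from openai import OpenAI

client = OpenAI(
    base_url="https://api.fireworks.ai/inference/v1",
    api_key=os.environ["FIREWORKS_API_KEY"],  # assumed env var
)
resp = client.embeddings.create(
    model="accounts/fireworks/models/qwen3-embedding-8b",
    input="Test dimensions parameter",
    dimensions=16,
    encoding_format="float",
)
print(len(resp.data[0].embedding))  # expected: 16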

View file

@@ -0,0 +1,56 @@
{
"request": {
"method": "POST",
"url": "https://api.fireworks.ai/inference/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"messages": [
{
"role": "user",
"content": "Hello, world!"
}
],
"stream": false
},
"endpoint": "/v1/chat/completions",
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "1d64ff81-b7c4-40c6-9509-cca71759da3e",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "Hello! It's nice to meet you. Is there something I can help you with, or would you like to chat?",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 1758920401,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 26,
"prompt_tokens": 14,
"total_tokens": 40,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

View file

@@ -0,0 +1,347 @@
{
"request": {
"method": "POST",
"url": "http://localhost:11434/api/generate",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"raw": true,
"prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant. You have access to functions, but you should only use them if they are required.\nYou are an expert in composing functions. You are given a question and a set of possible functions.\nBased on the question, you may or may not need to make one function/tool call to achieve the purpose.\n\nIf you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]\nIf you decide to invoke a function, you SHOULD NOT include any other text in the response. besides the function call in the above format.\nFor a boolean parameter, be sure to use `True` or `False` (capitalized) for the value.\n\n\nHere is a list of functions in JSON format that you can invoke.\n\n[\n {\n \"name\": \"greet_everyone\",\n \"description\": \"\",\n \"parameters\": {\n \"type\": \"dict\",\n \"required\": [\"url\"],\n \"properties\": {\n \"url\": {\n \"type\": \"string\",\n \"description\": \"\"\n }\n }\n }\n },\n {\n \"name\": \"get_boiling_point\",\n \"description\": \"\n Returns the boiling point of a liquid in Celsius or Fahrenheit.\n\n :param liquid_name: The name of the liquid\n :param celsius: Whether to return the boiling point in Celsius\n :return: The boiling point of the liquid in Celcius or Fahrenheit\n \",\n \"parameters\": {\n \"type\": \"dict\",\n \"required\": [\"liquid_name\"],\n \"properties\": {\n \"liquid_name\": {\n \"type\": \"string\",\n \"description\": \"\"\n },\n \"celsius\": {\n \"type\": \"boolean\",\n \"description\": \"\",\n \"default\": \"True\"\n }\n }\n }\n }\n]\n\nYou can answer general questions or invoke tools when necessary.\nIn addition to tool calls, you should also augment your responses by using the tool outputs.\nYou are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSay hi to the world. Use tools to do so.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n[greet_everyone(url=\"world\")]<|eot_id|><|start_header_id|>ipython<|end_header_id|>\n\nHello, world!<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHow can I assist you further?<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the boiling point of polyjuice? Use tools to answer.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
"options": {
"temperature": 0.0
},
"stream": true
},
"endpoint": "/api/generate",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": [
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.177453Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "[",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.220271Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "get",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.261232Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "_bo",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.302818Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "iling",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.344343Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "_point",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.386025Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "(",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.42778Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "liquid",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.469673Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "_name",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.512543Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "='",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.554479Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "poly",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.597092Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "ju",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.639581Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "ice",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.683223Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "',",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.72556Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": " c",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.768012Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "elsius",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.8098Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "=True",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.851578Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": ")]",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.893693Z",
"done": true,
"done_reason": "stop",
"total_duration": 885274541,
"load_duration": 99578333,
"prompt_eval_count": 514,
"prompt_eval_duration": 67915875,
"eval_count": 18,
"eval_duration": 717086791,
"response": "",
"thinking": null,
"context": null
}
}
],
"is_streaming": true
}
}

View file

@@ -0,0 +1,74 @@
{
"request": {
"method": "POST",
"url": "https://api.fireworks.ai/inference/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"messages": [
{
"role": "user",
"content": "What's the weather in Tokyo? Use the get_weather function to get the weather."
}
],
"stream": false,
"tools": [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the weather in a given city",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to get the weather for"
}
}
}
}
}
]
},
"endpoint": "/v1/chat/completions",
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "0fe94e7d-f25b-4843-ba0a-e402e0764830",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "I can\u2019t help with that. If you're looking for current weather information, I recommend checking a weather website or app, such as AccuWeather or Weather.com. Is there anything else I can help you with?",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 1758920402,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 45,
"prompt_tokens": 27,
"total_tokens": 72,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

File diff suppressed because it is too large

View file

@@ -0,0 +1,55 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"messages": [
{
"role": "user",
"content": "Test trace 0"
}
]
},
"endpoint": "/v1/chat/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "chatcmpl-272",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "I'm happy to help you with a test. Since we are in the middle of a text-based conversation, I'll do my best to simulate a simple test tracing process.\n\n**Trace Test Results**\n\nTo perform this test, please follow these steps:\n\n1. Type \"test\" on command mode.\n2. Press Enter.\n\nNow, let's start tracing...\n\nTest Tracing Results:\nTest Case: General Functions\nTest Case Result: PASS\n\nSystem Response:\n\n```\n# System Boot Time: 2023-10-13T14:30:00\n# CPU Temperature: 35\u00b0C\n# Disk Space Available: 80%\n```\n\nNext Steps?\n\nType 'done' to exit the test, or 'run' for more tests.",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 1758978134,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 152,
"prompt_tokens": 29,
"total_tokens": 181,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

View file

@@ -0,0 +1,44 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"prompt": "Return the exact same sentence and don't add additional words): Michael Jordan was born in the year of 1963",
"stop": "1963",
"stream": false,
"extra_body": {}
},
"endpoint": "/v1/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": {
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "cmpl-183",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"text": "Michael Jordan was born in the year of "
}
],
"created": 1758978053,
"model": "llama3.2:3b-instruct-fp16",
"object": "text_completion",
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 11,
"prompt_tokens": 48,
"total_tokens": 59,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

File diff suppressed because it is too large

View file

@@ -0,0 +1,112 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"messages": [
{
"role": "system",
"content": "Pretend you are a weather assistant."
},
{
"role": "user",
"content": "What's the weather like in San Francisco, CA?"
}
],
"stream": true,
"tool_choice": "auto",
"tools": [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state (both required), e.g. San Francisco, CA."
}
},
"required": [
"location"
]
}
}
}
]
},
"endpoint": "/v1/chat/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": [
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-634",
"choices": [
{
"delta": {
"content": "",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": [
{
"index": 0,
"id": "call_wubm4yax",
"function": {
"arguments": "{\"location\":\"San Francisco, CA\"}",
"name": "get_weather"
},
"type": "function"
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758975115,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-634",
"choices": [
{
"delta": {
"content": "",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": "tool_calls",
"index": 0,
"logprobs": null
}
],
"created": 1758975115,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
}
],
"is_streaming": true
}
}

View file

@@ -0,0 +1,47 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"prompt": "Return the exact same sentence and don't add additional words): Michael Jordan was born in the year of 1963",
"stop": [
"blathering",
"1963"
],
"stream": false,
"extra_body": {}
},
"endpoint": "/v1/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": {
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "cmpl-381",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"text": "Michael Jordan was born in the year of "
}
],
"created": 1758978056,
"model": "llama3.2:3b-instruct-fp16",
"object": "text_completion",
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 11,
"prompt_tokens": 48,
"total_tokens": 59,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

File diff suppressed because it is too large

View file

@@ -0,0 +1,55 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"messages": [
{
"role": "user",
"content": "Test trace 1"
}
]
},
"endpoint": "/v1/chat/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "chatcmpl-122",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "It appears you're trying to initiate a conversation or test the functionality of this AI system. I'm happy to chat with you!\n\nWould you like to:\nA) Ask me a question on a specific topic\nB) Engage in a conversational dialogue on a topic of your choice\nC) Play a text-based game\nD) Test my language understanding capabilities\n\nPlease respond with the letter of your preferred activity.",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 1758978142,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 85,
"prompt_tokens": 29,
"total_tokens": 114,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

View file

@@ -13,22 +13,23 @@
"__data__": {
"models": [
{
"model": "llama3.2-vision:11b",
"name": "llama3.2-vision:11b",
"digest": "6f2f9757ae97e8a3f8ea33d6adb2b11d93d9a35bef277cd2c0b1b5af8e8d0b1e",
"expires_at": "2025-09-03T11:51:35.966409-07:00",
"size": 12401209008,
"size_vram": 12401209008,
"model": "llama3.2:3b",
"name": "llama3.2:3b",
"digest": "a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72",
"expires_at": "2025-09-27T11:54:56.718552-07:00",
"size": 3367856128,
"size_vram": 3367856128,
"details": {
"parent_model": "",
"format": "gguf",
"family": "mllama",
"family": "llama",
"families": [
"mllama"
"llama"
],
"parameter_size": "10.7B",
"parameter_size": "3.2B",
"quantization_level": "Q4_K_M"
}
},
"context_length": 4096
}
]
}

File diff suppressed because it is too large

View file

@@ -0,0 +1,43 @@
{
"request": {
"method": "POST",
"url": "https://api.fireworks.ai/inference/v1/v1/completions",
"headers": {},
"body": {
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"prompt": "Respond to this question and explain your answer. Complete the sentence using one word: Roses are red, violets are ",
"stream": false,
"extra_body": {}
},
"endpoint": "/v1/completions",
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct"
},
"response": {
"body": {
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "1bbb8db5-63e5-40cd-8ffe-59e0e88bf8f0",
"choices": [
{
"finish_reason": "length",
"index": 0,
"logprobs": null,
"text": "4. At the beginning of the year, a woman has $5,000"
}
],
"created": 1758920353,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": {
"completion_tokens": 16,
"prompt_tokens": 25,
"total_tokens": 41,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

View file

@@ -0,0 +1,56 @@
{
"request": {
"method": "POST",
"url": "https://api.fireworks.ai/inference/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"messages": [
{
"role": "user",
"content": "Which planet has rings around it with a name starting with letter S?"
}
],
"stream": false
},
"endpoint": "/v1/chat/completions",
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "26632ea9-3481-419d-bc0d-83c177257bc4",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "There are two planets in our solar system with ring systems that have names starting with the letter S:\n\n1. **Saturn** - Its ring system is one of the most iconic and well-known in our solar system. The rings are made up of ice and rock particles that range in size from tiny dust grains to massive boulders.\n2. **Saturn's moon** - The ring system of **Saturn's moon, Rhea**, is sometimes referred to as a \"ring system\" even though it's much smaller and less prominent than Saturn's. However, it's worth noting that Rhea's ring system is not as well-known as Saturn's.\n\nIf you're looking for a planet with a ring system that starts with the letter S and is not a moon, then the answer is Saturn!",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 1758920397,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 164,
"prompt_tokens": 24,
"total_tokens": 188,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

View file

@@ -0,0 +1,185 @@
{
"request": {
"method": "POST",
"url": "http://localhost:11434/api/generate",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"raw": true,
"prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant. You have access to functions, but you should only use them if they are required.\nYou are an expert in composing functions. You are given a question and a set of possible functions.\nBased on the question, you may or may not need to make one function/tool call to achieve the purpose.\n\nIf you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]\nIf you decide to invoke a function, you SHOULD NOT include any other text in the response. besides the function call in the above format.\nFor a boolean parameter, be sure to use `True` or `False` (capitalized) for the value.\n\n\nHere is a list of functions in JSON format that you can invoke.\n\n[\n {\n \"name\": \"greet_everyone\",\n \"description\": \"\",\n \"parameters\": {\n \"type\": \"dict\",\n \"required\": [\"url\"],\n \"properties\": {\n \"url\": {\n \"type\": \"string\",\n \"description\": \"\"\n }\n }\n }\n },\n {\n \"name\": \"get_boiling_point\",\n \"description\": \"\n Returns the boiling point of a liquid in Celsius or Fahrenheit.\n\n :param liquid_name: The name of the liquid\n :param celsius: Whether to return the boiling point in Celsius\n :return: The boiling point of the liquid in Celcius or Fahrenheit\n \",\n \"parameters\": {\n \"type\": \"dict\",\n \"required\": [\"liquid_name\"],\n \"properties\": {\n \"liquid_name\": {\n \"type\": \"string\",\n \"description\": \"\"\n },\n \"celsius\": {\n \"type\": \"boolean\",\n \"description\": \"\",\n \"default\": \"True\"\n }\n }\n }\n }\n]\n\nYou can answer general questions or invoke tools when necessary.\nIn addition to tool calls, you should also augment your responses by using the tool outputs.\nYou are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSay hi to the world. Use tools to do so.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
"options": {
"temperature": 0.0
},
"stream": true
},
"endpoint": "/api/generate",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": [
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.034121Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "[g",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.07569Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "reet",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.116927Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "_every",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.159755Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "one",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.201675Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "(url",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.243056Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "=\"",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.284651Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "world",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.326276Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "\")]",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.367959Z",
"done": true,
"done_reason": "stop",
"total_duration": 5381441291,
"load_duration": 4112439791,
"prompt_eval_count": 459,
"prompt_eval_duration": 932587833,
"eval_count": 9,
"eval_duration": 334328250,
"response": "",
"thinking": null,
"context": null
}
}
],
"is_streaming": true
}
}
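
The recording above captures a raw Ollama `/api/generate` stream in which the model emits a tool call one fragment at a time. As a minimal sketch (an assumed helper, not code from this diff), the fragments can be folded back into the final completion by concatenating each chunk's `response` field:

```python
def reassemble_ollama_stream(chunks):
    """Concatenate the 'response' field of each recorded GenerateResponse chunk."""
    text = ""
    for chunk in chunks:
        data = chunk["__data__"]
        text += data.get("response") or ""
        if data.get("done"):
            break
    return text

# Applied to the recording above, this yields the raw tool-call string:
#   [greet_everyone(url="world")]
```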

View file

@@ -0,0 +1,706 @@
{
"request": {
"method": "POST",
"url": "https://api.fireworks.ai/inference/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"messages": [
{
"role": "user",
"content": "Hello, world!"
}
],
"stream": true
},
"endpoint": "/v1/chat/completions",
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct"
},
"response": {
"body": [
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": "Hello!",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " It",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": "'s",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " nice",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " to",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " meet",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " you",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": ".",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " Is",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " there",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " something",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " I",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " can",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " help",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " you",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " with",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": ",",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " or",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " would",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " you",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " like",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " to",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " chat",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": "?",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": "stop",
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 26,
"prompt_tokens": 14,
"total_tokens": 40,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
}
],
"is_streaming": true
}
}
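
The chunks above are a Fireworks chat-completion stream in OpenAI wire format. A minimal, illustrative sketch (not code from this diff) of how a streaming client would fold the recorded deltas back into the assistant message:

```python
def collect_chat_stream(chunks):
    """Accumulate delta.content across recorded ChatCompletionChunk entries."""
    content, finish_reason = "", None
    for chunk in chunks:
        choice = chunk["__data__"]["choices"][0]
        piece = choice["delta"].get("content")
        if piece:
            content += piece
        if choice["finish_reason"]:
            finish_reason = choice["finish_reason"]
    return content, finish_reason

# For this recording: ("Hello! It's nice to meet you. Is there something I can
# help you with, or would you like to chat?", "stop")
```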

View file

@@ -0,0 +1,996 @@
{
"request": {
"method": "POST",
"url": "https://api.fireworks.ai/inference/v1/v1/completions",
"headers": {},
"body": {
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"prompt": "Respond to this question and explain your answer. Complete the sentence using one word: Roses are red, violets are ",
"max_tokens": 50,
"stream": true,
"extra_body": {}
},
"endpoint": "/v1/completions",
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct"
},
"response": {
"body": [
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " a"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " type"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " of"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " __________________"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": "_____"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": ".\n\n"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": "##"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " Step"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " "
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": "1"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": ":"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " Identify"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " the"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " type"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " of"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " flower"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " mentioned"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " in"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " the"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " sentence"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": ".\n"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": "The"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " sentence"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " mentions"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " \""
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": "vio"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": "lets"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": ".\"\n\n"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": "##"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " Step"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " "
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": "2"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": ":"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " Determine"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " the"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " type"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " of"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " flower"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " v"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": "io"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": "lets"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " are"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": ".\n"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": "V"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": "io"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": "lets"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " are"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " a"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " type"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " of"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": "length",
"index": 0,
"logprobs": null,
"text": ""
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": {
"completion_tokens": 50,
"prompt_tokens": 25,
"total_tokens": 75,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
}
],
"is_streaming": true
}
}
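
This recording is the legacy text-completion variant of the same streaming pattern: each chunk carries a `text` fragment, and the stream ends with `finish_reason: "length"` because the request capped `max_tokens` at 50. A one-line illustrative sketch for joining such a recording:

```python
def collect_text_stream(chunks):
    """Join the 'text' field of each recorded Completion chunk in order."""
    return "".join(c["__data__"]["choices"][0]["text"] for c in chunks)
```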

File diff suppressed because it is too large

View file

@@ -0,0 +1,56 @@
{
"request": {
"method": "POST",
"url": "https://api.fireworks.ai/inference/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"messages": [
{
"role": "user",
"content": "Which planet do humans live on?"
}
],
"stream": false
},
"endpoint": "/v1/chat/completions",
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "0fd60cd7-dc72-45b7-808c-4da91de80093",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "Humans live on a planet called Earth.",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 1758920388,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 9,
"prompt_tokens": 17,
"total_tokens": 26,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}
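
For comparison with the streaming cases, this is a plain non-streaming chat completion. A hedged sketch of the equivalent live call using the standard `openai` client (the base URL and API key below are placeholders, not values taken from this diff):

```python
from openai import OpenAI

# Placeholder endpoint/credentials; adjust to your Fireworks account.
client = OpenAI(
    base_url="https://api.fireworks.ai/inference/v1",
    api_key="YOUR_FIREWORKS_API_KEY",
)
resp = client.chat.completions.create(
    model="accounts/fireworks/models/llama-v3p1-8b-instruct",
    messages=[{"role": "user", "content": "Which planet do humans live on?"}],
    stream=False,
)
print(resp.choices[0].message.content)  # e.g. "Humans live on a planet called Earth."
```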

View file

@@ -0,0 +1,527 @@
{
"request": {
"method": "POST",
"url": "https://api.fireworks.ai/inference/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/flux-1-dev-fp8",
"created": 1729532889,
"object": "model",
"owned_by": "fireworks",
"kind": "FLUMINA_BASE_MODEL",
"supports_chat": false,
"supports_image_input": false,
"supports_tools": false
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/tvergho-87e44d/models/debatecards-70b-ft-3epoch-dpo-v2",
"created": 1743381121,
"object": "model",
"owned_by": "tvergho-87e44d",
"kind": "HF_PEFT_ADDON",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": false
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/flux-kontext-max",
"created": 1750714611,
"object": "model",
"owned_by": "fireworks",
"kind": "FLUMINA_BASE_MODEL",
"supports_chat": true,
"supports_image_input": true,
"supports_tools": false
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/flux-kontext-pro",
"created": 1750488264,
"object": "model",
"owned_by": "fireworks",
"kind": "FLUMINA_BASE_MODEL",
"supports_chat": true,
"supports_image_input": true,
"supports_tools": false
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/sentientfoundation-serverless/models/dobby-mini-unhinged-plus-llama-3-1-8b",
"created": 1748467427,
"object": "model",
"owned_by": "sentientfoundation-serverless",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": false,
"context_length": 131072
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/deepseek-v3",
"created": 1735576668,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 131072
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/sentientfoundation/models/dobby-unhinged-llama-3-3-70b-new",
"created": 1739563474,
"object": "model",
"owned_by": "sentientfoundation",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": false,
"context_length": 131072
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/gpt-oss-120b",
"created": 1754345600,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 131072
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/qwen3-coder-480b-a35b-instruct",
"created": 1753211090,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 262144
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/qwen3-30b-a3b-thinking-2507",
"created": 1753916446,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": false
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/qwen3-235b-a22b-instruct-2507",
"created": 1753124424,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 262144
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/qwen3-235b-a22b-thinking-2507",
"created": 1753455434,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": false,
"context_length": 262144
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/qwen3-embedding-8b",
"created": 1755707090,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": false,
"context_length": 40960
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/deepseek-v3-0324",
"created": 1742827220,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 163840
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/deepseek-v3p1-terminus",
"created": 1758586241,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 163840
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/kimi-k2-instruct",
"created": 1752259096,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 131072
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/gpt-oss-20b",
"created": 1754345466,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": false,
"context_length": 131072
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/llama4-maverick-instruct-basic",
"created": 1743878495,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": true,
"supports_tools": true,
"context_length": 1048576
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/qwen3-coder-30b-a3b-instruct",
"created": 1754063588,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": false,
"context_length": 262144
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/llama-v3p3-70b-instruct",
"created": 1733442103,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": false,
"context_length": 131072
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/qwen2p5-vl-32b-instruct",
"created": 1743392739,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": true,
"supports_tools": false,
"context_length": 128000
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/qwen3-235b-a22b",
"created": 1745885249,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 131072
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/glm-4p5-air",
"created": 1754089426,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 131072
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/deepseek-r1",
"created": 1737397673,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": false,
"context_length": 163840
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"created": 1721692808,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": false,
"context_length": 131072
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/deepseek-r1-basic",
"created": 1742306746,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": false,
"context_length": 163840
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/deepseek-v3p1",
"created": 1755758988,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 163840
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/flux-1-schnell-fp8",
"created": 1729535376,
"object": "model",
"owned_by": "fireworks",
"kind": "FLUMINA_BASE_MODEL",
"supports_chat": false,
"supports_image_input": false,
"supports_tools": false
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/glm-4p5",
"created": 1753809636,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 131072
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/kimi-k2-instruct-0905",
"created": 1757018994,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 262144
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/llama-v3p1-405b-instruct",
"created": 1721428386,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 131072
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/llama4-scout-instruct-basic",
"created": 1743878279,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": true,
"supports_tools": true,
"context_length": 1048576
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/qwen3-30b-a3b",
"created": 1745878133,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 131072
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/llama-v3p1-70b-instruct",
"created": 1721287357,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 131072
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/deepseek-r1-0528",
"created": 1748456377,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 163840
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/mixtral-8x22b-instruct",
"created": 1713375508,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 65536
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/qwen3-30b-a3b-instruct-2507",
"created": 1753808388,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": false,
"context_length": 262144
}
}
],
"is_streaming": false
}
}
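
The `/v1/models` recording above includes capability flags (`supports_chat`, `supports_tools`, `context_length`) alongside each model id. A small illustrative helper (not part of this diff) for picking the chat-capable entries out of such a recording:

```python
def chat_model_ids(recorded_models):
    """Return the ids of recorded models that advertise chat support."""
    return [
        m["__data__"]["id"]
        for m in recorded_models
        if m["__data__"].get("supports_chat")
    ]
```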

View file

@@ -0,0 +1,834 @@
{
"request": {
"method": "POST",
"url": "https://api.openai.com/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4-0613",
"created": 1686588896,
"object": "model",
"owned_by": "openai"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4",
"created": 1687882411,
"object": "model",
"owned_by": "openai"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-3.5-turbo",
"created": 1677610602,
"object": "model",
"owned_by": "openai"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-5-codex",
"created": 1757527818,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-audio-2025-08-28",
"created": 1756256146,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-realtime",
"created": 1756271701,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-realtime-2025-08-28",
"created": 1756271773,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-audio",
"created": 1756339249,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "davinci-002",
"created": 1692634301,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "babbage-002",
"created": 1692634615,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-3.5-turbo-instruct",
"created": 1692901427,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-3.5-turbo-instruct-0914",
"created": 1694122472,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "dall-e-3",
"created": 1698785189,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "dall-e-2",
"created": 1698798177,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4-1106-preview",
"created": 1698957206,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-3.5-turbo-1106",
"created": 1698959748,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "tts-1-hd",
"created": 1699046015,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "tts-1-1106",
"created": 1699053241,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "tts-1-hd-1106",
"created": 1699053533,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "text-embedding-3-small",
"created": 1705948997,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "text-embedding-3-large",
"created": 1705953180,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4-0125-preview",
"created": 1706037612,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4-turbo-preview",
"created": 1706037777,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-3.5-turbo-0125",
"created": 1706048358,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4-turbo",
"created": 1712361441,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4-turbo-2024-04-09",
"created": 1712601677,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o",
"created": 1715367049,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-2024-05-13",
"created": 1715368132,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-mini-2024-07-18",
"created": 1721172717,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-mini",
"created": 1721172741,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-2024-08-06",
"created": 1722814719,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "chatgpt-4o-latest",
"created": 1723515131,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o1-mini-2024-09-12",
"created": 1725648979,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o1-mini",
"created": 1725649008,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-realtime-preview-2024-10-01",
"created": 1727131766,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-audio-preview-2024-10-01",
"created": 1727389042,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-audio-preview",
"created": 1727460443,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-realtime-preview",
"created": 1727659998,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "omni-moderation-latest",
"created": 1731689265,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "omni-moderation-2024-09-26",
"created": 1732734466,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-realtime-preview-2024-12-17",
"created": 1733945430,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-audio-preview-2024-12-17",
"created": 1734034239,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-mini-realtime-preview-2024-12-17",
"created": 1734112601,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-mini-audio-preview-2024-12-17",
"created": 1734115920,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o1-2024-12-17",
"created": 1734326976,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o1",
"created": 1734375816,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-mini-realtime-preview",
"created": 1734387380,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-mini-audio-preview",
"created": 1734387424,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o3-mini",
"created": 1737146383,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o3-mini-2025-01-31",
"created": 1738010200,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-2024-11-20",
"created": 1739331543,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-search-preview-2025-03-11",
"created": 1741388170,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-search-preview",
"created": 1741388720,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-mini-search-preview-2025-03-11",
"created": 1741390858,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-mini-search-preview",
"created": 1741391161,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-transcribe",
"created": 1742068463,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-mini-transcribe",
"created": 1742068596,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o1-pro-2025-03-19",
"created": 1742251504,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o1-pro",
"created": 1742251791,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-mini-tts",
"created": 1742403959,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o3-2025-04-16",
"created": 1744133301,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o4-mini-2025-04-16",
"created": 1744133506,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o3",
"created": 1744225308,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o4-mini",
"created": 1744225351,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4.1-2025-04-14",
"created": 1744315746,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4.1",
"created": 1744316542,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4.1-mini-2025-04-14",
"created": 1744317547,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4.1-mini",
"created": 1744318173,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4.1-nano-2025-04-14",
"created": 1744321025,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4.1-nano",
"created": 1744321707,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-image-1",
"created": 1745517030,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "codex-mini-latest",
"created": 1746673257,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o3-pro",
"created": 1748475349,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-realtime-preview-2025-06-03",
"created": 1748907838,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-audio-preview-2025-06-03",
"created": 1748908498,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o3-pro-2025-06-10",
"created": 1749166761,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o4-mini-deep-research",
"created": 1749685485,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o3-deep-research",
"created": 1749840121,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o3-deep-research-2025-06-26",
"created": 1750865219,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o4-mini-deep-research-2025-06-26",
"created": 1750866121,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-5-chat-latest",
"created": 1754073306,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-5-2025-08-07",
"created": 1754075360,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-5",
"created": 1754425777,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-5-mini-2025-08-07",
"created": 1754425867,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-5-mini",
"created": 1754425928,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-5-nano-2025-08-07",
"created": 1754426303,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-5-nano",
"created": 1754426384,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-3.5-turbo-16k",
"created": 1683758102,
"object": "model",
"owned_by": "openai-internal"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "tts-1",
"created": 1681940951,
"object": "model",
"owned_by": "openai-internal"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "whisper-1",
"created": 1677532384,
"object": "model",
"owned_by": "openai-internal"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "text-embedding-ada-002",
"created": 1671217299,
"object": "model",
"owned_by": "openai-internal"
}
}
],
"is_streaming": false
}
}

View file

@ -0,0 +1,96 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "nomic-embed-text:latest",
"created": 1756922046,
"object": "model",
"owned_by": "library"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "all-minilm:l6-v2",
"created": 1756919946,
"object": "model",
"owned_by": "library"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "llama3.2-vision:11b",
"created": 1753926302,
"object": "model",
"owned_by": "library"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "llama3.2-vision:latest",
"created": 1753845527,
"object": "model",
"owned_by": "library"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "llama-guard3:1b",
"created": 1753479584,
"object": "model",
"owned_by": "library"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "llama3.2:1b",
"created": 1752814944,
"object": "model",
"owned_by": "library"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "all-minilm:latest",
"created": 1748994610,
"object": "model",
"owned_by": "library"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "llama3.2:3b",
"created": 1746123323,
"object": "model",
"owned_by": "library"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "llama3.2:3b-instruct-fp16",
"created": 1746052428,
"object": "model",
"owned_by": "library"
}
}
],
"is_streaming": false
}
}

View file

@ -127,9 +127,8 @@ SETUP_DEFINITIONS: dict[str, Setup] = {
name="fireworks",
description="Fireworks provider with a text model",
defaults={
"text_model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"vision_model": "accounts/fireworks/models/llama-v3p2-90b-vision-instruct",
"embedding_model": "nomic-ai/nomic-embed-text-v1.5",
"text_model": "fireworks/accounts/fireworks/models/llama-v3p1-8b-instruct",
"embedding_model": "fireworks/accounts/fireworks/models/qwen3-embedding-8b",
},
),
}

View file

@ -32,8 +32,8 @@ def setup_telemetry_data(llama_stack_client, text_model_id):
)
for i in range(2):
llama_stack_client.inference.chat_completion(
model_id=text_model_id, messages=[{"role": "user", "content": f"Test trace {i}"}]
llama_stack_client.chat.completions.create(
model=text_model_id, messages=[{"role": "user", "content": f"Test trace {i}"}]
)
start_time = time.time()
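
The hunk above migrates the telemetry setup helper from the legacy inference.chat_completion(model_id=...) call to the OpenAI-compatible chat.completions.create(model=...) surface. A minimal sketch of the new call shape, assuming a locally running stack at http://localhost:8321 and a placeholder model id:

from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed local endpoint
response = client.chat.completions.create(
    model="llama3.2:3b",  # placeholder; any registered text model id works
    messages=[{"role": "user", "content": "Test trace 0"}],
)
print(response.choices[0].message.content)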

View file

@ -83,12 +83,19 @@
],
"tools": [
{
"tool_name": "get_weather",
"description": "Get the current weather",
"parameters": {
"location": {
"param_type": "string",
"description": "The city and state (both required), e.g. San Francisco, CA."
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state (both required), e.g. San Francisco, CA."
}
},
"required": ["location"]
}
}
}
@ -116,12 +123,19 @@
],
"tools": [
{
"tool_name": "get_weather",
"description": "Get the current weather",
"parameters": {
"location": {
"param_type": "string",
"description": "The city and state (both required), e.g. San Francisco, CA."
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state (both required), e.g. San Francisco, CA."
}
},
"required": ["location"]
}
}
}
@ -162,12 +176,19 @@
],
"tools": [
{
"tool_name": "get_weather",
"description": "Get the current weather",
"parameters": {
"location": {
"param_type": "string",
"description": "The city and state (both required), e.g. San Francisco, CA."
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state (both required), e.g. San Francisco, CA."
}
},
"required": ["location"]
}
}
}
@ -192,66 +213,6 @@
]
}
},
"array_parameter": {
"data": {
"messages": [
[
{
"role": "user",
"content": "Please add a new product with name 'Widget', price 19.99, in stock, and tags ['new', 'sale'] and give me the product id."
}
]
],
"tools": [
{
"tool_name": "addProduct",
"description": "Get the current weather",
"parameters": {
"name": {
"param_type": "string",
"description": "Name of the product"
},
"price": {
"param_type": "number",
"description": "Price of the product"
},
"inStock": {
"param_type": "boolean",
"description": "Availability status of the product."
},
"tags": {
"param_type": "list[str]",
"description": "List of product tags"
}
}
}
],
"tool_responses": [
{
"response": "{'response': 'Successfully added product with id: 123'}"
}
],
"expected": [
{
"num_tool_calls": 1,
"tool_name": "addProduct",
"tool_arguments": {
"name": "Widget",
"price": 19.99,
"inStock": true,
"tags": [
"new",
"sale"
]
}
},
{
"num_tool_calls": 0,
"answer": "123"
}
]
}
},
"sample_messages_tool_calling": {
"data": {
"messages": [
@ -270,13 +231,19 @@
],
"tools": [
{
"tool_name": "get_weather",
"description": "Get the current weather",
"parameters": {
"location": {
"param_type": "string",
"description": "The city and state, e.g. San Francisco, CA",
"required": true
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state (both required), e.g. San Francisco, CA."
}
},
"required": ["location"]
}
}
}
@ -343,18 +310,23 @@
],
"tools": [
{
"tool_name": "get_object_namespace_list",
"description": "Get the list of objects in a namespace",
"parameters": {
"kind": {
"param_type": "string",
"description": "the type of object",
"required": true
},
"namespace": {
"param_type": "string",
"description": "the name of the namespace",
"required": true
"type": "function",
"function": {
"name": "get_object_namespace_list",
"description": "Get the list of objects in a namespace",
"parameters": {
"type": "object",
"properties": {
"kind": {
"type": "string",
"description": "the type of object"
},
"namespace": {
"type": "string",
"description": "the name of the namespace"
}
},
"required": ["kind", "namespace"]
}
}
}
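
The test-case changes above replace the legacy flat tool parameter map with the standard OpenAI function-calling schema: each tool is wrapped in a {"type": "function", "function": {...}} object and its parameters form a JSON Schema object with "properties" and "required". A hedged sketch of passing such a tool to an OpenAI-compatible chat completion (the client and model id are placeholders):

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state (both required), e.g. San Francisco, CA.",
                    }
                },
                "required": ["location"],
            },
        },
    }
]
response = client.chat.completions.create(
    model="llama3.2:3b",  # placeholder model id
    messages=[{"role": "user", "content": "What's the weather in San Francisco, CA?"}],
    tools=tools,
)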

View file

@ -31,6 +31,11 @@ def test_mcp_invocation(llama_stack_client, text_model_id, mcp_server):
uri = mcp_server["server_url"]
# registering should not raise an error anymore even if you don't specify the auth token
try:
llama_stack_client.toolgroups.unregister(toolgroup_id=test_toolgroup_id)
except Exception:
pass
llama_stack_client.toolgroups.register(
toolgroup_id=test_toolgroup_id,
provider_id="model-context-protocol",

View file

@ -107,14 +107,34 @@ async def test_get_raw_document_text_deprecated_text_yaml_with_text_content_item
assert "text/yaml" in str(w[0].message)

async def test_get_raw_document_text_supports_json_mime_type():
"""Test that the function accepts application/json mime type."""
json_content = '{"name": "test", "version": "1.0", "items": ["item1", "item2"]}'
document = Document(content=json_content, mime_type="application/json")
result = await get_raw_document_text(document)
assert result == json_content

async def test_get_raw_document_text_with_json_text_content_item():
"""Test that the function handles JSON TextContentItem correctly."""
json_content = '{"key": "value", "nested": {"array": [1, 2, 3]}}'
document = Document(content=TextContentItem(text=json_content), mime_type="application/json")
result = await get_raw_document_text(document)
assert result == json_content

async def test_get_raw_document_text_rejects_unsupported_mime_types():
"""Test that the function rejects unsupported mime types."""
document = Document(
content="Some content",
mime_type="application/json", # Not supported
mime_type="application/pdf", # Not supported
)
with pytest.raises(ValueError, match="Unexpected document mime type: application/json"):
with pytest.raises(ValueError, match="Unexpected document mime type: application/pdf"):
await get_raw_document_text(document)

View file

@ -16,9 +16,11 @@ from llama_stack.apis.agents import (
)
from llama_stack.apis.common.responses import PaginatedResponse
from llama_stack.apis.inference import Inference
from llama_stack.apis.resource import ResourceType
from llama_stack.apis.safety import Safety
from llama_stack.apis.tools import ToolGroups, ToolRuntime
from llama_stack.apis.tools import ListToolsResponse, Tool, ToolGroups, ToolParameter, ToolRuntime
from llama_stack.apis.vector_io import VectorIO
from llama_stack.providers.inline.agents.meta_reference.agent_instance import ChatAgent
from llama_stack.providers.inline.agents.meta_reference.agents import MetaReferenceAgentsImpl
from llama_stack.providers.inline.agents.meta_reference.config import MetaReferenceAgentsImplConfig
from llama_stack.providers.inline.agents.meta_reference.persistence import AgentInfo
@ -75,11 +77,11 @@ def sample_agent_config():
},
input_shields=["string"],
output_shields=["string"],
toolgroups=["string"],
toolgroups=["mcp::my_mcp_server"],
client_tools=[
{
"name": "string",
"description": "string",
"name": "client_tool",
"description": "Client Tool",
"parameters": [
{
"name": "string",
@ -226,3 +228,83 @@ async def test_delete_agent(agents_impl, sample_agent_config):
# Verify the agent was deleted
with pytest.raises(ValueError):
await agents_impl.get_agent(agent_id)

async def test__initialize_tools(agents_impl, sample_agent_config):
# Mock tool_groups_api.list_tools()
agents_impl.tool_groups_api.list_tools.return_value = ListToolsResponse(
data=[
Tool(
identifier="story_maker",
provider_id="model-context-protocol",
type=ResourceType.tool,
toolgroup_id="mcp::my_mcp_server",
description="Make a story",
parameters=[
ToolParameter(
name="story_title",
parameter_type="string",
description="Title of the story",
required=True,
title="Story Title",
),
ToolParameter(
name="input_words",
parameter_type="array",
description="Input words",
required=False,
items={"type": "string"},
title="Input Words",
default=[],
),
],
)
]
)
create_response = await agents_impl.create_agent(sample_agent_config)
agent_id = create_response.agent_id
# Get an instance of ChatAgent
chat_agent = await agents_impl._get_agent_impl(agent_id)
assert chat_agent is not None
assert isinstance(chat_agent, ChatAgent)
# Initialize tool definitions
await chat_agent._initialize_tools()
assert len(chat_agent.tool_defs) == 2
# Verify the first tool, which is a client tool
first_tool = chat_agent.tool_defs[0]
assert first_tool.tool_name == "client_tool"
assert first_tool.description == "Client Tool"
# Verify the second tool, which is an MCP tool that has an array-type property
second_tool = chat_agent.tool_defs[1]
assert second_tool.tool_name == "story_maker"
assert second_tool.description == "Make a story"
parameters = second_tool.parameters
assert len(parameters) == 2
# Verify a string property
story_title = parameters.get("story_title")
assert story_title is not None
assert story_title.param_type == "string"
assert story_title.description == "Title of the story"
assert story_title.required
assert story_title.items is None
assert story_title.title == "Story Title"
assert story_title.default is None
# Verify an array property
input_words = parameters.get("input_words")
assert input_words is not None
assert input_words.param_type == "array"
assert input_words.description == "Input words"
assert not input_words.required
assert input_words.items is not None
assert len(input_words.items) == 1
assert input_words.items.get("type") == "string"
assert input_words.title == "Input Words"
assert input_words.default == []

View file

@ -2,6 +2,4 @@
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .batch_inference import *
# the root directory of this source tree.

View file

@ -0,0 +1,147 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""
Unit tests for MCP tool parameter conversion in streaming responses.

These tests cover the fix for handling array-type parameters with an 'items' field
when converting MCP tool definitions to the OpenAI function-calling format.
"""
from llama_stack.apis.tools import ToolDef, ToolParameter
from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool

def test_mcp_tool_conversion_with_array_items():
"""
Test that MCP tool parameters with array type and items field are properly converted.
This is a regression test for the bug where array parameters without 'items'
caused OpenAI API validation errors like:
"Invalid schema for function 'pods_exec': In context=('properties', 'command'),
array schema missing items."
"""
# Create a tool parameter with array type and items specification
# This mimics what kubernetes-mcp-server's pods_exec tool has
tool_param = ToolParameter(
name="command",
parameter_type="array",
description="Command to execute in the pod",
required=True,
items={"type": "string"}, # This is the crucial field
)
# Convert to ToolDefinition format (as done in streaming.py)
tool_def = ToolDefinition(
tool_name="test_tool",
description="Test tool with array parameter",
parameters={
"command": ToolParamDefinition(
param_type=tool_param.parameter_type,
description=tool_param.description,
required=tool_param.required,
default=tool_param.default,
items=tool_param.items, # The fix: ensure items is passed through
)
},
)
# Convert to OpenAI format
openai_tool = convert_tooldef_to_openai_tool(tool_def)
# Verify the conversion includes the items field
assert openai_tool["type"] == "function"
assert openai_tool["function"]["name"] == "test_tool"
assert "parameters" in openai_tool["function"]
parameters = openai_tool["function"]["parameters"]
assert "properties" in parameters
assert "command" in parameters["properties"]
command_param = parameters["properties"]["command"]
assert command_param["type"] == "array"
assert "items" in command_param, "Array parameter must have 'items' field for OpenAI API"
assert command_param["items"] == {"type": "string"}

def test_mcp_tool_conversion_without_array():
"""Test that non-array parameters work correctly without items field."""
tool_param = ToolParameter(
name="name",
parameter_type="string",
description="Name parameter",
required=True,
)
tool_def = ToolDefinition(
tool_name="test_tool",
description="Test tool with string parameter",
parameters={
"name": ToolParamDefinition(
param_type=tool_param.parameter_type,
description=tool_param.description,
required=tool_param.required,
items=tool_param.items, # Will be None for non-array types
)
},
)
openai_tool = convert_tooldef_to_openai_tool(tool_def)
# Verify basic structure
assert openai_tool["type"] == "function"
parameters = openai_tool["function"]["parameters"]
assert "name" in parameters["properties"]
name_param = parameters["properties"]["name"]
assert name_param["type"] == "string"
# items should not be present for non-array types
assert "items" not in name_param or name_param.get("items") is None

def test_mcp_tool_conversion_complex_array_items():
"""Test array parameter with complex items schema (object type)."""
tool_param = ToolParameter(
name="configs",
parameter_type="array",
description="Array of configuration objects",
required=False,
items={
"type": "object",
"properties": {
"key": {"type": "string"},
"value": {"type": "string"},
},
"required": ["key"],
},
)
tool_def = ToolDefinition(
tool_name="test_tool",
description="Test tool with complex array parameter",
parameters={
"configs": ToolParamDefinition(
param_type=tool_param.parameter_type,
description=tool_param.description,
required=tool_param.required,
items=tool_param.items,
)
},
)
openai_tool = convert_tooldef_to_openai_tool(tool_def)
# Verify complex items schema is preserved
parameters = openai_tool["function"]["parameters"]
configs_param = parameters["properties"]["configs"]
assert configs_param["type"] == "array"
assert "items" in configs_param
assert configs_param["items"]["type"] == "object"
assert "properties" in configs_param["items"]
assert "key" in configs_param["items"]["properties"]
assert "value" in configs_param["items"]["properties"]

View file

@ -4,11 +4,11 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from unittest.mock import MagicMock, PropertyMock, patch
from unittest.mock import AsyncMock, MagicMock, PropertyMock, patch
import pytest
from llama_stack.apis.inference import Model
from llama_stack.apis.inference import Model, OpenAIUserMessageParam
from llama_stack.apis.models import ModelType
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
@ -43,8 +43,17 @@ class OpenAIMixinWithEmbeddingsImpl(OpenAIMixin):
@pytest.fixture
def mixin():
"""Create a test instance of OpenAIMixin"""
return OpenAIMixinImpl()
"""Create a test instance of OpenAIMixin with mocked model_store"""
mixin_instance = OpenAIMixinImpl()
# just enough to satisfy _get_provider_model_id calls
mock_model_store = MagicMock()
mock_model = MagicMock()
mock_model.provider_resource_id = "test-provider-resource-id"
mock_model_store.get_model = AsyncMock(return_value=mock_model)
mixin_instance.model_store = mock_model_store
return mixin_instance
@pytest.fixture
@ -205,6 +214,74 @@ class TestOpenAIMixinCacheBehavior:
assert "final-mock-model-id" in mixin._model_cache

class TestOpenAIMixinImagePreprocessing:
"""Test cases for image preprocessing functionality"""
async def test_openai_chat_completion_with_image_preprocessing_enabled(self, mixin):
"""Test that image URLs are converted to base64 when download_images is True"""
mixin.download_images = True
message = OpenAIUserMessageParam(
role="user",
content=[
{"type": "text", "text": "What's in this image?"},
{"type": "image_url", "image_url": {"url": "http://example.com/image.jpg"}},
],
)
mock_client = MagicMock()
mock_response = MagicMock()
mock_client.chat.completions.create = AsyncMock(return_value=mock_response)
with patch.object(type(mixin), "client", new_callable=PropertyMock, return_value=mock_client):
with patch("llama_stack.providers.utils.inference.openai_mixin.localize_image_content") as mock_localize:
mock_localize.return_value = (b"fake_image_data", "jpeg")
await mixin.openai_chat_completion(model="test-model", messages=[message])
mock_localize.assert_called_once_with("http://example.com/image.jpg")
mock_client.chat.completions.create.assert_called_once()
call_args = mock_client.chat.completions.create.call_args
processed_messages = call_args[1]["messages"]
assert len(processed_messages) == 1
content = processed_messages[0]["content"]
assert len(content) == 2
assert content[0]["type"] == "text"
assert content[1]["type"] == "image_url"
assert content[1]["image_url"]["url"] == "data:image/jpeg;base64,ZmFrZV9pbWFnZV9kYXRh"
async def test_openai_chat_completion_with_image_preprocessing_disabled(self, mixin):
"""Test that image URLs are not modified when download_images is False"""
mixin.download_images = False # explicitly set to False
message = OpenAIUserMessageParam(
role="user",
content=[
{"type": "text", "text": "What's in this image?"},
{"type": "image_url", "image_url": {"url": "http://example.com/image.jpg"}},
],
)
mock_client = MagicMock()
mock_response = MagicMock()
mock_client.chat.completions.create = AsyncMock(return_value=mock_response)
with patch.object(type(mixin), "client", new_callable=PropertyMock, return_value=mock_client):
with patch("llama_stack.providers.utils.inference.openai_mixin.localize_image_content") as mock_localize:
await mixin.openai_chat_completion(model="test-model", messages=[message])
mock_localize.assert_not_called()
mock_client.chat.completions.create.assert_called_once()
call_args = mock_client.chat.completions.create.call_args
processed_messages = call_args[1]["messages"]
assert len(processed_messages) == 1
content = processed_messages[0]["content"]
assert len(content) == 2
assert content[1]["image_url"]["url"] == "http://example.com/image.jpg"
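# For reference, the data URL asserted above follows the standard base64 data-URL
# construction; a sketch, assuming the mixin base64-encodes the downloaded bytes:
#   import base64
#   data_url = f"data:image/jpeg;base64,{base64.b64encode(b'fake_image_data').decode()}"
#   # -> "data:image/jpeg;base64,ZmFrZV9pbWFnZV9kYXRh"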

class TestOpenAIMixinEmbeddingModelMetadata:
"""Test cases for embedding_model_metadata attribute functionality"""