Merge b1cbfe99f9 into sapling-pr-archive-ehhuang

Author: ehhuang
Date: 2025-09-29 15:52:57 -07:00 (committed by GitHub)
Commit: 91898e6598

81 changed files with 51742 additions and 2402 deletions


@@ -43,7 +43,7 @@ jobs:
       # Cache oasdiff to avoid checksum failures and speed up builds
       - name: Cache oasdiff
         id: cache-oasdiff
-        uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809
+        uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830
         with:
           path: ~/oasdiff
           key: oasdiff-${{ runner.os }}


@@ -4,6 +4,8 @@ include llama_stack/models/llama/llama4/tokenizer.model
 include llama_stack/core/*.sh
 include llama_stack/cli/scripts/*.sh
 include llama_stack/distributions/*/*.yaml
-include llama_stack/providers/tests/test_cases/inference/*.json
+exclude llama_stack/distributions/ci-tests
+include tests/integration/test_cases/inference/*.json
 include llama_stack/models/llama/*/*.md
 include llama_stack/tests/integration/*.jpg
+prune llama_stack/distributions/ci-tests


@@ -139,18 +139,7 @@ Methods:
 - <code title="post /v1/agents/{agent_id}/session/{session_id}/turn">client.agents.turn.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/agents/turn.py">create</a>(session_id, \*, agent_id, \*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/agents/turn_create_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/agents/turn_create_response.py">TurnCreateResponse</a></code>
 - <code title="get /v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}">client.agents.turn.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/agents/turn.py">retrieve</a>(turn_id, \*, agent_id, session_id) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/agents/turn.py">Turn</a></code>
 
-## BatchInference
-
-Types:
-
-```python
-from llama_stack_client.types import BatchInferenceChatCompletionResponse
-```
-
-Methods:
-
-- <code title="post /v1/batch-inference/chat-completion">client.batch_inference.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/batch_inference.py">chat_completion</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/batch_inference_chat_completion_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/batch_inference_chat_completion_response.py">BatchInferenceChatCompletionResponse</a></code>
-- <code title="post /v1/batch-inference/completion">client.batch_inference.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/batch_inference.py">completion</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/batch_inference_completion_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/shared/batch_completion.py">BatchCompletion</a></code>
-
 ## Datasets


@@ -548,7 +548,6 @@ class Generator:
         if op.defining_class.__name__ in [
             "SyntheticDataGeneration",
             "PostTraining",
-            "BatchInference",
         ]:
             op.defining_class.__name__ = f"{op.defining_class.__name__} (Coming Soon)"
             print(op.defining_class.__name__)


@@ -87,94 +87,6 @@
         }
       }
     },
-    "/v1/inference/batch-chat-completion": {
-      "post": {
-        "responses": {
-          "200": {
-            "description": "A BatchChatCompletionResponse with the full completions.",
-            "content": {
-              "application/json": {
-                "schema": {
-                  "$ref": "#/components/schemas/BatchChatCompletionResponse"
-                }
-              }
-            }
-          },
-          "400": {
-            "$ref": "#/components/responses/BadRequest400"
-          },
-          "429": {
-            "$ref": "#/components/responses/TooManyRequests429"
-          },
-          "500": {
-            "$ref": "#/components/responses/InternalServerError500"
-          },
-          "default": {
-            "$ref": "#/components/responses/DefaultError"
-          }
-        },
-        "tags": [
-          "Inference"
-        ],
-        "summary": "Generate chat completions for a batch of messages using the specified model.",
-        "description": "Generate chat completions for a batch of messages using the specified model.",
-        "parameters": [],
-        "requestBody": {
-          "content": {
-            "application/json": {
-              "schema": {
-                "$ref": "#/components/schemas/BatchChatCompletionRequest"
-              }
-            }
-          },
-          "required": true
-        }
-      }
-    },
-    "/v1/inference/batch-completion": {
-      "post": {
-        "responses": {
-          "200": {
-            "description": "A BatchCompletionResponse with the full completions.",
-            "content": {
-              "application/json": {
-                "schema": {
-                  "$ref": "#/components/schemas/BatchCompletionResponse"
-                }
-              }
-            }
-          },
-          "400": {
-            "$ref": "#/components/responses/BadRequest400"
-          },
-          "429": {
-            "$ref": "#/components/responses/TooManyRequests429"
-          },
-          "500": {
-            "$ref": "#/components/responses/InternalServerError500"
-          },
-          "default": {
-            "$ref": "#/components/responses/DefaultError"
-          }
-        },
-        "tags": [
-          "Inference"
-        ],
-        "summary": "Generate completions for a batch of content using the specified model.",
-        "description": "Generate completions for a batch of content using the specified model.",
-        "parameters": [],
-        "requestBody": {
-          "content": {
-            "application/json": {
-              "schema": {
-                "$ref": "#/components/schemas/BatchCompletionRequest"
-              }
-            }
-          },
-          "required": true
-        }
-      }
-    },
     "/v1alpha/post-training/job/cancel": {
       "post": {
         "responses": {
@@ -281,7 +193,7 @@
           }
         },
         "tags": [
-          "BatchInference (Coming Soon)"
+          "Inference"
         ],
         "summary": "Generate a chat completion for the given messages using the specified model.",
         "description": "Generate a chat completion for the given messages using the specified model.",
@@ -298,55 +210,6 @@
         }
       }
     },
-    "/v1/inference/completion": {
-      "post": {
-        "responses": {
-          "200": {
-            "description": "If stream=False, returns a CompletionResponse with the full completion. If stream=True, returns an SSE event stream of CompletionResponseStreamChunk.",
-            "content": {
-              "application/json": {
-                "schema": {
-                  "$ref": "#/components/schemas/CompletionResponse"
-                }
-              },
-              "text/event-stream": {
-                "schema": {
-                  "$ref": "#/components/schemas/CompletionResponseStreamChunk"
-                }
-              }
-            }
-          },
-          "400": {
-            "$ref": "#/components/responses/BadRequest400"
-          },
-          "429": {
-            "$ref": "#/components/responses/TooManyRequests429"
-          },
-          "500": {
-            "$ref": "#/components/responses/InternalServerError500"
-          },
-          "default": {
-            "$ref": "#/components/responses/DefaultError"
-          }
-        },
-        "tags": [
-          "BatchInference (Coming Soon)"
-        ],
-        "summary": "Generate a completion for the given content using the specified model.",
-        "description": "Generate a completion for the given content using the specified model.",
-        "parameters": [],
-        "requestBody": {
-          "content": {
-            "application/json": {
-              "schema": {
-                "$ref": "#/components/schemas/CompletionRequest"
-              }
-            }
-          },
-          "required": true
-        }
-      }
-    },
     "/v1/agents": {
       "get": {
         "responses": {
@@ -6346,6 +6209,20 @@
       ],
       "title": "AppendRowsRequest"
     },
+    "CancelTrainingJobRequest": {
+      "type": "object",
+      "properties": {
+        "job_uuid": {
+          "type": "string",
+          "description": "The UUID of the job to cancel."
+        }
+      },
+      "additionalProperties": false,
+      "required": [
+        "job_uuid"
+      ],
+      "title": "CancelTrainingJobRequest"
+    },
     "CompletionMessage": {
       "type": "object",
       "properties": {
@@ -6906,6 +6783,31 @@
           "type": "boolean",
           "default": true
         },
+        "items": {
+          "oneOf": [
+            {
+              "type": "null"
+            },
+            {
+              "type": "boolean"
+            },
+            {
+              "type": "number"
+            },
+            {
+              "type": "string"
+            },
+            {
+              "type": "array"
+            },
+            {
+              "type": "object"
+            }
+          ]
+        },
+        "title": {
+          "type": "string"
+        },
         "default": {
           "oneOf": [
             {
@@ -7051,26 +6953,23 @@
       "title": "UserMessage",
       "description": "A message from the user in a chat conversation."
     },
-    "BatchChatCompletionRequest": {
+    "ChatCompletionRequest": {
       "type": "object",
       "properties": {
         "model_id": {
           "type": "string",
           "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
         },
-        "messages_batch": {
-          "type": "array",
-          "items": {
-            "type": "array",
-            "items": {
-              "$ref": "#/components/schemas/Message"
-            }
-          },
-          "description": "The messages to generate completions for."
-        },
+        "messages": {
+          "type": "array",
+          "items": {
+            "$ref": "#/components/schemas/Message"
+          },
+          "description": "List of messages in the conversation."
+        },
         "sampling_params": {
           "$ref": "#/components/schemas/SamplingParams",
-          "description": "(Optional) Parameters to control the sampling strategy."
+          "description": "Parameters to control the sampling strategy."
         },
         "tools": {
           "type": "array",
@@ -7079,13 +6978,31 @@
           },
           "description": "(Optional) List of tool definitions available to the model."
         },
-        "tool_config": {
-          "$ref": "#/components/schemas/ToolConfig",
-          "description": "(Optional) Configuration for tool use."
+        "tool_choice": {
+          "type": "string",
+          "enum": [
+            "auto",
+            "required",
+            "none"
+          ],
+          "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. .. deprecated:: Use tool_config instead."
+        },
+        "tool_prompt_format": {
+          "type": "string",
+          "enum": [
+            "json",
+            "function_tag",
+            "python_list"
+          ],
+          "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls. .. deprecated:: Use tool_config instead."
         },
         "response_format": {
           "$ref": "#/components/schemas/ResponseFormat",
-          "description": "(Optional) Grammar specification for guided (structured) decoding."
+          "description": "(Optional) Grammar specification for guided (structured) decoding. There are two options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it."
         },
+        "stream": {
+          "type": "boolean",
+          "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
+        },
         "logprobs": {
           "type": "object",
@@ -7098,32 +7015,18 @@
           },
           "additionalProperties": false,
           "description": "(Optional) If specified, log probabilities for each token position will be returned."
+        },
+        "tool_config": {
+          "$ref": "#/components/schemas/ToolConfig",
+          "description": "(Optional) Configuration for tool use."
         }
       },
       "additionalProperties": false,
       "required": [
         "model_id",
-        "messages_batch"
+        "messages"
       ],
-      "title": "BatchChatCompletionRequest"
-    },
-    "BatchChatCompletionResponse": {
-      "type": "object",
-      "properties": {
-        "batch": {
-          "type": "array",
-          "items": {
-            "$ref": "#/components/schemas/ChatCompletionResponse"
-          },
-          "description": "List of chat completion responses, one for each conversation in the batch"
-        }
-      },
-      "additionalProperties": false,
-      "required": [
-        "batch"
-      ],
-      "title": "BatchChatCompletionResponse",
-      "description": "Response from a batch chat completion request."
+      "title": "ChatCompletionRequest"
     },
     "ChatCompletionResponse": {
       "type": "object",
@@ -7203,194 +7106,6 @@
       "title": "TokenLogProbs",
       "description": "Log probabilities for generated tokens."
     },
-    "BatchCompletionRequest": {
-      "type": "object",
-      "properties": {
-        "model_id": {
-          "type": "string",
-          "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
-        },
-        "content_batch": {
-          "type": "array",
-          "items": {
-            "$ref": "#/components/schemas/InterleavedContent"
-          },
-          "description": "The content to generate completions for."
-        },
-        "sampling_params": {
-          "$ref": "#/components/schemas/SamplingParams",
-          "description": "(Optional) Parameters to control the sampling strategy."
-        },
-        "response_format": {
-          "$ref": "#/components/schemas/ResponseFormat",
-          "description": "(Optional) Grammar specification for guided (structured) decoding."
-        },
-        "logprobs": {
-          "type": "object",
-          "properties": {
-            "top_k": {
-              "type": "integer",
-              "default": 0,
-              "description": "How many tokens (for each position) to return log probabilities for."
-            }
-          },
-          "additionalProperties": false,
-          "description": "(Optional) If specified, log probabilities for each token position will be returned."
-        }
-      },
-      "additionalProperties": false,
-      "required": [
-        "model_id",
-        "content_batch"
-      ],
-      "title": "BatchCompletionRequest"
-    },
-    "BatchCompletionResponse": {
-      "type": "object",
-      "properties": {
-        "batch": {
-          "type": "array",
-          "items": {
-            "$ref": "#/components/schemas/CompletionResponse"
-          },
-          "description": "List of completion responses, one for each input in the batch"
-        }
-      },
-      "additionalProperties": false,
-      "required": [
-        "batch"
-      ],
-      "title": "BatchCompletionResponse",
-      "description": "Response from a batch completion request."
-    },
-    "CompletionResponse": {
-      "type": "object",
-      "properties": {
-        "metrics": {
-          "type": "array",
-          "items": {
-            "$ref": "#/components/schemas/MetricInResponse"
-          },
-          "description": "(Optional) List of metrics associated with the API response"
-        },
-        "content": {
-          "type": "string",
-          "description": "The generated completion text"
-        },
-        "stop_reason": {
-          "type": "string",
-          "enum": [
-            "end_of_turn",
-            "end_of_message",
-            "out_of_tokens"
-          ],
-          "description": "Reason why generation stopped"
-        },
-        "logprobs": {
-          "type": "array",
-          "items": {
-            "$ref": "#/components/schemas/TokenLogProbs"
-          },
-          "description": "Optional log probabilities for generated tokens"
-        }
-      },
-      "additionalProperties": false,
-      "required": [
-        "content",
-        "stop_reason"
-      ],
-      "title": "CompletionResponse",
-      "description": "Response from a completion request."
-    },
-    "CancelTrainingJobRequest": {
-      "type": "object",
-      "properties": {
-        "job_uuid": {
-          "type": "string",
-          "description": "The UUID of the job to cancel."
-        }
-      },
-      "additionalProperties": false,
-      "required": [
-        "job_uuid"
-      ],
-      "title": "CancelTrainingJobRequest"
-    },
-    "ChatCompletionRequest": {
-      "type": "object",
-      "properties": {
-        "model_id": {
-          "type": "string",
-          "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
-        },
-        "messages": {
-          "type": "array",
-          "items": {
-            "$ref": "#/components/schemas/Message"
-          },
-          "description": "List of messages in the conversation."
-        },
-        "sampling_params": {
-          "$ref": "#/components/schemas/SamplingParams",
-          "description": "Parameters to control the sampling strategy."
-        },
-        "tools": {
-          "type": "array",
-          "items": {
-            "$ref": "#/components/schemas/ToolDefinition"
-          },
-          "description": "(Optional) List of tool definitions available to the model."
-        },
-        "tool_choice": {
-          "type": "string",
-          "enum": [
-            "auto",
-            "required",
-            "none"
-          ],
-          "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. .. deprecated:: Use tool_config instead."
-        },
-        "tool_prompt_format": {
-          "type": "string",
-          "enum": [
-            "json",
-            "function_tag",
-            "python_list"
-          ],
-          "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls. .. deprecated:: Use tool_config instead."
-        },
-        "response_format": {
-          "$ref": "#/components/schemas/ResponseFormat",
-          "description": "(Optional) Grammar specification for guided (structured) decoding. There are two options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it."
-        },
-        "stream": {
-          "type": "boolean",
-          "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
-        },
-        "logprobs": {
-          "type": "object",
-          "properties": {
-            "top_k": {
-              "type": "integer",
-              "default": 0,
-              "description": "How many tokens (for each position) to return log probabilities for."
-            }
-          },
-          "additionalProperties": false,
-          "description": "(Optional) If specified, log probabilities for each token position will be returned."
-        },
-        "tool_config": {
-          "$ref": "#/components/schemas/ToolConfig",
-          "description": "(Optional) Configuration for tool use."
-        }
-      },
-      "additionalProperties": false,
-      "required": [
-        "model_id",
-        "messages"
-      ],
-      "title": "ChatCompletionRequest"
-    },
     "ChatCompletionResponseEvent": {
       "type": "object",
       "properties": {
@@ -7560,87 +7275,6 @@
       "title": "ToolCallDelta",
       "description": "A tool call content delta for streaming responses."
     },
-    "CompletionRequest": {
-      "type": "object",
-      "properties": {
-        "model_id": {
-          "type": "string",
-          "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
-        },
-        "content": {
-          "$ref": "#/components/schemas/InterleavedContent",
-          "description": "The content to generate a completion for."
-        },
-        "sampling_params": {
-          "$ref": "#/components/schemas/SamplingParams",
-          "description": "(Optional) Parameters to control the sampling strategy."
-        },
-        "response_format": {
-          "$ref": "#/components/schemas/ResponseFormat",
-          "description": "(Optional) Grammar specification for guided (structured) decoding."
-        },
-        "stream": {
-          "type": "boolean",
-          "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
-        },
-        "logprobs": {
-          "type": "object",
-          "properties": {
-            "top_k": {
-              "type": "integer",
-              "default": 0,
-              "description": "How many tokens (for each position) to return log probabilities for."
-            }
-          },
-          "additionalProperties": false,
-          "description": "(Optional) If specified, log probabilities for each token position will be returned."
-        }
-      },
-      "additionalProperties": false,
-      "required": [
-        "model_id",
-        "content"
-      ],
-      "title": "CompletionRequest"
-    },
-    "CompletionResponseStreamChunk": {
-      "type": "object",
-      "properties": {
-        "metrics": {
-          "type": "array",
-          "items": {
-            "$ref": "#/components/schemas/MetricInResponse"
-          },
-          "description": "(Optional) List of metrics associated with the API response"
-        },
-        "delta": {
-          "type": "string",
-          "description": "New content generated since last chunk. This can be one or more tokens."
-        },
-        "stop_reason": {
-          "type": "string",
-          "enum": [
-            "end_of_turn",
-            "end_of_message",
-            "out_of_tokens"
-          ],
-          "description": "Optional reason why generation stopped, if complete"
-        },
-        "logprobs": {
-          "type": "array",
-          "items": {
-            "$ref": "#/components/schemas/TokenLogProbs"
-          },
-          "description": "Optional log probabilities for generated tokens"
-        }
-      },
-      "additionalProperties": false,
-      "required": [
-        "delta"
-      ],
-      "title": "CompletionResponseStreamChunk",
-      "description": "A chunk of a streamed completion response."
-    },
     "AgentConfig": {
       "type": "object",
       "properties": {
@@ -7848,6 +7482,14 @@
           "default": true,
           "description": "Whether this parameter is required for tool invocation"
         },
+        "items": {
+          "type": "object",
+          "description": "Type of the elements when parameter_type is array"
+        },
+        "title": {
+          "type": "string",
+          "description": "(Optional) Title of the parameter"
+        },
         "default": {
           "oneOf": [
             {
@@ -18779,11 +18421,6 @@
       "description": "Main functionalities provided by this API:\n- Create agents with specific instructions and ability to use tools.\n- Interactions with agents are grouped into sessions (\"threads\"), and each interaction is called a \"turn\".\n- Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).\n- Agents can be provided with various shields (see the Safety API for more details).\n- Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details.",
       "x-displayName": "Agents API for creating and interacting with agentic systems."
     },
-    {
-      "name": "BatchInference (Coming Soon)",
-      "description": "This is an asynchronous API. If the request is successful, the response will be a job which can be polled for completion.\n\nNOTE: This API is not yet implemented and is subject to change in concert with other asynchronous APIs\nincluding (post-training, evals, etc).",
-      "x-displayName": "Batch inference API for generating completions and chat completions."
-    },
     {
       "name": "Benchmarks"
     },
@@ -18858,7 +18495,6 @@
       "name": "Operations",
       "tags": [
         "Agents",
-        "BatchInference (Coming Soon)",
         "Benchmarks",
        "DatasetIO",
        "Datasets",


@@ -43,72 +43,6 @@ paths:
         schema:
           $ref: '#/components/schemas/AppendRowsRequest'
       required: true
-  /v1/inference/batch-chat-completion:
-    post:
-      responses:
-        '200':
-          description: >-
-            A BatchChatCompletionResponse with the full completions.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/BatchChatCompletionResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Inference
-      summary: >-
-        Generate chat completions for a batch of messages using the specified model.
-      description: >-
-        Generate chat completions for a batch of messages using the specified model.
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/BatchChatCompletionRequest'
-        required: true
-  /v1/inference/batch-completion:
-    post:
-      responses:
-        '200':
-          description: >-
-            A BatchCompletionResponse with the full completions.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/BatchCompletionResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Inference
-      summary: >-
-        Generate completions for a batch of content using the specified model.
-      description: >-
-        Generate completions for a batch of content using the specified model.
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/BatchCompletionRequest'
-        required: true
   /v1alpha/post-training/job/cancel:
     post:
       responses:
@@ -186,7 +120,7 @@ paths:
         default:
           $ref: '#/components/responses/DefaultError'
       tags:
-        - BatchInference (Coming Soon)
+        - Inference
       summary: >-
         Generate a chat completion for the given messages using the specified model.
       description: >-
@@ -198,43 +132,6 @@ paths:
         schema:
           $ref: '#/components/schemas/ChatCompletionRequest'
       required: true
-  /v1/inference/completion:
-    post:
-      responses:
-        '200':
-          description: >-
-            If stream=False, returns a CompletionResponse with the full completion.
-            If stream=True, returns an SSE event stream of CompletionResponseStreamChunk.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/CompletionResponse'
-            text/event-stream:
-              schema:
-                $ref: '#/components/schemas/CompletionResponseStreamChunk'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - BatchInference (Coming Soon)
-      summary: >-
-        Generate a completion for the given content using the specified model.
-      description: >-
-        Generate a completion for the given content using the specified model.
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/CompletionRequest'
-        required: true
   /v1/agents:
     get:
       responses:
@@ -4559,6 +4456,16 @@ components:
       required:
         - rows
       title: AppendRowsRequest
+    CancelTrainingJobRequest:
+      type: object
+      properties:
+        job_uuid:
+          type: string
+          description: The UUID of the job to cancel.
+      additionalProperties: false
+      required:
+        - job_uuid
+      title: CancelTrainingJobRequest
     CompletionMessage:
       type: object
       properties:
@@ -4959,6 +4866,16 @@ components:
         required:
           type: boolean
           default: true
+        items:
+          oneOf:
+            - type: 'null'
+            - type: boolean
+            - type: number
+            - type: string
+            - type: array
+            - type: object
+        title:
+          type: string
         default:
           oneOf:
             - type: 'null'
@@ -5076,224 +4993,6 @@ components:
       title: UserMessage
       description: >-
         A message from the user in a chat conversation.
-    BatchChatCompletionRequest:
-      type: object
-      properties:
-        model_id:
-          type: string
-          description: >-
-            The identifier of the model to use. The model must be registered with
-            Llama Stack and available via the /models endpoint.
-        messages_batch:
-          type: array
-          items:
-            type: array
-            items:
-              $ref: '#/components/schemas/Message'
-          description: >-
-            The messages to generate completions for.
-        sampling_params:
-          $ref: '#/components/schemas/SamplingParams'
-          description: >-
-            (Optional) Parameters to control the sampling strategy.
-        tools:
-          type: array
-          items:
-            $ref: '#/components/schemas/ToolDefinition'
-          description: >-
-            (Optional) List of tool definitions available to the model.
-        tool_config:
-          $ref: '#/components/schemas/ToolConfig'
-          description: (Optional) Configuration for tool use.
-        response_format:
-          $ref: '#/components/schemas/ResponseFormat'
-          description: >-
-            (Optional) Grammar specification for guided (structured) decoding.
-        logprobs:
-          type: object
-          properties:
-            top_k:
-              type: integer
-              default: 0
-              description: >-
-                How many tokens (for each position) to return log probabilities for.
-          additionalProperties: false
-          description: >-
-            (Optional) If specified, log probabilities for each token position will
-            be returned.
-      additionalProperties: false
-      required:
-        - model_id
-        - messages_batch
-      title: BatchChatCompletionRequest
-    BatchChatCompletionResponse:
-      type: object
-      properties:
-        batch:
-          type: array
-          items:
-            $ref: '#/components/schemas/ChatCompletionResponse'
-          description: >-
-            List of chat completion responses, one for each conversation in the batch
-      additionalProperties: false
-      required:
-        - batch
-      title: BatchChatCompletionResponse
-      description: >-
-        Response from a batch chat completion request.
-    ChatCompletionResponse:
-      type: object
-      properties:
-        metrics:
-          type: array
-          items:
-            $ref: '#/components/schemas/MetricInResponse'
-          description: >-
-            (Optional) List of metrics associated with the API response
-        completion_message:
-          $ref: '#/components/schemas/CompletionMessage'
-          description: The complete response message
-        logprobs:
-          type: array
-          items:
-            $ref: '#/components/schemas/TokenLogProbs'
-          description: >-
-            Optional log probabilities for generated tokens
-      additionalProperties: false
-      required:
-        - completion_message
-      title: ChatCompletionResponse
-      description: Response from a chat completion request.
-    MetricInResponse:
-      type: object
-      properties:
-        metric:
-          type: string
-          description: The name of the metric
-        value:
-          oneOf:
-            - type: integer
-            - type: number
-          description: The numeric value of the metric
-        unit:
-          type: string
-          description: >-
-            (Optional) The unit of measurement for the metric value
-      additionalProperties: false
-      required:
-        - metric
-        - value
-      title: MetricInResponse
-      description: >-
-        A metric value included in API responses.
-    TokenLogProbs:
-      type: object
-      properties:
-        logprobs_by_token:
-          type: object
-          additionalProperties:
-            type: number
-          description: >-
-            Dictionary mapping tokens to their log probabilities
-      additionalProperties: false
-      required:
-        - logprobs_by_token
-      title: TokenLogProbs
-      description: Log probabilities for generated tokens.
-    BatchCompletionRequest:
-      type: object
-      properties:
-        model_id:
-          type: string
-          description: >-
-            The identifier of the model to use. The model must be registered with
-            Llama Stack and available via the /models endpoint.
-        content_batch:
-          type: array
-          items:
-            $ref: '#/components/schemas/InterleavedContent'
-          description: The content to generate completions for.
-        sampling_params:
-          $ref: '#/components/schemas/SamplingParams'
-          description: >-
-            (Optional) Parameters to control the sampling strategy.
-        response_format:
-          $ref: '#/components/schemas/ResponseFormat'
-          description: >-
-            (Optional) Grammar specification for guided (structured) decoding.
-        logprobs:
-          type: object
-          properties:
-            top_k:
-              type: integer
-              default: 0
-              description: >-
-                How many tokens (for each position) to return log probabilities for.
-          additionalProperties: false
-          description: >-
-            (Optional) If specified, log probabilities for each token position will
-            be returned.
-      additionalProperties: false
-      required:
-        - model_id
-        - content_batch
-      title: BatchCompletionRequest
-    BatchCompletionResponse:
-      type: object
-      properties:
-        batch:
-          type: array
-          items:
-            $ref: '#/components/schemas/CompletionResponse'
-          description: >-
-            List of completion responses, one for each input in the batch
-      additionalProperties: false
-      required:
-        - batch
-      title: BatchCompletionResponse
-      description: >-
-        Response from a batch completion request.
-    CompletionResponse:
-      type: object
-      properties:
-        metrics:
-          type: array
-          items:
-            $ref: '#/components/schemas/MetricInResponse'
-          description: >-
-            (Optional) List of metrics associated with the API response
-        content:
-          type: string
-          description: The generated completion text
-        stop_reason:
-          type: string
-          enum:
-            - end_of_turn
-            - end_of_message
-            - out_of_tokens
-          description: Reason why generation stopped
-        logprobs:
-          type: array
-          items:
-            $ref: '#/components/schemas/TokenLogProbs'
-          description: >-
-            Optional log probabilities for generated tokens
-      additionalProperties: false
-      required:
-        - content
-        - stop_reason
-      title: CompletionResponse
-      description: Response from a completion request.
-    CancelTrainingJobRequest:
-      type: object
-      properties:
-        job_uuid:
-          type: string
-          description: The UUID of the job to cancel.
-      additionalProperties: false
-      required:
-        - job_uuid
-      title: CancelTrainingJobRequest
     ChatCompletionRequest:
       type: object
       properties:
@@ -5372,6 +5071,65 @@ components:
         - model_id
         - messages
       title: ChatCompletionRequest
+    ChatCompletionResponse:
+      type: object
+      properties:
+        metrics:
+          type: array
+          items:
+            $ref: '#/components/schemas/MetricInResponse'
+          description: >-
+            (Optional) List of metrics associated with the API response
+        completion_message:
+          $ref: '#/components/schemas/CompletionMessage'
+          description: The complete response message
+        logprobs:
+          type: array
+          items:
+            $ref: '#/components/schemas/TokenLogProbs'
+          description: >-
+            Optional log probabilities for generated tokens
+      additionalProperties: false
+      required:
+        - completion_message
+      title: ChatCompletionResponse
+      description: Response from a chat completion request.
+    MetricInResponse:
+      type: object
+      properties:
+        metric:
+          type: string
+          description: The name of the metric
+        value:
+          oneOf:
+            - type: integer
+            - type: number
+          description: The numeric value of the metric
+        unit:
+          type: string
+          description: >-
+            (Optional) The unit of measurement for the metric value
+      additionalProperties: false
+      required:
+        - metric
+        - value
+      title: MetricInResponse
+      description: >-
+        A metric value included in API responses.
+    TokenLogProbs:
+      type: object
+      properties:
+        logprobs_by_token:
+          type: object
+          additionalProperties:
+            type: number
+          description: >-
+            Dictionary mapping tokens to their log probabilities
+      additionalProperties: false
+      required:
+        - logprobs_by_token
+      title: TokenLogProbs
+      description: Log probabilities for generated tokens.
     ChatCompletionResponseEvent:
       type: object
       properties:
@@ -5507,81 +5265,6 @@ components:
       title: ToolCallDelta
       description: >-
         A tool call content delta for streaming responses.
-    CompletionRequest:
-      type: object
-      properties:
-        model_id:
-          type: string
-          description: >-
-            The identifier of the model to use. The model must be registered with
-            Llama Stack and available via the /models endpoint.
-        content:
-          $ref: '#/components/schemas/InterleavedContent'
-          description: >-
-            The content to generate a completion for.
-        sampling_params:
-          $ref: '#/components/schemas/SamplingParams'
-          description: >-
-            (Optional) Parameters to control the sampling strategy.
-        response_format:
-          $ref: '#/components/schemas/ResponseFormat'
-          description: >-
-            (Optional) Grammar specification for guided (structured) decoding.
-        stream:
-          type: boolean
-          description: >-
-            (Optional) If True, generate an SSE event stream of the response. Defaults
-            to False.
-        logprobs:
-          type: object
-          properties:
-            top_k:
-              type: integer
-              default: 0
-              description: >-
-                How many tokens (for each position) to return log probabilities for.
-          additionalProperties: false
-          description: >-
-            (Optional) If specified, log probabilities for each token position will
-            be returned.
-      additionalProperties: false
-      required:
-        - model_id
-        - content
-      title: CompletionRequest
-    CompletionResponseStreamChunk:
-      type: object
-      properties:
-        metrics:
-          type: array
-          items:
-            $ref: '#/components/schemas/MetricInResponse'
-          description: >-
-            (Optional) List of metrics associated with the API response
-        delta:
-          type: string
-          description: >-
-            New content generated since last chunk. This can be one or more tokens.
-        stop_reason:
-          type: string
-          enum:
-            - end_of_turn
-            - end_of_message
-            - out_of_tokens
-          description: >-
-            Optional reason why generation stopped, if complete
-        logprobs:
-          type: array
-          items:
-            $ref: '#/components/schemas/TokenLogProbs'
-          description: >-
-            Optional log probabilities for generated tokens
-      additionalProperties: false
-      required:
-        - delta
-      title: CompletionResponseStreamChunk
-      description: >-
-        A chunk of a streamed completion response.
     AgentConfig:
       type: object
       properties:
@@ -5730,6 +5413,13 @@ components:
           default: true
           description: >-
             Whether this parameter is required for tool invocation
+        items:
+          type: object
+          description: >-
+            Type of the elements when parameter_type is array
+        title:
+          type: string
+          description: (Optional) Title of the parameter
         default:
           oneOf:
             - type: 'null'
@@ -13983,18 +13673,6 @@ tags:
       the RAG Tool and Vector IO APIs for more details.
     x-displayName: >-
       Agents API for creating and interacting with agentic systems.
-  - name: BatchInference (Coming Soon)
-    description: >-
-      This is an asynchronous API. If the request is successful, the response will
-      be a job which can be polled for completion.
-
-      NOTE: This API is not yet implemented and is subject to change in concert with
-      other asynchronous APIs
-      including (post-training, evals, etc).
-    x-displayName: >-
-      Batch inference API for generating completions and chat completions.
   - name: Benchmarks
   - name: DatasetIO
   - name: Datasets
@@ -14037,7 +13715,6 @@ x-tagGroups:
   - name: Operations
     tags:
       - Agents
-      - BatchInference (Coming Soon)
      - Benchmarks
      - DatasetIO
      - Datasets


@@ -1,79 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Protocol, runtime_checkable
-
-from llama_stack.apis.common.job_types import Job
-from llama_stack.apis.inference import (
-    InterleavedContent,
-    LogProbConfig,
-    Message,
-    ResponseFormat,
-    SamplingParams,
-    ToolChoice,
-    ToolDefinition,
-    ToolPromptFormat,
-)
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.schema_utils import webmethod
-
-
-@runtime_checkable
-class BatchInference(Protocol):
-    """Batch inference API for generating completions and chat completions.
-
-    This is an asynchronous API. If the request is successful, the response will be a job which can be polled for completion.
-
-    NOTE: This API is not yet implemented and is subject to change in concert with other asynchronous APIs
-    including (post-training, evals, etc).
-    """
-
-    @webmethod(route="/batch-inference/completion", method="POST", level=LLAMA_STACK_API_V1)
-    async def completion(
-        self,
-        model: str,
-        content_batch: list[InterleavedContent],
-        sampling_params: SamplingParams | None = None,
-        response_format: ResponseFormat | None = None,
-        logprobs: LogProbConfig | None = None,
-    ) -> Job:
-        """Generate completions for a batch of content.
-
-        :param model: The model to use for the completion.
-        :param content_batch: The content to complete.
-        :param sampling_params: The sampling parameters to use for the completion.
-        :param response_format: The response format to use for the completion.
-        :param logprobs: The logprobs to use for the completion.
-        :returns: A job for the completion.
-        """
-        ...
-
-    @webmethod(route="/batch-inference/chat-completion", method="POST", level=LLAMA_STACK_API_V1)
-    async def chat_completion(
-        self,
-        model: str,
-        messages_batch: list[list[Message]],
-        sampling_params: SamplingParams | None = None,
-        # zero-shot tool definitions as input to the model
-        tools: list[ToolDefinition] | None = None,
-        tool_choice: ToolChoice | None = ToolChoice.auto,
-        tool_prompt_format: ToolPromptFormat | None = None,
-        response_format: ResponseFormat | None = None,
-        logprobs: LogProbConfig | None = None,
-    ) -> Job:
-        """Generate chat completions for a batch of messages.
-
-        :param model: The model to use for the chat completion.
-        :param messages_batch: The messages to complete.
-        :param sampling_params: The sampling parameters to use for the completion.
-        :param tools: The tools to use for the chat completion.
-        :param tool_choice: The tool choice to use for the chat completion.
-        :param tool_prompt_format: The tool prompt format to use for the chat completion.
-        :param response_format: The response format to use for the chat completion.
-        :param logprobs: The logprobs to use for the chat completion.
-        :returns: A job for the chat completion.
-        """
-        ...
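
Since the protocol above is deleted outright, a caller that relied on it has to issue ordinary per-conversation requests instead. One way to recover batch-like behavior client-side is a small fan-out helper; this is a hedged sketch, where `inference` stands for any object implementing the `InferenceProvider` protocol and the helper name is made up:

```python
# Hypothetical replacement helper: fan a batch out over the regular
# chat_completion method and gather the results concurrently.
import asyncio

async def chat_completion_batch(inference, model_id: str, messages_batch):
    tasks = [
        inference.chat_completion(model_id=model_id, messages=messages)
        for messages in messages_batch
    ]
    # One ChatCompletionResponse per conversation, in input order.
    return await asyncio.gather(*tasks)
```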


@@ -975,26 +975,6 @@ class EmbeddingTaskType(Enum):
     document = "document"
 
-
-@json_schema_type
-class BatchCompletionResponse(BaseModel):
-    """Response from a batch completion request.
-
-    :param batch: List of completion responses, one for each input in the batch
-    """
-
-    batch: list[CompletionResponse]
-
-
-@json_schema_type
-class BatchChatCompletionResponse(BaseModel):
-    """Response from a batch chat completion request.
-
-    :param batch: List of chat completion responses, one for each conversation in the batch
-    """
-
-    batch: list[ChatCompletionResponse]
-
 
 class OpenAICompletionWithInputMessages(OpenAIChatCompletion):
     input_messages: list[OpenAIMessageParam]
@@ -1028,7 +1008,6 @@ class InferenceProvider(Protocol):
     model_store: ModelStore | None = None
 
-    @webmethod(route="/inference/completion", method="POST", level=LLAMA_STACK_API_V1)
     async def completion(
         self,
         model_id: str,
@@ -1051,27 +1030,6 @@ class InferenceProvider(Protocol):
         """
         ...
 
-    @webmethod(route="/inference/batch-completion", method="POST", experimental=True, level=LLAMA_STACK_API_V1)
-    async def batch_completion(
-        self,
-        model_id: str,
-        content_batch: list[InterleavedContent],
-        sampling_params: SamplingParams | None = None,
-        response_format: ResponseFormat | None = None,
-        logprobs: LogProbConfig | None = None,
-    ) -> BatchCompletionResponse:
-        """Generate completions for a batch of content using the specified model.
-
-        :param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
-        :param content_batch: The content to generate completions for.
-        :param sampling_params: (Optional) Parameters to control the sampling strategy.
-        :param response_format: (Optional) Grammar specification for guided (structured) decoding.
-        :param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
-        :returns: A BatchCompletionResponse with the full completions.
-        """
-        raise NotImplementedError("Batch completion is not implemented")
-        return  # this is so mypy's safe-super rule will consider the method concrete
-
     @webmethod(route="/inference/chat-completion", method="POST", level=LLAMA_STACK_API_V1)
     async def chat_completion(
         self,
@@ -1112,31 +1070,6 @@ class InferenceProvider(Protocol):
         """
         ...
 
-    @webmethod(route="/inference/batch-chat-completion", method="POST", experimental=True, level=LLAMA_STACK_API_V1)
-    async def batch_chat_completion(
-        self,
-        model_id: str,
-        messages_batch: list[list[Message]],
-        sampling_params: SamplingParams | None = None,
-        tools: list[ToolDefinition] | None = None,
-        tool_config: ToolConfig | None = None,
-        response_format: ResponseFormat | None = None,
-        logprobs: LogProbConfig | None = None,
-    ) -> BatchChatCompletionResponse:
-        """Generate chat completions for a batch of messages using the specified model.
-
-        :param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
-        :param messages_batch: The messages to generate completions for.
-        :param sampling_params: (Optional) Parameters to control the sampling strategy.
-        :param tools: (Optional) List of tool definitions available to the model.
-        :param tool_config: (Optional) Configuration for tool use.
-        :param response_format: (Optional) Grammar specification for guided (structured) decoding.
-        :param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
-        :returns: A BatchChatCompletionResponse with the full completions.
-        """
-        raise NotImplementedError("Batch chat completion is not implemented")
-        return  # this is so mypy's safe-super rule will consider the method concrete
-
     @webmethod(route="/inference/embeddings", method="POST", level=LLAMA_STACK_API_V1)
     async def embeddings(
         self,


@@ -27,6 +27,8 @@ class ToolParameter(BaseModel):
     :param parameter_type: Type of the parameter (e.g., string, integer)
     :param description: Human-readable description of what the parameter does
     :param required: Whether this parameter is required for tool invocation
+    :param items: Type of the elements when parameter_type is array
+    :param title: (Optional) Title of the parameter
     :param default: (Optional) Default value for the parameter if not provided
     """
 
@@ -34,6 +36,8 @@ class ToolParameter(BaseModel):
     parameter_type: str
     description: str
     required: bool = Field(default=True)
+    items: dict | None = None
+    title: str | None = None
     default: Any | None = None
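
For illustration, here is how the two new fields might be populated on an array-typed parameter. The surrounding values and the import path are assumptions for the sketch; only `items` and `title` come from this diff:

```python
# Hypothetical usage of the new fields; ToolParameter is the model shown above.
from llama_stack.apis.tools import ToolParameter  # import path assumed

tags_param = ToolParameter(
    name="tags",
    parameter_type="array",
    description="Labels to attach to the resource",
    required=False,
    items={"type": "string"},  # element schema, used when parameter_type is "array"
    title="Tags",
)
```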


@@ -20,8 +20,6 @@ from llama_stack.apis.common.content_types import (
 )
 from llama_stack.apis.common.errors import ModelNotFoundError, ModelTypeError
 from llama_stack.apis.inference import (
-    BatchChatCompletionResponse,
-    BatchCompletionResponse,
     ChatCompletionResponse,
     ChatCompletionResponseEventType,
     ChatCompletionResponseStreamChunk,
@@ -273,30 +271,6 @@ class InferenceRouter(Inference):
         )
         return response
 
-    async def batch_chat_completion(
-        self,
-        model_id: str,
-        messages_batch: list[list[Message]],
-        tools: list[ToolDefinition] | None = None,
-        tool_config: ToolConfig | None = None,
-        sampling_params: SamplingParams | None = None,
-        response_format: ResponseFormat | None = None,
-        logprobs: LogProbConfig | None = None,
-    ) -> BatchChatCompletionResponse:
-        logger.debug(
-            f"InferenceRouter.batch_chat_completion: {model_id=}, {len(messages_batch)=}, {sampling_params=}, {response_format=}, {logprobs=}",
-        )
-        provider = await self.routing_table.get_provider_impl(model_id)
-        return await provider.batch_chat_completion(
-            model_id=model_id,
-            messages_batch=messages_batch,
-            tools=tools,
-            tool_config=tool_config,
-            sampling_params=sampling_params,
-            response_format=response_format,
-            logprobs=logprobs,
-        )
-
     async def completion(
         self,
         model_id: str,
@@ -338,20 +312,6 @@ class InferenceRouter(Inference):
         return response
 
-    async def batch_completion(
-        self,
-        model_id: str,
-        content_batch: list[InterleavedContent],
-        sampling_params: SamplingParams | None = None,
-        response_format: ResponseFormat | None = None,
-        logprobs: LogProbConfig | None = None,
-    ) -> BatchCompletionResponse:
-        logger.debug(
-            f"InferenceRouter.batch_completion: {model_id=}, {len(content_batch)=}, {sampling_params=}, {response_format=}, {logprobs=}",
-        )
-        provider = await self.routing_table.get_provider_impl(model_id)
-        return await provider.batch_completion(model_id, content_batch, sampling_params, response_format, logprobs)
-
     async def embeddings(
         self,
         model_id: str,


@@ -14,7 +14,6 @@ from typing import Any
 import yaml
 
 from llama_stack.apis.agents import Agents
-from llama_stack.apis.batch_inference import BatchInference
 from llama_stack.apis.benchmarks import Benchmarks
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
@@ -54,7 +53,6 @@ class LlamaStack(
     Providers,
     VectorDBs,
     Inference,
-    BatchInference,
     Agents,
     Safety,
     SyntheticDataGeneration,


@@ -92,6 +92,8 @@ class ToolParamDefinition(BaseModel):
     param_type: str
     description: str | None = None
     required: bool | None = True
+    items: Any | None = None
+    title: str | None = None
     default: Any | None = None


@@ -798,6 +798,8 @@ class ChatAgent(ShieldRunnerMixin):
                     param_type=param.parameter_type,
                     description=param.description,
                     required=param.required,
+                    items=param.items,
+                    title=param.title,
                     default=param.default,
                 )
                 for param in tool_def.parameters
@@ -841,6 +843,8 @@ class ChatAgent(ShieldRunnerMixin):
                     param_type=param.parameter_type,
                     description=param.description,
                     required=param.required,
+                    items=param.items,
+                    title=param.title,
                     default=param.default,
                 )
                 for param in tool_def.parameters
@@ -920,7 +924,7 @@ async def get_raw_document_text(document: Document) -> str:
             DeprecationWarning,
             stacklevel=2,
         )
-    elif not (document.mime_type.startswith("text/") or document.mime_type == "application/yaml"):
+    elif not (document.mime_type.startswith("text/") or document.mime_type in ("application/yaml", "application/json")):
         raise ValueError(f"Unexpected document mime type: {document.mime_type}")
 
     if isinstance(document.content, URL):
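
The guard change above boils down to a slightly wider allow-list. Rephrased standalone for clarity (a sketch, not an actual helper in this file):

```python
def _mime_type_is_supported(mime_type: str) -> bool:
    # text/* was always accepted; YAML was already special-cased, JSON is new here.
    return mime_type.startswith("text/") or mime_type in ("application/yaml", "application/json")
```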


@@ -568,6 +568,7 @@ class StreamingResponseOrchestrator:
                     description=param.description,
                     required=param.required,
                     default=param.default,
+                    items=param.items,
                 )
                 for param in t.parameters
             },


@@ -18,8 +18,6 @@ from llama_stack.apis.common.content_types import (
     ToolCallParseStatus,
 )
 from llama_stack.apis.inference import (
-    BatchChatCompletionResponse,
-    BatchCompletionResponse,
     ChatCompletionRequest,
     ChatCompletionResponse,
     ChatCompletionResponseEvent,
@@ -219,41 +217,6 @@ class MetaReferenceInferenceImpl(
         results = await self._nonstream_completion([request])
         return results[0]
 
-    async def batch_completion(
-        self,
-        model_id: str,
-        content_batch: list[InterleavedContent],
-        sampling_params: SamplingParams | None = None,
-        response_format: ResponseFormat | None = None,
-        stream: bool | None = False,
-        logprobs: LogProbConfig | None = None,
-    ) -> BatchCompletionResponse:
-        if sampling_params is None:
-            sampling_params = SamplingParams()
-        if logprobs:
-            assert logprobs.top_k == 1, f"Unexpected top_k={logprobs.top_k}"
-
-        content_batch = [
-            augment_content_with_response_format_prompt(response_format, content) for content in content_batch
-        ]
-
-        request_batch = []
-        for content in content_batch:
-            request = CompletionRequest(
-                model=model_id,
-                content=content,
-                sampling_params=sampling_params,
-                response_format=response_format,
-                stream=stream,
-                logprobs=logprobs,
-            )
-            self.check_model(request)
-            request = await convert_request_to_raw(request)
-            request_batch.append(request)
-
-        results = await self._nonstream_completion(request_batch)
-        return BatchCompletionResponse(batch=results)
-
     async def _stream_completion(self, request: CompletionRequest) -> AsyncGenerator:
         tokenizer = self.generator.formatter.tokenizer
@@ -399,49 +362,6 @@ class MetaReferenceInferenceImpl(
         results = await self._nonstream_chat_completion([request])
         return results[0]
 
-    async def batch_chat_completion(
-        self,
-        model_id: str,
-        messages_batch: list[list[Message]],
-        sampling_params: SamplingParams | None = None,
-        response_format: ResponseFormat | None = None,
-        tools: list[ToolDefinition] | None = None,
-        stream: bool | None = False,
-        logprobs: LogProbConfig | None = None,
-        tool_config: ToolConfig | None = None,
-    ) -> BatchChatCompletionResponse:
-        if sampling_params is None:
-            sampling_params = SamplingParams()
-        if logprobs:
-            assert logprobs.top_k == 1, f"Unexpected top_k={logprobs.top_k}"
-
-        # wrapper request to make it easier to pass around (internal only, not exposed to API)
-        request_batch = []
-        for messages in messages_batch:
-            request = ChatCompletionRequest(
-                model=model_id,
-                messages=messages,
-                sampling_params=sampling_params,
-                tools=tools or [],
-                response_format=response_format,
-                logprobs=logprobs,
-                tool_config=tool_config or ToolConfig(),
-            )
-            self.check_model(request)
-
-            # augment and rewrite messages depending on the model
-            request.messages = chat_completion_request_to_messages(request, self.llama_model.core_model_id.value)
-            # download media and convert to raw content so we can send it to the model
-            request = await convert_request_to_raw(request)
-            request_batch.append(request)
-
-        if self.config.create_distributed_process_group:
-            if SEMAPHORE.locked():
-                raise RuntimeError("Only one concurrent request is supported")
-
-        results = await self._nonstream_chat_completion(request_batch)
-        return BatchChatCompletionResponse(batch=results)
-
     async def _nonstream_chat_completion(
         self, request_batch: list[ChatCompletionRequest]
     ) -> list[ChatCompletionResponse]:


@@ -61,6 +61,7 @@ logger = get_logger(name=__name__, category="inference::fireworks")
 class FireworksInferenceAdapter(OpenAIMixin, ModelRegistryHelper, Inference, NeedsRequestProviderData):
     embedding_model_metadata = {
         "nomic-ai/nomic-embed-text-v1.5": {"embedding_dimension": 768, "context_length": 8192},
+        "accounts/fireworks/models/qwen3-embedding-8b": {"embedding_dimension": 4096, "context_length": 40960},
     }
 
     def __init__(self, config: FireworksImplConfig) -> None:


@@ -6,8 +6,7 @@
 
 import asyncio
-import base64
-from collections.abc import AsyncGenerator, AsyncIterator
+from collections.abc import AsyncGenerator
 from typing import Any
 
 from ollama import AsyncClient as AsyncOllamaClient
@@ -33,10 +32,6 @@ from llama_stack.apis.inference import (
     JsonSchemaResponseFormat,
     LogProbConfig,
     Message,
-    OpenAIChatCompletion,
-    OpenAIChatCompletionChunk,
-    OpenAIMessageParam,
-    OpenAIResponseFormatParam,
     ResponseFormat,
     SamplingParams,
     TextTruncation,
@@ -62,7 +57,6 @@ from llama_stack.providers.utils.inference.openai_compat import (
OpenAICompatCompletionChoice,
OpenAICompatCompletionResponse,
get_sampling_options,
-prepare_openai_completion_params,
process_chat_completion_response,
process_chat_completion_stream_response,
process_completion_response,
@@ -75,7 +69,6 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
content_has_media,
convert_image_content_to_url,
interleaved_content_as_str,
-localize_image_content,
request_has_media,
)
@@ -84,6 +77,7 @@ logger = get_logger(name=__name__, category="inference::ollama")
class OllamaInferenceAdapter(
OpenAIMixin,
+ModelRegistryHelper,
InferenceProvider,
ModelsProtocolPrivate,
):
@@ -129,6 +123,8 @@ class OllamaInferenceAdapter(
],
)
self.config = config
+# Ollama does not support image urls, so we need to download the image and convert it to base64
+self.download_images = True
self._clients: dict[asyncio.AbstractEventLoop, AsyncOllamaClient] = {}
@property
@@ -173,9 +169,6 @@ class OllamaInferenceAdapter(
async def shutdown(self) -> None:
self._clients.clear()
-async def unregister_model(self, model_id: str) -> None:
-pass
async def _get_model(self, model_id: str) -> Model:
if not self.model_store:
raise ValueError("Model store not set")
@@ -403,75 +396,6 @@ class OllamaInferenceAdapter(
raise UnsupportedModelError(model.provider_model_id, list(self._model_cache.keys()))
async def openai_chat_completion(
self,
model: str,
messages: list[OpenAIMessageParam],
frequency_penalty: float | None = None,
function_call: str | dict[str, Any] | None = None,
functions: list[dict[str, Any]] | None = None,
logit_bias: dict[str, float] | None = None,
logprobs: bool | None = None,
max_completion_tokens: int | None = None,
max_tokens: int | None = None,
n: int | None = None,
parallel_tool_calls: bool | None = None,
presence_penalty: float | None = None,
response_format: OpenAIResponseFormatParam | None = None,
seed: int | None = None,
stop: str | list[str] | None = None,
stream: bool | None = None,
stream_options: dict[str, Any] | None = None,
temperature: float | None = None,
tool_choice: str | dict[str, Any] | None = None,
tools: list[dict[str, Any]] | None = None,
top_logprobs: int | None = None,
top_p: float | None = None,
user: str | None = None,
) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
model_obj = await self._get_model(model)
# Ollama does not support image urls, so we need to download the image and convert it to base64
async def _convert_message(m: OpenAIMessageParam) -> OpenAIMessageParam:
if isinstance(m.content, list):
for c in m.content:
if c.type == "image_url" and c.image_url and c.image_url.url:
localize_result = await localize_image_content(c.image_url.url)
if localize_result is None:
raise ValueError(f"Failed to localize image content from {c.image_url.url}")
content, format = localize_result
c.image_url.url = f"data:image/{format};base64,{base64.b64encode(content).decode('utf-8')}"
return m
messages = [await _convert_message(m) for m in messages]
params = await prepare_openai_completion_params(
model=model_obj.provider_resource_id,
messages=messages,
frequency_penalty=frequency_penalty,
function_call=function_call,
functions=functions,
logit_bias=logit_bias,
logprobs=logprobs,
max_completion_tokens=max_completion_tokens,
max_tokens=max_tokens,
n=n,
parallel_tool_calls=parallel_tool_calls,
presence_penalty=presence_penalty,
response_format=response_format,
seed=seed,
stop=stop,
stream=stream,
stream_options=stream_options,
temperature=temperature,
tool_choice=tool_choice,
tools=tools,
top_logprobs=top_logprobs,
top_p=top_p,
user=user,
)
return await OpenAIMixin.openai_chat_completion(self, **params)
async def convert_message_to_openai_dict_for_ollama(message: Message) -> list[dict]:
async def _convert_content(content) -> dict:

View file

@@ -21,8 +21,6 @@ logger = get_logger(name=__name__, category="inference::openai")
# | completion | LiteLLMOpenAIMixin |
# | chat_completion | LiteLLMOpenAIMixin |
# | embedding | LiteLLMOpenAIMixin |
-# | batch_completion | LiteLLMOpenAIMixin |
-# | batch_chat_completion | LiteLLMOpenAIMixin |
# | openai_completion | OpenAIMixin |
# | openai_chat_completion | OpenAIMixin |
# | openai_embeddings | OpenAIMixin |
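The table reads as a method-resolution map: each API lands on whichever mixin in the adapter's base-class list defines it first. A toy illustration of that dispatch (class names reused for clarity, bodies hypothetical):

class OpenAIMixin:
    def openai_chat_completion(self) -> str:
        return "handled by OpenAIMixin"


class LiteLLMOpenAIMixin:
    def chat_completion(self) -> str:
        return "handled by LiteLLMOpenAIMixin"


class Adapter(LiteLLMOpenAIMixin, OpenAIMixin):
    # no overrides: Python's MRO routes each call to the first base that defines it
    pass


a = Adapter()
print(a.chat_completion())         # handled by LiteLLMOpenAIMixin
print(a.openai_chat_completion())  # handled by OpenAIMixin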

View file

@@ -805,6 +805,10 @@ def convert_tooldef_to_openai_tool(tool: ToolDefinition) -> dict:
properties[param_name].update(description=param.description)
if param.default:
properties[param_name].update(default=param.default)
+if param.items:
+properties[param_name].update(items=param.items)
+if param.title:
+properties[param_name].update(title=param.title)
if param.required:
required.append(param_name)
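The added fields extend the same copy-if-present rule that already handled description and default: optional JSON-schema keywords are forwarded into the OpenAI tool schema only when the parameter defines them. A small self-contained sketch of that rule (build_property and the SimpleNamespace parameter are illustrative, not the real types):

from types import SimpleNamespace

def build_property(param) -> dict:
    # copy-if-present mapping, mirroring convert_tooldef_to_openai_tool above
    prop: dict = {"type": param.parameter_type}
    if param.description:
        prop["description"] = param.description
    if param.default:
        prop["default"] = param.default
    if param.items:
        prop["items"] = param.items  # element schema for array parameters
    if param.title:
        prop["title"] = param.title
    return prop

param = SimpleNamespace(parameter_type="array", description="tags to match",
                        default=None, items={"type": "string"}, title="Tags")
print(build_property(param))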

View file

@@ -4,6 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
+import base64
import uuid
from abc import ABC, abstractmethod
from collections.abc import AsyncIterator
@@ -26,6 +27,7 @@ from llama_stack.apis.models import ModelType
from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
from llama_stack.providers.utils.inference.openai_compat import prepare_openai_completion_params
+from llama_stack.providers.utils.inference.prompt_adapter import localize_image_content
logger = get_logger(name=__name__, category="providers::utils")
@@ -51,6 +53,10 @@ class OpenAIMixin(ModelRegistryHelper, ABC):
# This is useful for providers that do not return a unique id in the response.
overwrite_completion_id: bool = False
+# Allow subclasses to control whether to download images and convert to base64
+# for providers that require base64 encoded images instead of URLs.
+download_images: bool = False
# Embedding model metadata for this provider
# Can be set by subclasses or instances to provide embedding models
# Format: {"model_id": {"embedding_dimension": 1536, "context_length": 8192}}
@@ -239,6 +245,24 @@ class OpenAIMixin(ModelRegistryHelper, ABC):
"""
Direct OpenAI chat completion API call.
"""
+if self.download_images:
+async def _localize_image_url(m: OpenAIMessageParam) -> OpenAIMessageParam:
+if isinstance(m.content, list):
+for c in m.content:
+if c.type == "image_url" and c.image_url and c.image_url.url and "http" in c.image_url.url:
+localize_result = await localize_image_content(c.image_url.url)
+if localize_result is None:
+raise ValueError(
+f"Failed to localize image content from {c.image_url.url[:42]}{'...' if len(c.image_url.url) > 42 else ''}"
+)
+content, format = localize_result
+c.image_url.url = f"data:image/{format};base64,{base64.b64encode(content).decode('utf-8')}"
+# else it's a string and we don't need to modify it
+return m
+messages = [await _localize_image_url(m) for m in messages]
resp = await self.client.chat.completions.create(
**await prepare_openai_completion_params(
model=await self._get_provider_model_id(model),
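Since the localization step rewrites message parts in place, the transformation itself is easy to show in isolation. A dict-based sketch (hypothetical shapes, not the real OpenAIMessageParam types) of turning a remote image URL into the data URL the mixin substitutes:

import base64

def localize_part(part: dict, downloaded: bytes, fmt: str) -> dict:
    # hypothetical dict stand-in for the OpenAIMessageParam rewrite above
    if part.get("type") == "image_url" and part["image_url"]["url"].startswith("http"):
        encoded = base64.b64encode(downloaded).decode("utf-8")
        part["image_url"]["url"] = f"data:image/{fmt};base64,{encoded}"
    return part

part = {"type": "image_url", "image_url": {"url": "http://example.com/dog.png"}}
print(localize_part(part, b"\x89PNG\r\n", "png")["image_url"]["url"][:34])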

View file

@@ -192,6 +192,14 @@ async def localize_image_content(uri: str) -> tuple[bytes, str] | None:
format = "png"
return content, format
+elif uri.startswith("data"):
+# data:image/{format};base64,{data}
+match = re.match(r"data:image/(\w+);base64,(.+)", uri)
+if not match:
+raise ValueError(f"Invalid data URL format, {uri[:40]}...")
+fmt, image_data = match.groups()
+content = base64.b64decode(image_data)
+return content, fmt
else:
return None
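The new branch makes the helper a simple inverse of the encoding step: the regex splits a data URL into its format and base64 payload, and decoding the payload recovers the original bytes. A standalone round-trip using the same pattern as the hunk above:

import base64
import re

def parse_image_data_url(uri: str) -> tuple[bytes, str]:
    # same data:image/{format};base64,{data} pattern as the hunk above
    match = re.match(r"data:image/(\w+);base64,(.+)", uri)
    if not match:
        raise ValueError(f"Invalid data URL format, {uri[:40]}...")
    fmt, image_data = match.groups()
    return base64.b64decode(image_data), fmt

payload = base64.b64encode(b"\x89PNG\r\n").decode("utf-8")
content, fmt = parse_image_data_url(f"data:image/png;base64,{payload}")
assert fmt == "png" and content.startswith(b"\x89PNG")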

View file

@@ -120,6 +120,10 @@ async def list_mcp_tools(endpoint: str, headers: dict[str, str]) -> ListToolDefs
name=param_name,
parameter_type=param_schema.get("type", "string"),
description=param_schema.get("description", ""),
+required="default" not in param_schema,
+items=param_schema.get("items", None),
+title=param_schema.get("title", None),
+default=param_schema.get("default", None),
)
)
tools.append(
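One subtlety worth noting: a parameter is marked required exactly when its JSON schema carries no default. A tiny sketch of the mapping (describe_param is illustrative, not the real ToolParamDefinition constructor):

def describe_param(name: str, schema: dict) -> dict:
    # illustrative mapping; required is derived from the absence of a default
    return {
        "name": name,
        "parameter_type": schema.get("type", "string"),
        "description": schema.get("description", ""),
        "required": "default" not in schema,
        "items": schema.get("items"),
        "title": schema.get("title"),
        "default": schema.get("default"),
    }

print(describe_param("limit", {"type": "integer", "default": 10}))   # required: False
print(describe_param("query", {"type": "string", "title": "Query"}))  # required: True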

View file

@@ -28,7 +28,7 @@
"react-markdown": "^10.1.0",
"remark-gfm": "^4.0.1",
"remeda": "^2.32.0",
-"shiki": "^1.29.2",
+"shiki": "^3.13.0",
"sonner": "^2.0.7",
"tailwind-merge": "^3.3.1"
},
@@ -51,7 +51,7 @@
"prettier": "3.6.2",
"tailwindcss": "^4",
"ts-node": "^10.9.2",
-"tw-animate-css": "^1.2.9",
+"tw-animate-css": "^1.4.0",
"typescript": "^5"
}
},
@@ -3250,65 +3250,63 @@
"license": "MIT"
},
"node_modules/@shikijs/core": {
-"version": "1.29.2",
-"resolved": "https://registry.npmjs.org/@shikijs/core/-/core-1.29.2.tgz",
-"integrity": "sha512-vju0lY9r27jJfOY4Z7+Rt/nIOjzJpZ3y+nYpqtUZInVoXQ/TJZcfGnNOGnKjFdVZb8qexiCuSlZRKcGfhhTTZQ==",
+"version": "3.13.0",
+"resolved": "https://registry.npmjs.org/@shikijs/core/-/core-3.13.0.tgz",
+"integrity": "sha512-3P8rGsg2Eh2qIHekwuQjzWhKI4jV97PhvYjYUzGqjvJfqdQPz+nMlfWahU24GZAyW1FxFI1sYjyhfh5CoLmIUA==",
"license": "MIT",
"dependencies": {
-"@shikijs/engine-javascript": "1.29.2",
-"@shikijs/engine-oniguruma": "1.29.2",
-"@shikijs/types": "1.29.2",
-"@shikijs/vscode-textmate": "^10.0.1",
+"@shikijs/types": "3.13.0",
+"@shikijs/vscode-textmate": "^10.0.2",
"@types/hast": "^3.0.4",
-"hast-util-to-html": "^9.0.4"
+"hast-util-to-html": "^9.0.5"
}
},
"node_modules/@shikijs/engine-javascript": {
-"version": "1.29.2",
-"resolved": "https://registry.npmjs.org/@shikijs/engine-javascript/-/engine-javascript-1.29.2.tgz",
-"integrity": "sha512-iNEZv4IrLYPv64Q6k7EPpOCE/nuvGiKl7zxdq0WFuRPF5PAE9PRo2JGq/d8crLusM59BRemJ4eOqrFrC4wiQ+A==",
+"version": "3.13.0",
+"resolved": "https://registry.npmjs.org/@shikijs/engine-javascript/-/engine-javascript-3.13.0.tgz",
+"integrity": "sha512-Ty7xv32XCp8u0eQt8rItpMs6rU9Ki6LJ1dQOW3V/56PKDcpvfHPnYFbsx5FFUP2Yim34m/UkazidamMNVR4vKg==",
"license": "MIT",
"dependencies": {
-"@shikijs/types": "1.29.2",
-"@shikijs/vscode-textmate": "^10.0.1",
-"oniguruma-to-es": "^2.2.0"
+"@shikijs/types": "3.13.0",
+"@shikijs/vscode-textmate": "^10.0.2",
+"oniguruma-to-es": "^4.3.3"
}
},
"node_modules/@shikijs/engine-oniguruma": {
-"version": "1.29.2",
-"resolved": "https://registry.npmjs.org/@shikijs/engine-oniguruma/-/engine-oniguruma-1.29.2.tgz",
-"integrity": "sha512-7iiOx3SG8+g1MnlzZVDYiaeHe7Ez2Kf2HrJzdmGwkRisT7r4rak0e655AcM/tF9JG/kg5fMNYlLLKglbN7gBqA==",
+"version": "3.13.0",
+"resolved": "https://registry.npmjs.org/@shikijs/engine-oniguruma/-/engine-oniguruma-3.13.0.tgz",
+"integrity": "sha512-O42rBGr4UDSlhT2ZFMxqM7QzIU+IcpoTMzb3W7AlziI1ZF7R8eS2M0yt5Ry35nnnTX/LTLXFPUjRFCIW+Operg==",
"license": "MIT",
"dependencies": {
-"@shikijs/types": "1.29.2",
-"@shikijs/vscode-textmate": "^10.0.1"
+"@shikijs/types": "3.13.0",
+"@shikijs/vscode-textmate": "^10.0.2"
}
},
"node_modules/@shikijs/langs": {
-"version": "1.29.2",
-"resolved": "https://registry.npmjs.org/@shikijs/langs/-/langs-1.29.2.tgz",
-"integrity": "sha512-FIBA7N3LZ+223U7cJDUYd5shmciFQlYkFXlkKVaHsCPgfVLiO+e12FmQE6Tf9vuyEsFe3dIl8qGWKXgEHL9wmQ==",
+"version": "3.13.0",
+"resolved": "https://registry.npmjs.org/@shikijs/langs/-/langs-3.13.0.tgz",
+"integrity": "sha512-672c3WAETDYHwrRP0yLy3W1QYB89Hbpj+pO4KhxK6FzIrDI2FoEXNiNCut6BQmEApYLfuYfpgOZaqbY+E9b8wQ==",
"license": "MIT",
"dependencies": {
-"@shikijs/types": "1.29.2"
+"@shikijs/types": "3.13.0"
}
},
"node_modules/@shikijs/themes": {
-"version": "1.29.2",
-"resolved": "https://registry.npmjs.org/@shikijs/themes/-/themes-1.29.2.tgz",
-"integrity": "sha512-i9TNZlsq4uoyqSbluIcZkmPL9Bfi3djVxRnofUHwvx/h6SRW3cwgBC5SML7vsDcWyukY0eCzVN980rqP6qNl9g==",
+"version": "3.13.0",
+"resolved": "https://registry.npmjs.org/@shikijs/themes/-/themes-3.13.0.tgz",
+"integrity": "sha512-Vxw1Nm1/Od8jyA7QuAenaV78BG2nSr3/gCGdBkLpfLscddCkzkL36Q5b67SrLLfvAJTOUzW39x4FHVCFriPVgg==",
"license": "MIT",
"dependencies": {
-"@shikijs/types": "1.29.2"
+"@shikijs/types": "3.13.0"
}
},
"node_modules/@shikijs/types": {
-"version": "1.29.2",
-"resolved": "https://registry.npmjs.org/@shikijs/types/-/types-1.29.2.tgz",
-"integrity": "sha512-VJjK0eIijTZf0QSTODEXCqinjBn0joAHQ+aPSBzrv4O2d/QSbsMw+ZeSRx03kV34Hy7NzUvV/7NqfYGRLrASmw==",
+"version": "3.13.0",
+"resolved": "https://registry.npmjs.org/@shikijs/types/-/types-3.13.0.tgz",
+"integrity": "sha512-oM9P+NCFri/mmQ8LoFGVfVyemm5Hi27330zuOBp0annwJdKH1kOLndw3zCtAVDehPLg9fKqoEx3Ht/wNZxolfw==",
"license": "MIT",
"dependencies": {
-"@shikijs/vscode-textmate": "^10.0.1",
+"@shikijs/vscode-textmate": "^10.0.2",
"@types/hast": "^3.0.4"
}
},
@@ -6084,12 +6082,6 @@
"dev": true,
"license": "MIT"
},
-"node_modules/emoji-regex-xs": {
-"version": "1.0.0",
-"resolved": "https://registry.npmjs.org/emoji-regex-xs/-/emoji-regex-xs-1.0.0.tgz",
-"integrity": "sha512-LRlerrMYoIDrT6jgpeZ2YYl/L8EulRTt5hQcYjy5AInh7HWXKimpqx68aknBFpGL2+/IcogTcaydJEgaTmOpDg==",
-"license": "MIT"
-},
"node_modules/encodeurl": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-2.0.0.tgz",
@@ -11813,15 +11805,21 @@
"url": "https://github.com/sponsors/sindresorhus"
}
},
+"node_modules/oniguruma-parser": {
+"version": "0.12.1",
+"resolved": "https://registry.npmjs.org/oniguruma-parser/-/oniguruma-parser-0.12.1.tgz",
+"integrity": "sha512-8Unqkvk1RYc6yq2WBYRj4hdnsAxVze8i7iPfQr8e4uSP3tRv0rpZcbGUDvxfQQcdwHt/e9PrMvGCsa8OqG9X3w==",
+"license": "MIT"
+},
"node_modules/oniguruma-to-es": {
-"version": "2.3.0",
-"resolved": "https://registry.npmjs.org/oniguruma-to-es/-/oniguruma-to-es-2.3.0.tgz",
-"integrity": "sha512-bwALDxriqfKGfUufKGGepCzu9x7nJQuoRoAFp4AnwehhC2crqrDIAP/uN2qdlsAvSMpeRC3+Yzhqc7hLmle5+g==",
+"version": "4.3.3",
+"resolved": "https://registry.npmjs.org/oniguruma-to-es/-/oniguruma-to-es-4.3.3.tgz",
+"integrity": "sha512-rPiZhzC3wXwE59YQMRDodUwwT9FZ9nNBwQQfsd1wfdtlKEyCdRV0avrTcSZ5xlIvGRVPd/cx6ZN45ECmS39xvg==",
"license": "MIT",
"dependencies": {
-"emoji-regex-xs": "^1.0.0",
-"regex": "^5.1.1",
-"regex-recursion": "^5.1.1"
+"oniguruma-parser": "^0.12.1",
+"regex": "^6.0.1",
+"regex-recursion": "^6.0.2"
}
},
"node_modules/openid-client": {
@@ -12613,21 +12611,20 @@
}
},
"node_modules/regex": {
-"version": "5.1.1",
-"resolved": "https://registry.npmjs.org/regex/-/regex-5.1.1.tgz",
-"integrity": "sha512-dN5I359AVGPnwzJm2jN1k0W9LPZ+ePvoOeVMMfqIMFz53sSwXkxaJoxr50ptnsC771lK95BnTrVSZxq0b9yCGw==",
+"version": "6.0.1",
+"resolved": "https://registry.npmjs.org/regex/-/regex-6.0.1.tgz",
+"integrity": "sha512-uorlqlzAKjKQZ5P+kTJr3eeJGSVroLKoHmquUj4zHWuR+hEyNqlXsSKlYYF5F4NI6nl7tWCs0apKJ0lmfsXAPA==",
"license": "MIT",
"dependencies": {
"regex-utilities": "^2.3.0"
}
},
"node_modules/regex-recursion": {
-"version": "5.1.1",
-"resolved": "https://registry.npmjs.org/regex-recursion/-/regex-recursion-5.1.1.tgz",
-"integrity": "sha512-ae7SBCbzVNrIjgSbh7wMznPcQel1DNlDtzensnFxpiNpXt1U2ju/bHugH422r+4LAVS1FpW1YCwilmnNsjum9w==",
+"version": "6.0.2",
+"resolved": "https://registry.npmjs.org/regex-recursion/-/regex-recursion-6.0.2.tgz",
+"integrity": "sha512-0YCaSCq2VRIebiaUviZNs0cBz1kg5kVS2UKUfNIx8YVs1cN3AV7NTctO5FOKBA+UT2BPJIWZauYHPqJODG50cg==",
"license": "MIT",
"dependencies": {
-"regex": "^5.1.1",
"regex-utilities": "^2.3.0"
}
},
@@ -13165,18 +13162,18 @@
}
},
"node_modules/shiki": {
-"version": "1.29.2",
-"resolved": "https://registry.npmjs.org/shiki/-/shiki-1.29.2.tgz",
-"integrity": "sha512-njXuliz/cP+67jU2hukkxCNuH1yUi4QfdZZY+sMr5PPrIyXSu5iTb/qYC4BiWWB0vZ+7TbdvYUCeL23zpwCfbg==",
+"version": "3.13.0",
+"resolved": "https://registry.npmjs.org/shiki/-/shiki-3.13.0.tgz",
+"integrity": "sha512-aZW4l8Og16CokuCLf8CF8kq+KK2yOygapU5m3+hoGw0Mdosc6fPitjM+ujYarppj5ZIKGyPDPP1vqmQhr+5/0g==",
"license": "MIT",
"dependencies": {
-"@shikijs/core": "1.29.2",
-"@shikijs/engine-javascript": "1.29.2",
-"@shikijs/engine-oniguruma": "1.29.2",
-"@shikijs/langs": "1.29.2",
-"@shikijs/themes": "1.29.2",
-"@shikijs/types": "1.29.2",
-"@shikijs/vscode-textmate": "^10.0.1",
+"@shikijs/core": "3.13.0",
+"@shikijs/engine-javascript": "3.13.0",
+"@shikijs/engine-oniguruma": "3.13.0",
+"@shikijs/langs": "3.13.0",
+"@shikijs/themes": "3.13.0",
+"@shikijs/types": "3.13.0",
+"@shikijs/vscode-textmate": "^10.0.2",
"@types/hast": "^3.0.4"
}
},
@@ -13970,9 +13967,9 @@
"license": "0BSD"
},
"node_modules/tw-animate-css": {
-"version": "1.2.9",
-"resolved": "https://registry.npmjs.org/tw-animate-css/-/tw-animate-css-1.2.9.tgz",
-"integrity": "sha512-9O4k1at9pMQff9EAcCEuy1UNO43JmaPQvq+0lwza9Y0BQ6LB38NiMj+qHqjoQf40355MX+gs6wtlR6H9WsSXFg==",
+"version": "1.4.0",
+"resolved": "https://registry.npmjs.org/tw-animate-css/-/tw-animate-css-1.4.0.tgz",
+"integrity": "sha512-7bziOlRqH0hJx80h/3mbicLW7o8qLsH5+RaLR2t+OHM3D0JlWGODQKQ4cxbK7WlvmUxpcj6Kgu6EKqjrGFe3QQ==",
"dev": true,
"license": "MIT",
"funding": {

View file

@@ -33,7 +33,7 @@
"react-markdown": "^10.1.0",
"remark-gfm": "^4.0.1",
"remeda": "^2.32.0",
-"shiki": "^1.29.2",
+"shiki": "^3.13.0",
"sonner": "^2.0.7",
"tailwind-merge": "^3.3.1"
},
@@ -56,7 +56,7 @@
"prettier": "3.6.2",
"tailwindcss": "^4",
"ts-node": "^10.9.2",
-"tw-animate-css": "^1.2.9",
+"tw-animate-css": "^1.4.0",
"typescript": "^5"
}
}

View file

@@ -167,6 +167,8 @@ def make_mcp_server(required_auth_token: str | None = None, tools: dict[str, Cal
from starlette.responses import Response
from starlette.routing import Mount, Route
+from llama_stack.log import get_logger
server = FastMCP("FastMCP Test Server", log_level="WARNING")
tools = tools or default_tools()
@@ -211,6 +213,7 @@ def make_mcp_server(required_auth_token: str | None = None, tools: dict[str, Cal
return sock.getsockname()[1]
port = get_open_port()
+logger = get_logger(__name__, category="tests::mcp")
# make uvicorn logs be less verbose
config = uvicorn.Config(app, host="0.0.0.0", port=port, log_level="warning")
@@ -218,10 +221,17 @@ def make_mcp_server(required_auth_token: str | None = None, tools: dict[str, Cal
app.state.uvicorn_server = server_instance
def run_server():
+try:
+logger.info(f"Starting MCP server on port {port}")
server_instance.run()
+logger.info(f"MCP server on port {port} has stopped")
+except Exception as e:
+logger.error(f"MCP server failed to start on port {port}: {e}")
+raise
# Start the server in a new thread
server_thread = threading.Thread(target=run_server, daemon=True)
+logger.info(f"Starting MCP server thread on port {port}")
server_thread.start()
# Polling until the server is ready
@@ -229,24 +239,36 @@ def make_mcp_server(required_auth_token: str | None = None, tools: dict[str, Cal
start_time = time.time()
server_url = f"http://localhost:{port}/sse"
+logger.info(f"Waiting for MCP server to be ready at {server_url}")
while time.time() - start_time < timeout:
try:
response = httpx.get(server_url)
if response.status_code in [200, 401]:
+logger.info(f"MCP server is ready on port {port} (status: {response.status_code})")
break
-except httpx.RequestError:
+except httpx.RequestError as e:
+logger.debug(f"Server not ready yet, retrying... ({e})")
pass
time.sleep(0.1)
+else:
+# If we exit the loop due to timeout
+logger.error(f"MCP server failed to start within {timeout} seconds on port {port}")
+logger.error(f"Thread alive: {server_thread.is_alive()}")
+if server_thread.is_alive():
+logger.error("Server thread is still running but not responding to HTTP requests")
try:
yield {"server_url": server_url}
finally:
+logger.info(f"Shutting down MCP server on port {port}")
server_instance.should_exit = True
time.sleep(0.5)
# Force shutdown if still running
if server_thread.is_alive():
try:
+logger.info("Force shutting down server thread")
if hasattr(server_instance, "servers") and server_instance.servers:
for srv in server_instance.servers:
srv.close()
@@ -254,9 +276,9 @@ def make_mcp_server(required_auth_token: str | None = None, tools: dict[str, Cal
# Wait for graceful shutdown
server_thread.join(timeout=3)
if server_thread.is_alive():
-print("Warning: Server thread still alive after shutdown attempt")
+logger.warning("Server thread still alive after shutdown attempt")
except Exception as e:
-print(f"Error during server shutdown: {e}")
+logger.error(f"Error during server shutdown: {e}")
# CRITICAL: Reset SSE global state to prevent event loop contamination
# Reset the SSE AppStatus singleton that stores anyio.Event objects
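The readiness check is a common poll-until-up pattern: hit the endpoint repeatedly, treat 200 (ready) or 401 (up, but auth required) as success, and fall into the while/else branch on timeout. A standalone sketch of the same loop:

import time

import httpx

def wait_until_ready(url: str, timeout: float = 30.0) -> bool:
    # poll until the endpoint answers 200 (ready) or 401 (up but requires auth)
    start = time.time()
    while time.time() - start < timeout:
        try:
            if httpx.get(url).status_code in (200, 401):
                return True
        except httpx.RequestError:
            pass  # server not accepting connections yet; retry
        time.sleep(0.1)
    return False  # timed out, mirroring the while/else branch above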

View file

@@ -1,76 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import pytest
from ..test_cases.test_case import TestCase
def skip_if_provider_doesnt_support_batch_inference(client_with_models, model_id):
models = {m.identifier: m for m in client_with_models.models.list()}
models.update({m.provider_resource_id: m for m in client_with_models.models.list()})
provider_id = models[model_id].provider_id
providers = {p.provider_id: p for p in client_with_models.providers.list()}
provider = providers[provider_id]
if provider.provider_type not in ("inline::meta-reference",):
pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support batch inference")
@pytest.mark.parametrize(
"test_case",
[
"inference:completion:batch_completion",
],
)
def test_batch_completion_non_streaming(client_with_models, text_model_id, test_case):
skip_if_provider_doesnt_support_batch_inference(client_with_models, text_model_id)
tc = TestCase(test_case)
content_batch = tc["contents"]
response = client_with_models.inference.batch_completion(
content_batch=content_batch,
model_id=text_model_id,
sampling_params={
"max_tokens": 50,
},
)
assert len(response.batch) == len(content_batch)
for i, r in enumerate(response.batch):
print(f"response {i}: {r.content}")
assert len(r.content) > 10
@pytest.mark.parametrize(
"test_case",
[
"inference:chat_completion:batch_completion",
],
)
def test_batch_chat_completion_non_streaming(client_with_models, text_model_id, test_case):
skip_if_provider_doesnt_support_batch_inference(client_with_models, text_model_id)
tc = TestCase(test_case)
qa_pairs = tc["qa_pairs"]
message_batch = [
[
{
"role": "user",
"content": qa["question"],
}
]
for qa in qa_pairs
]
response = client_with_models.inference.batch_chat_completion(
messages_batch=message_batch,
model_id=text_model_id,
)
assert len(response.batch) == len(qa_pairs)
for i, r in enumerate(response.batch):
print(f"response {i}: {r.completion_message.content}")
assert len(r.completion_message.content) > 0
assert qa_pairs[i]["answer"].lower() in r.completion_message.content.lower()

View file

@@ -1,303 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
#
# Test plan:
#
# Types of input:
# - array of a string
# - array of a image (ImageContentItem, either URL or base64 string)
# - array of a text (TextContentItem)
# Types of output:
# - list of list of floats
# Params:
# - text_truncation
# - absent w/ long text -> error
# - none w/ long text -> error
# - absent w/ short text -> ok
# - none w/ short text -> ok
# - end w/ long text -> ok
# - end w/ short text -> ok
# - start w/ long text -> ok
# - start w/ short text -> ok
# - output_dimension
# - response dimension matches
# - task_type, only for asymmetric models
# - query embedding != passage embedding
# Negative:
# - long string
# - long text
#
# Todo:
# - negative tests
# - empty
# - empty list
# - empty string
# - empty text
# - empty image
# - long
# - large image
# - appropriate combinations
# - batch size
# - many inputs
# - invalid
# - invalid URL
# - invalid base64
#
# Notes:
# - use llama_stack_client fixture
# - use pytest.mark.parametrize when possible
# - no accuracy tests: only check the type of output, not the content
#
import pytest
from llama_stack_client import BadRequestError as LlamaStackBadRequestError
from llama_stack_client.types import EmbeddingsResponse
from llama_stack_client.types.shared.interleaved_content import (
ImageContentItem,
ImageContentItemImage,
ImageContentItemImageURL,
TextContentItem,
)
from openai import BadRequestError as OpenAIBadRequestError
from llama_stack.core.library_client import LlamaStackAsLibraryClient
DUMMY_STRING = "hello"
DUMMY_STRING2 = "world"
DUMMY_LONG_STRING = "NVDA " * 10240
DUMMY_TEXT = TextContentItem(text=DUMMY_STRING, type="text")
DUMMY_TEXT2 = TextContentItem(text=DUMMY_STRING2, type="text")
DUMMY_LONG_TEXT = TextContentItem(text=DUMMY_LONG_STRING, type="text")
# TODO(mf): add a real image URL and base64 string
DUMMY_IMAGE_URL = ImageContentItem(
image=ImageContentItemImage(url=ImageContentItemImageURL(uri="https://example.com/image.jpg")), type="image"
)
DUMMY_IMAGE_BASE64 = ImageContentItem(image=ImageContentItemImage(data="base64string"), type="image")
SUPPORTED_PROVIDERS = {"remote::nvidia"}
MODELS_SUPPORTING_MEDIA = {}
MODELS_SUPPORTING_OUTPUT_DIMENSION = {"nvidia/llama-3.2-nv-embedqa-1b-v2"}
MODELS_REQUIRING_TASK_TYPE = {
"nvidia/llama-3.2-nv-embedqa-1b-v2",
"nvidia/nv-embedqa-e5-v5",
"nvidia/nv-embedqa-mistral-7b-v2",
"snowflake/arctic-embed-l",
}
MODELS_SUPPORTING_TASK_TYPE = MODELS_REQUIRING_TASK_TYPE
def default_task_type(model_id):
"""
Some models require a task type parameter. This provides a default value for
testing those models.
"""
if model_id in MODELS_REQUIRING_TASK_TYPE:
return {"task_type": "query"}
return {}
@pytest.mark.parametrize(
"contents",
[
[DUMMY_STRING, DUMMY_STRING2],
[DUMMY_TEXT, DUMMY_TEXT2],
],
ids=[
"list[string]",
"list[text]",
],
)
def test_embedding_text(llama_stack_client, embedding_model_id, contents, inference_provider_type):
if inference_provider_type not in SUPPORTED_PROVIDERS:
pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
response = llama_stack_client.inference.embeddings(
model_id=embedding_model_id, contents=contents, **default_task_type(embedding_model_id)
)
assert isinstance(response, EmbeddingsResponse)
assert len(response.embeddings) == sum(len(content) if isinstance(content, list) else 1 for content in contents)
assert isinstance(response.embeddings[0], list)
assert isinstance(response.embeddings[0][0], float)
@pytest.mark.parametrize(
"contents",
[
[DUMMY_IMAGE_URL, DUMMY_IMAGE_BASE64],
[DUMMY_IMAGE_URL, DUMMY_STRING, DUMMY_IMAGE_BASE64, DUMMY_TEXT],
],
ids=[
"list[url,base64]",
"list[url,string,base64,text]",
],
)
def test_embedding_image(llama_stack_client, embedding_model_id, contents, inference_provider_type):
if inference_provider_type not in SUPPORTED_PROVIDERS:
pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
if embedding_model_id not in MODELS_SUPPORTING_MEDIA:
pytest.xfail(f"{embedding_model_id} doesn't support media")
response = llama_stack_client.inference.embeddings(
model_id=embedding_model_id, contents=contents, **default_task_type(embedding_model_id)
)
assert isinstance(response, EmbeddingsResponse)
assert len(response.embeddings) == sum(len(content) if isinstance(content, list) else 1 for content in contents)
assert isinstance(response.embeddings[0], list)
assert isinstance(response.embeddings[0][0], float)
@pytest.mark.parametrize(
"text_truncation",
[
"end",
"start",
],
)
@pytest.mark.parametrize(
"contents",
[
[DUMMY_LONG_TEXT],
[DUMMY_STRING],
],
ids=[
"long",
"short",
],
)
def test_embedding_truncation(
llama_stack_client, embedding_model_id, text_truncation, contents, inference_provider_type
):
if inference_provider_type not in SUPPORTED_PROVIDERS:
pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
response = llama_stack_client.inference.embeddings(
model_id=embedding_model_id,
contents=contents,
text_truncation=text_truncation,
**default_task_type(embedding_model_id),
)
assert isinstance(response, EmbeddingsResponse)
assert len(response.embeddings) == 1
assert isinstance(response.embeddings[0], list)
assert isinstance(response.embeddings[0][0], float)
@pytest.mark.parametrize(
"text_truncation",
[
None,
"none",
],
)
@pytest.mark.parametrize(
"contents",
[
[DUMMY_LONG_TEXT],
[DUMMY_LONG_STRING],
],
ids=[
"long-text",
"long-str",
],
)
def test_embedding_truncation_error(
llama_stack_client, embedding_model_id, text_truncation, contents, inference_provider_type
):
if inference_provider_type not in SUPPORTED_PROVIDERS:
pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
# Using LlamaStackClient from llama_stack_client will raise llama_stack_client.BadRequestError
# While using LlamaStackAsLibraryClient from llama_stack.core.library_client will raise the error that the backend raises
error_type = (
OpenAIBadRequestError
if isinstance(llama_stack_client, LlamaStackAsLibraryClient)
else LlamaStackBadRequestError
)
with pytest.raises(error_type):
llama_stack_client.inference.embeddings(
model_id=embedding_model_id,
contents=[DUMMY_LONG_TEXT],
text_truncation=text_truncation,
**default_task_type(embedding_model_id),
)
def test_embedding_output_dimension(llama_stack_client, embedding_model_id, inference_provider_type):
if inference_provider_type not in SUPPORTED_PROVIDERS:
pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
if embedding_model_id not in MODELS_SUPPORTING_OUTPUT_DIMENSION:
pytest.xfail(f"{embedding_model_id} doesn't support output_dimension")
base_response = llama_stack_client.inference.embeddings(
model_id=embedding_model_id, contents=[DUMMY_STRING], **default_task_type(embedding_model_id)
)
test_response = llama_stack_client.inference.embeddings(
model_id=embedding_model_id,
contents=[DUMMY_STRING],
**default_task_type(embedding_model_id),
output_dimension=32,
)
assert len(base_response.embeddings[0]) != len(test_response.embeddings[0])
assert len(test_response.embeddings[0]) == 32
def test_embedding_task_type(llama_stack_client, embedding_model_id, inference_provider_type):
if inference_provider_type not in SUPPORTED_PROVIDERS:
pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
if embedding_model_id not in MODELS_SUPPORTING_TASK_TYPE:
pytest.xfail(f"{embedding_model_id} doesn't support task_type")
query_embedding = llama_stack_client.inference.embeddings(
model_id=embedding_model_id, contents=[DUMMY_STRING], task_type="query"
)
document_embedding = llama_stack_client.inference.embeddings(
model_id=embedding_model_id, contents=[DUMMY_STRING], task_type="document"
)
assert query_embedding.embeddings != document_embedding.embeddings
@pytest.mark.parametrize(
"text_truncation",
[
None,
"none",
"end",
"start",
],
)
def test_embedding_text_truncation(llama_stack_client, embedding_model_id, text_truncation, inference_provider_type):
if inference_provider_type not in SUPPORTED_PROVIDERS:
pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
response = llama_stack_client.inference.embeddings(
model_id=embedding_model_id,
contents=[DUMMY_STRING],
text_truncation=text_truncation,
**default_task_type(embedding_model_id),
)
assert isinstance(response, EmbeddingsResponse)
assert len(response.embeddings) == 1
assert isinstance(response.embeddings[0], list)
assert isinstance(response.embeddings[0][0], float)
@pytest.mark.parametrize(
"text_truncation",
[
"NONE",
"END",
"START",
"left",
"right",
],
)
def test_embedding_text_truncation_error(
llama_stack_client, embedding_model_id, text_truncation, inference_provider_type
):
if inference_provider_type not in SUPPORTED_PROVIDERS:
pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
error_type = ValueError if isinstance(llama_stack_client, LlamaStackAsLibraryClient) else LlamaStackBadRequestError
with pytest.raises(error_type):
llama_stack_client.inference.embeddings(
model_id=embedding_model_id,
contents=[DUMMY_STRING],
text_truncation=text_truncation,
**default_task_type(embedding_model_id),
)

View file

@@ -9,6 +9,7 @@ import time
import unicodedata
import pytest
+from pydantic import BaseModel
from ..test_cases.test_case import TestCase
@@ -62,6 +63,14 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id)
pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.")
+def skip_if_doesnt_support_completions_logprobs(client_with_models, model_id):
+provider_type = provider_from_model(client_with_models, model_id).provider_type
+if provider_type in (
+"remote::ollama", # logprobs is ignored
+):
+pytest.skip(f"Model {model_id} hosted by {provider_type} doesn't support /v1/completions logprobs.")
def skip_if_model_doesnt_support_suffix(client_with_models, model_id):
# To test `fim` ( fill in the middle ) completion, we need to use a model that supports suffix.
# Use this to specifically test this API functionality.
@@ -205,28 +214,6 @@ def test_openai_completion_streaming(llama_stack_client, client_with_models, tex
assert len(content_str) > 10
@pytest.mark.parametrize(
"prompt_logprobs",
[
1,
0,
],
)
def test_openai_completion_prompt_logprobs(llama_stack_client, client_with_models, text_model_id, prompt_logprobs):
skip_if_provider_isnt_vllm(client_with_models, text_model_id)
prompt = "Hello, world!"
response = llama_stack_client.completions.create(
model=text_model_id,
prompt=prompt,
stream=False,
prompt_logprobs=prompt_logprobs,
)
assert len(response.choices) > 0
choice = response.choices[0]
assert len(choice.prompt_logprobs) > 0
def test_openai_completion_guided_choice(llama_stack_client, client_with_models, text_model_id):
skip_if_provider_isnt_vllm(client_with_models, text_model_id)
@@ -518,3 +505,214 @@ def test_openai_chat_completion_non_streaming_with_file(openai_client, client_wi
message_content = response.choices[0].message.content.lower().strip()
normalized_content = _normalize_text(message_content)
assert "hello world" in normalized_content
@pytest.mark.parametrize(
"test_case",
[
"inference:completion:stop_sequence",
],
)
def test_openai_completion_stop_sequence(client_with_models, openai_client, text_model_id, test_case):
skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
tc = TestCase(test_case)
response = openai_client.completions.create(
model=text_model_id,
prompt=tc["content"],
stop="1963",
stream=False,
)
assert len(response.choices) > 0
choice = response.choices[0]
assert "1963" not in choice.text
response = openai_client.completions.create(
model=text_model_id,
prompt=tc["content"],
stop=["blathering", "1963"],
stream=False,
)
assert len(response.choices) > 0
choice = response.choices[0]
assert "1963" not in choice.text
@pytest.mark.parametrize(
"test_case",
[
"inference:completion:log_probs",
],
)
def test_openai_completion_logprobs(client_with_models, openai_client, text_model_id, test_case):
skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
skip_if_doesnt_support_completions_logprobs(client_with_models, text_model_id)
tc = TestCase(test_case)
response = openai_client.completions.create(
model=text_model_id,
prompt=tc["content"],
logprobs=5,
)
assert len(response.choices) > 0
choice = response.choices[0]
assert choice.text, "Response text should not be empty"
assert choice.logprobs, "Logprobs should not be empty"
logprobs = choice.logprobs
assert logprobs.token_logprobs, "Response tokens should not be empty"
assert len(logprobs.tokens) == len(logprobs.token_logprobs)
assert len(logprobs.token_logprobs) == len(logprobs.top_logprobs)
for i, (token, prob) in enumerate(zip(logprobs.tokens, logprobs.token_logprobs, strict=True)):
assert logprobs.top_logprobs[i][token] == prob
assert len(logprobs.top_logprobs[i]) == 5
@pytest.mark.parametrize(
"test_case",
[
"inference:completion:log_probs",
],
)
def test_openai_completion_logprobs_streaming(client_with_models, openai_client, text_model_id, test_case):
skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
skip_if_doesnt_support_completions_logprobs(client_with_models, text_model_id)
tc = TestCase(test_case)
response = openai_client.completions.create(
model=text_model_id,
prompt=tc["content"],
logprobs=3,
stream=True,
max_tokens=5,
)
for chunk in response:
choice = chunk.choices[0]
if choice.text: # if there's a token, we expect logprobs
assert choice.logprobs, "Logprobs should not be empty"
logprobs = choice.logprobs
assert logprobs.token_logprobs, "Response tokens should not be empty"
assert len(logprobs.tokens) == len(logprobs.token_logprobs)
assert len(logprobs.token_logprobs) == len(logprobs.top_logprobs)
for i, (token, prob) in enumerate(zip(logprobs.tokens, logprobs.token_logprobs, strict=True)):
assert logprobs.top_logprobs[i][token] == prob
assert len(logprobs.top_logprobs[i]) == 3
else: # no token, no logprobs
assert not choice.logprobs, "Logprobs should be empty"
@pytest.mark.parametrize(
"test_case",
[
"inference:chat_completion:tool_calling",
],
)
def test_openai_chat_completion_with_tools(openai_client, text_model_id, test_case):
tc = TestCase(test_case)
response = openai_client.chat.completions.create(
model=text_model_id,
messages=tc["messages"],
tools=tc["tools"],
tool_choice="auto",
stream=False,
)
assert len(response.choices) == 1
assert len(response.choices[0].message.tool_calls) == 1
tool_call = response.choices[0].message.tool_calls[0]
assert tool_call.function.name == tc["tools"][0]["function"]["name"]
assert "location" in tool_call.function.arguments
assert tc["expected"]["location"] in tool_call.function.arguments
@pytest.mark.parametrize(
"test_case",
[
"inference:chat_completion:tool_calling",
],
)
def test_openai_chat_completion_with_tools_and_streaming(openai_client, text_model_id, test_case):
tc = TestCase(test_case)
response = openai_client.chat.completions.create(
model=text_model_id,
messages=tc["messages"],
tools=tc["tools"],
tool_choice="auto",
stream=True,
)
# Accumulate tool calls from streaming chunks
tool_calls = []
for chunk in response:
if chunk.choices and chunk.choices[0].delta.tool_calls:
for i, tc_delta in enumerate(chunk.choices[0].delta.tool_calls):
while len(tool_calls) <= i:
tool_calls.append({"function": {"name": "", "arguments": ""}})
if tc_delta.function and tc_delta.function.name:
tool_calls[i]["function"]["name"] = tc_delta.function.name
if tc_delta.function and tc_delta.function.arguments:
tool_calls[i]["function"]["arguments"] += tc_delta.function.arguments
assert len(tool_calls) == 1
tool_call = tool_calls[0]
assert tool_call["function"]["name"] == tc["tools"][0]["function"]["name"]
assert "location" in tool_call["function"]["arguments"]
assert tc["expected"]["location"] in tool_call["function"]["arguments"]
@pytest.mark.parametrize(
"test_case",
[
"inference:chat_completion:tool_calling",
],
)
def test_openai_chat_completion_with_tool_choice_none(openai_client, text_model_id, test_case):
tc = TestCase(test_case)
response = openai_client.chat.completions.create(
model=text_model_id,
messages=tc["messages"],
tools=tc["tools"],
tool_choice="none",
stream=False,
)
assert len(response.choices) == 1
tool_calls = response.choices[0].message.tool_calls
assert tool_calls is None or len(tool_calls) == 0
@pytest.mark.parametrize(
"test_case",
[
"inference:chat_completion:structured_output",
],
)
def test_openai_chat_completion_structured_output(openai_client, text_model_id, test_case):
# Note: Skip condition may need adjustment for OpenAI client
class AnswerFormat(BaseModel):
first_name: str
last_name: str
year_of_birth: int
tc = TestCase(test_case)
response = openai_client.chat.completions.create(
model=text_model_id,
messages=tc["messages"],
response_format={
"type": "json_schema",
"json_schema": {
"name": "AnswerFormat",
"schema": AnswerFormat.model_json_schema(),
},
},
stream=False,
)
print(response.choices[0].message.content)
answer = AnswerFormat.model_validate_json(response.choices[0].message.content)
expected = tc["expected"]
assert answer.first_name == expected["first_name"]
assert answer.last_name == expected["last_name"]
assert answer.year_of_birth == expected["year_of_birth"]

View file

@@ -0,0 +1,77 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import base64
import pathlib
import pytest
@pytest.fixture
def image_path():
return pathlib.Path(__file__).parent / "dog.png"
@pytest.fixture
def base64_image_data(image_path):
return base64.b64encode(image_path.read_bytes()).decode("utf-8")
async def test_openai_chat_completion_image_url(openai_client, vision_model_id):
message = {
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/integration/inference/dog.png"
},
},
{
"type": "text",
"text": "Describe what is in this image.",
},
],
}
response = openai_client.chat.completions.create(
model=vision_model_id,
messages=[message],
stream=False,
)
message_content = response.choices[0].message.content.lower().strip()
assert len(message_content) > 0
assert any(expected in message_content for expected in {"dog", "puppy", "pup"})
async def test_openai_chat_completion_image_data(openai_client, vision_model_id, base64_image_data):
message = {
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": f"data:image/png;base64,{base64_image_data}",
},
},
{
"type": "text",
"text": "Describe what is in this image.",
},
],
}
response = openai_client.chat.completions.create(
model=vision_model_id,
messages=[message],
stream=False,
)
message_content = response.choices[0].message.content.lower().strip()
assert len(message_content) > 0
assert any(expected in message_content for expected in {"dog", "puppy", "pup"})

View file

@@ -1,545 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from time import sleep
import pytest
from pydantic import BaseModel
from llama_stack.models.llama.sku_list import resolve_model
from ..test_cases.test_case import TestCase
PROVIDER_LOGPROBS_TOP_K = {"remote::together", "remote::fireworks", "remote::vllm"}
def skip_if_model_doesnt_support_completion(client_with_models, model_id):
models = {m.identifier: m for m in client_with_models.models.list()}
models.update({m.provider_resource_id: m for m in client_with_models.models.list()})
provider_id = models[model_id].provider_id
providers = {p.provider_id: p for p in client_with_models.providers.list()}
provider = providers[provider_id]
if (
provider.provider_type
in (
"remote::openai",
"remote::anthropic",
"remote::gemini",
"remote::vertexai",
"remote::groq",
"remote::sambanova",
"remote::azure",
)
or "openai-compat" in provider.provider_type
):
pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support completion")
def skip_if_model_doesnt_support_json_schema_structured_output(client_with_models, model_id):
models = {m.identifier: m for m in client_with_models.models.list()}
models.update({m.provider_resource_id: m for m in client_with_models.models.list()})
provider_id = models[model_id].provider_id
providers = {p.provider_id: p for p in client_with_models.providers.list()}
provider = providers[provider_id]
if provider.provider_type in ("remote::sambanova", "remote::azure", "remote::watsonx"):
pytest.skip(
f"Model {model_id} hosted by {provider.provider_type} doesn't support json_schema structured output"
)
def get_llama_model(client_with_models, model_id):
models = {}
for m in client_with_models.models.list():
models[m.identifier] = m
models[m.provider_resource_id] = m
assert model_id in models, f"Model {model_id} not found"
model = models[model_id]
ids = (model.identifier, model.provider_resource_id)
for mid in ids:
if resolve_model(mid):
return mid
return model.metadata.get("llama_model", None)
@pytest.mark.parametrize(
"test_case",
[
"inference:completion:sanity",
],
)
def test_text_completion_non_streaming(client_with_models, text_model_id, test_case):
skip_if_model_doesnt_support_completion(client_with_models, text_model_id)
tc = TestCase(test_case)
response = client_with_models.inference.completion(
content=tc["content"],
stream=False,
model_id=text_model_id,
sampling_params={
"max_tokens": 50,
},
)
assert len(response.content) > 10
# assert "blue" in response.content.lower().strip()
@pytest.mark.parametrize(
"test_case",
[
"inference:completion:sanity",
],
)
def test_text_completion_streaming(client_with_models, text_model_id, test_case):
skip_if_model_doesnt_support_completion(client_with_models, text_model_id)
tc = TestCase(test_case)
response = client_with_models.inference.completion(
content=tc["content"],
stream=True,
model_id=text_model_id,
sampling_params={
"max_tokens": 50,
},
)
streamed_content = [chunk.delta for chunk in response]
content_str = "".join(streamed_content).lower().strip()
# assert "blue" in content_str
assert len(content_str) > 10
@pytest.mark.parametrize(
"test_case",
[
"inference:completion:stop_sequence",
],
)
def test_text_completion_stop_sequence(client_with_models, text_model_id, inference_provider_type, test_case):
skip_if_model_doesnt_support_completion(client_with_models, text_model_id)
# This is only supported/tested for remote vLLM: https://github.com/meta-llama/llama-stack/issues/1771
if inference_provider_type != "remote::vllm":
pytest.xfail(f"{inference_provider_type} doesn't support 'stop' parameter yet")
tc = TestCase(test_case)
response = client_with_models.inference.completion(
content=tc["content"],
stream=True,
model_id=text_model_id,
sampling_params={
"max_tokens": 50,
"stop": ["1963"],
},
)
streamed_content = [chunk.delta for chunk in response]
content_str = "".join(streamed_content).lower().strip()
assert "1963" not in content_str
@pytest.mark.parametrize(
"test_case",
[
"inference:completion:log_probs",
],
)
def test_text_completion_log_probs_non_streaming(client_with_models, text_model_id, inference_provider_type, test_case):
skip_if_model_doesnt_support_completion(client_with_models, text_model_id)
if inference_provider_type not in PROVIDER_LOGPROBS_TOP_K:
pytest.xfail(f"{inference_provider_type} doesn't support log probs yet")
tc = TestCase(test_case)
response = client_with_models.inference.completion(
content=tc["content"],
stream=False,
model_id=text_model_id,
sampling_params={
"max_tokens": 5,
},
logprobs={
"top_k": 1,
},
)
assert response.logprobs, "Logprobs should not be empty"
assert 1 <= len(response.logprobs) <= 5 # each token has 1 logprob and here max_tokens=5
assert all(len(logprob.logprobs_by_token) == 1 for logprob in response.logprobs)
@pytest.mark.parametrize(
"test_case",
[
"inference:completion:log_probs",
],
)
def test_text_completion_log_probs_streaming(client_with_models, text_model_id, inference_provider_type, test_case):
skip_if_model_doesnt_support_completion(client_with_models, text_model_id)
if inference_provider_type not in PROVIDER_LOGPROBS_TOP_K:
pytest.xfail(f"{inference_provider_type} doesn't support log probs yet")
tc = TestCase(test_case)
response = client_with_models.inference.completion(
content=tc["content"],
stream=True,
model_id=text_model_id,
sampling_params={
"max_tokens": 5,
},
logprobs={
"top_k": 1,
},
)
streamed_content = list(response)
for chunk in streamed_content:
if chunk.delta: # if there's a token, we expect logprobs
assert chunk.logprobs, "Logprobs should not be empty"
assert all(len(logprob.logprobs_by_token) == 1 for logprob in chunk.logprobs)
else: # no token, no logprobs
assert not chunk.logprobs, "Logprobs should be empty"
@pytest.mark.parametrize(
"test_case",
[
"inference:completion:structured_output",
],
)
def test_text_completion_structured_output(client_with_models, text_model_id, test_case):
skip_if_model_doesnt_support_completion(client_with_models, text_model_id)
skip_if_model_doesnt_support_json_schema_structured_output(client_with_models, text_model_id)
class AnswerFormat(BaseModel):
name: str
year_born: str
year_retired: str
tc = TestCase(test_case)
user_input = tc["user_input"]
response = client_with_models.inference.completion(
model_id=text_model_id,
content=user_input,
stream=False,
sampling_params={
"max_tokens": 50,
},
response_format={
"type": "json_schema",
"json_schema": AnswerFormat.model_json_schema(),
},
)
answer = AnswerFormat.model_validate_json(response.content)
expected = tc["expected"]
assert answer.name == expected["name"]
assert answer.year_born == expected["year_born"]
assert answer.year_retired == expected["year_retired"]
@pytest.mark.parametrize(
"test_case",
[
"inference:chat_completion:non_streaming_01",
"inference:chat_completion:non_streaming_02",
],
)
def test_text_chat_completion_non_streaming(client_with_models, text_model_id, test_case):
tc = TestCase(test_case)
question = tc["question"]
expected = tc["expected"]
response = client_with_models.inference.chat_completion(
model_id=text_model_id,
messages=[
{
"role": "user",
"content": question,
}
],
stream=False,
)
message_content = response.completion_message.content.lower().strip()
assert len(message_content) > 0
assert expected.lower() in message_content
@pytest.mark.parametrize(
"test_case",
[
"inference:chat_completion:streaming_01",
"inference:chat_completion:streaming_02",
],
)
def test_text_chat_completion_streaming(client_with_models, text_model_id, test_case):
tc = TestCase(test_case)
question = tc["question"]
expected = tc["expected"]
response = client_with_models.inference.chat_completion(
model_id=text_model_id,
messages=[{"role": "user", "content": question}],
stream=True,
timeout=120, # Increase timeout to 2 minutes for large conversation history
)
streamed_content = [str(chunk.event.delta.text.lower().strip()) for chunk in response]
assert len(streamed_content) > 0
assert expected.lower() in "".join(streamed_content)
@pytest.mark.parametrize(
"test_case",
[
"inference:chat_completion:tool_calling",
],
)
def test_text_chat_completion_with_tool_calling_and_non_streaming(client_with_models, text_model_id, test_case):
tc = TestCase(test_case)
response = client_with_models.inference.chat_completion(
model_id=text_model_id,
messages=tc["messages"],
tools=tc["tools"],
tool_choice="auto",
stream=False,
)
# some models can return content for the response in addition to the tool call
assert response.completion_message.role == "assistant"
assert len(response.completion_message.tool_calls) == 1
assert response.completion_message.tool_calls[0].tool_name == tc["tools"][0]["tool_name"]
assert response.completion_message.tool_calls[0].arguments == tc["expected"]
# Will extract streamed text and separate it from tool invocation content
# The returned tool invocation content will be a string so it's easy to compare with expected value
# e.g. "[get_weather, {'location': 'San Francisco, CA'}]"
def extract_tool_invocation_content(response):
tool_invocation_content: str = ""
for chunk in response:
delta = chunk.event.delta
if delta.type == "tool_call" and delta.parse_status == "succeeded":
call = delta.tool_call
tool_invocation_content += f"[{call.tool_name}, {call.arguments}]"
return tool_invocation_content
@pytest.mark.parametrize(
"test_case",
[
"inference:chat_completion:tool_calling",
],
)
def test_text_chat_completion_with_tool_calling_and_streaming(client_with_models, text_model_id, test_case):
tc = TestCase(test_case)
response = client_with_models.inference.chat_completion(
model_id=text_model_id,
messages=tc["messages"],
tools=tc["tools"],
tool_choice="auto",
stream=True,
)
tool_invocation_content = extract_tool_invocation_content(response)
expected_tool_name = tc["tools"][0]["tool_name"]
expected_argument = tc["expected"]
assert tool_invocation_content == f"[{expected_tool_name}, {expected_argument}]"
@pytest.mark.parametrize(
"test_case",
[
"inference:chat_completion:tool_calling",
],
)
def test_text_chat_completion_with_tool_choice_required(client_with_models, text_model_id, test_case):
tc = TestCase(test_case)
response = client_with_models.inference.chat_completion(
model_id=text_model_id,
messages=tc["messages"],
tools=tc["tools"],
tool_config={
"tool_choice": "required",
},
stream=True,
)
tool_invocation_content = extract_tool_invocation_content(response)
expected_tool_name = tc["tools"][0]["tool_name"]
expected_argument = tc["expected"]
assert tool_invocation_content == f"[{expected_tool_name}, {expected_argument}]"
@pytest.mark.parametrize(
"test_case",
[
"inference:chat_completion:tool_calling",
],
)
def test_text_chat_completion_with_tool_choice_none(client_with_models, text_model_id, test_case):
tc = TestCase(test_case)
response = client_with_models.inference.chat_completion(
model_id=text_model_id,
messages=tc["messages"],
tools=tc["tools"],
tool_config={"tool_choice": "none"},
stream=True,
)
tool_invocation_content = extract_tool_invocation_content(response)
assert tool_invocation_content == ""
@pytest.mark.parametrize(
"test_case",
[
"inference:chat_completion:structured_output",
],
)
def test_text_chat_completion_structured_output(client_with_models, text_model_id, test_case):
skip_if_model_doesnt_support_json_schema_structured_output(client_with_models, text_model_id)
class NBAStats(BaseModel):
year_for_draft: int
num_seasons_in_nba: int
class AnswerFormat(BaseModel):
first_name: str
last_name: str
year_of_birth: int
nba_stats: NBAStats
tc = TestCase(test_case)
response = client_with_models.inference.chat_completion(
model_id=text_model_id,
messages=tc["messages"],
response_format={
"type": "json_schema",
"json_schema": AnswerFormat.model_json_schema(),
},
stream=False,
)
answer = AnswerFormat.model_validate_json(response.completion_message.content)
expected = tc["expected"]
assert answer.first_name == expected["first_name"]
assert answer.last_name == expected["last_name"]
assert answer.year_of_birth == expected["year_of_birth"]
assert answer.nba_stats.num_seasons_in_nba == expected["num_seasons_in_nba"]
assert answer.nba_stats.year_for_draft == expected["year_for_draft"]
@pytest.mark.parametrize("streaming", [True, False])
@pytest.mark.parametrize(
"test_case",
[
"inference:chat_completion:tool_calling_tools_absent",
],
)
def test_text_chat_completion_tool_calling_tools_not_in_request(
client_with_models, text_model_id, test_case, streaming
):
tc = TestCase(test_case)
# TODO: more dynamic lookup on tool_prompt_format for model family
tool_prompt_format = "json" if "3.1" in text_model_id else "python_list"
request = {
"model_id": text_model_id,
"messages": tc["messages"],
"tools": tc["tools"],
"tool_choice": "auto",
"tool_prompt_format": tool_prompt_format,
"stream": streaming,
}
response = client_with_models.inference.chat_completion(**request)
if streaming:
for chunk in response:
delta = chunk.event.delta
if delta.type == "tool_call" and delta.parse_status == "succeeded":
assert delta.tool_call.tool_name == "get_object_namespace_list"
if delta.type == "tool_call" and delta.parse_status == "failed":
# expect raw message that failed to parse in tool_call
assert isinstance(delta.tool_call, str)
assert len(delta.tool_call) > 0
else:
        for tool_call in response.completion_message.tool_calls:
            assert tool_call.tool_name == "get_object_namespace_list"
@pytest.mark.parametrize(
"test_case",
[
# Tests if the model can handle simple messages like "Hi" or
# a message unrelated to one of the tool calls
"inference:chat_completion:text_then_tool",
        # Tests if the model can complete a full tool call with responses correctly
        "inference:chat_completion:tool_then_answer",
        # Tests if the model can generate multiple params and
        # read outputs correctly
"inference:chat_completion:array_parameter",
],
)
def test_text_chat_completion_with_multi_turn_tool_calling(client_with_models, text_model_id, test_case):
"""This test tests the model's tool calling loop in various scenarios"""
if "llama-4" not in text_model_id.lower() and "llama4" not in text_model_id.lower():
pytest.xfail("Not tested for non-llama4 models yet")
tc = TestCase(test_case)
messages = []
    # keep going until either:
    # 1. there are new messages to test in the multi-turn exchange, or
    # 2. no new messages remain but the last message is a tool response
while len(tc["messages"]) > 0 or (len(messages) > 0 and messages[-1]["role"] == "tool"):
# do not take new messages if last message is tool response
if len(messages) == 0 or messages[-1]["role"] != "tool":
new_messages = tc["messages"].pop(0)
messages += new_messages
# pprint(messages)
response = client_with_models.inference.chat_completion(
model_id=text_model_id,
messages=messages,
tools=tc["tools"],
stream=False,
sampling_params={
"strategy": {
"type": "top_p",
"top_p": 0.9,
"temperature": 0.6,
}
},
)
op_msg = response.completion_message
messages.append(op_msg.model_dump())
# print(op_msg)
assert op_msg.role == "assistant"
expected = tc["expected"].pop(0)
assert len(op_msg.tool_calls) == expected["num_tool_calls"]
if expected["num_tool_calls"] > 0:
assert op_msg.tool_calls[0].tool_name == expected["tool_name"]
assert op_msg.tool_calls[0].arguments == expected["tool_arguments"]
tool_response = tc["tool_responses"].pop(0)
messages.append(
# Tool Response Message
{
"role": "tool",
"call_id": op_msg.tool_calls[0].call_id,
"content": tool_response["response"],
}
)
else:
actual_answer = op_msg.content.lower()
# pprint(actual_answer)
assert expected["answer"] in actual_answer
# sleep to avoid rate limit
sleep(1)
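
# The loop above doubles as a reference for the tool-calling protocol these
# cases exercise. Below is a minimal sketch of the same driver loop, assuming
# a hypothetical `execute_tool` callback (not part of this test suite):
def run_tool_calling_turn(client, model_id, tools, messages, execute_tool):
    """Chat until the model stops requesting tools, then return its answer."""
    while True:
        response = client.inference.chat_completion(
            model_id=model_id,
            messages=messages,
            tools=tools,
            stream=False,
        )
        op_msg = response.completion_message
        messages.append(op_msg.model_dump())
        if not op_msg.tool_calls:
            return op_msg.content
        call = op_msg.tool_calls[0]
        # Feed the tool output back so the model can use it on the next pass.
        messages.append(
            {
                "role": "tool",
                "call_id": call.call_id,
                "content": execute_tool(call.tool_name, call.arguments),
            }
        )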

View file

@ -25,16 +25,19 @@ def base64_image_data(image_path):
     return base64.b64encode(image_path.read_bytes()).decode("utf-8")


+@pytest.fixture
+def base64_image_url(base64_image_data):
+    return f"data:image/png;base64,{base64_image_data}"
+
+
 def test_image_chat_completion_non_streaming(client_with_models, vision_model_id):
     message = {
         "role": "user",
         "content": [
             {
-                "type": "image",
-                "image": {
-                    "url": {
-                        "uri": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/integration/inference/dog.png"
-                    },
+                "type": "image_url",
+                "image_url": {
+                    "url": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/integration/inference/dog.png"
                 },
             },
             {
@ -43,12 +46,12 @@ def test_image_chat_completion_non_streaming(client_with_models, vision_model_id
             },
         ],
     }
-    response = client_with_models.inference.chat_completion(
-        model_id=vision_model_id,
+    response = client_with_models.chat.completions.create(
+        model=vision_model_id,
         messages=[message],
         stream=False,
     )
-    message_content = response.completion_message.content.lower().strip()
+    message_content = response.choices[0].message.content.lower().strip()
     assert len(message_content) > 0
     assert any(expected in message_content for expected in {"dog", "puppy", "pup"})
@ -68,8 +71,13 @@ def multi_image_data():
     return encoded_files


+@pytest.fixture
+def multi_image_url(multi_image_data):
+    return [f"data:image/jpeg;base64,{data}" for data in multi_image_data]
+
+
 @pytest.mark.parametrize("stream", [True, False])
-def test_image_chat_completion_multiple_images(client_with_models, vision_model_id, multi_image_data, stream):
+def test_image_chat_completion_multiple_images(client_with_models, vision_model_id, multi_image_url, stream):
     supported_models = ["llama-4", "gpt-4o", "llama4"]
     if not any(model in vision_model_id.lower() for model in supported_models):
         pytest.skip(
@ -81,15 +89,15 @@ def test_image_chat_completion_multiple_images(client_with_models, vision_model_
         "role": "user",
         "content": [
             {
-                "type": "image",
-                "image": {
-                    "data": multi_image_data[0],
+                "type": "image_url",
+                "image_url": {
+                    "url": multi_image_url[0],
                 },
             },
             {
-                "type": "image",
-                "image": {
-                    "data": multi_image_data[1],
+                "type": "image_url",
+                "image_url": {
+                    "url": multi_image_url[1],
                 },
             },
             {
@ -99,17 +107,17 @@ def test_image_chat_completion_multiple_images(client_with_models, vision_model_
             ],
         },
     ]
-    response = client_with_models.inference.chat_completion(
-        model_id=vision_model_id,
+    response = client_with_models.chat.completions.create(
+        model=vision_model_id,
         messages=messages,
         stream=stream,
     )
     if stream:
         message_content = ""
         for chunk in response:
-            message_content += chunk.event.delta.text
+            message_content += chunk.choices[0].delta.content
     else:
-        message_content = response.completion_message.content
+        message_content = response.choices[0].message.content
     assert len(message_content) > 0
     assert any(expected in message_content.lower().strip() for expected in {"bedroom"}), message_content
@ -125,17 +133,17 @@ def test_image_chat_completion_multiple_images(client_with_models, vision_model_
             "role": "user",
             "content": [
                 {
-                    "type": "image",
-                    "image": {
-                        "data": multi_image_data[2],
+                    "type": "image_url",
+                    "image_url": {
+                        "url": multi_image_url[2],
                     },
                 },
                 {"type": "text", "text": "How about this one?"},
             ],
         },
     )
-    response = client_with_models.inference.chat_completion(
-        model_id=vision_model_id,
+    response = client_with_models.chat.completions.create(
+        model=vision_model_id,
         messages=messages,
         stream=stream,
     )
@ -144,7 +152,7 @@ def test_image_chat_completion_multiple_images(client_with_models, vision_model_
         for chunk in response:
             message_content += chunk.event.delta.text
     else:
-        message_content = response.completion_message.content
+        message_content = response.choices[0].message.content
     assert len(message_content) > 0
     assert any(expected in message_content.lower().strip() for expected in {"sword", "shield"}), message_content
@ -154,11 +162,9 @@ def test_image_chat_completion_streaming(client_with_models, vision_model_id):
         "role": "user",
         "content": [
             {
-                "type": "image",
-                "image": {
-                    "url": {
-                        "uri": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/integration/inference/dog.png"
-                    },
+                "type": "image_url",
+                "image_url": {
+                    "url": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/integration/inference/dog.png"
                 },
             },
             {
@ -167,23 +173,23 @@ def test_image_chat_completion_streaming(client_with_models, vision_model_id):
             },
         ],
     }
-    response = client_with_models.inference.chat_completion(
-        model_id=vision_model_id,
+    response = client_with_models.chat.completions.create(
+        model=vision_model_id,
         messages=[message],
         stream=True,
     )
     streamed_content = ""
     for chunk in response:
-        streamed_content += chunk.event.delta.text.lower()
+        streamed_content += chunk.choices[0].delta.content.lower()
     assert len(streamed_content) > 0
     assert any(expected in streamed_content for expected in {"dog", "puppy", "pup"})


-def test_image_chat_completion_base64(client_with_models, vision_model_id, base64_image_data):
+def test_image_chat_completion_base64(client_with_models, vision_model_id, base64_image_url):
     image_spec = {
-        "type": "image",
-        "image": {
-            "data": base64_image_data,
+        "type": "image_url",
+        "image_url": {
+            "url": base64_image_url,
         },
     }
@ -197,10 +203,10 @@ def test_image_chat_completion_base64(client_with_models, vision_model_id, base6
             },
         ],
     }
-    response = client_with_models.inference.chat_completion(
-        model_id=vision_model_id,
+    response = client_with_models.chat.completions.create(
+        model=vision_model_id,
         messages=[message],
         stream=False,
     )
-    message_content = response.completion_message.content.lower().strip()
+    message_content = response.choices[0].message.content.lower().strip()
     assert len(message_content) > 0

View file

@ -14,6 +14,13 @@ from . import skip_in_github_actions
 # LLAMA_STACK_CONFIG="nvidia" pytest -v tests/integration/providers/nvidia/test_datastore.py


+@pytest.fixture(autouse=True)
+def skip_if_no_nvidia_provider(llama_stack_client):
+    provider_types = {p.provider_type for p in llama_stack_client.providers.list() if p.api == "datasetio"}
+    if "remote::nvidia" not in provider_types:
+        pytest.skip("datasetio=remote::nvidia provider not configured, skipping")
+
+
 # nvidia provider only
 @skip_in_github_actions
 @pytest.mark.parametrize(

View file

@ -0,0 +1,167 @@
{
"request": {
"method": "POST",
"url": "http://localhost:11434/api/generate",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"raw": true,
"prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant. You have access to functions, but you should only use them if they are required.\nYou are an expert in composing functions. You are given a question and a set of possible functions.\nBased on the question, you may or may not need to make one function/tool call to achieve the purpose.\n\nIf you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]\nIf you decide to invoke a function, you SHOULD NOT include any other text in the response. besides the function call in the above format.\nFor a boolean parameter, be sure to use `True` or `False` (capitalized) for the value.\n\n\nHere is a list of functions in JSON format that you can invoke.\n\n[\n {\n \"name\": \"greet_everyone\",\n \"description\": \"\",\n \"parameters\": {\n \"type\": \"dict\",\n \"required\": [\"url\"],\n \"properties\": {\n \"url\": {\n \"type\": \"string\",\n \"description\": \"\"\n }\n }\n }\n },\n {\n \"name\": \"get_boiling_point\",\n \"description\": \"\n Returns the boiling point of a liquid in Celsius or Fahrenheit.\n\n :param liquid_name: The name of the liquid\n :param celsius: Whether to return the boiling point in Celsius\n :return: The boiling point of the liquid in Celcius or Fahrenheit\n \",\n \"parameters\": {\n \"type\": \"dict\",\n \"required\": [\"liquid_name\"],\n \"properties\": {\n \"liquid_name\": {\n \"type\": \"string\",\n \"description\": \"\"\n },\n \"celsius\": {\n \"type\": \"boolean\",\n \"description\": \"\",\n \"default\": \"True\"\n }\n }\n }\n }\n]\n\nYou can answer general questions or invoke tools when necessary.\nIn addition to tool calls, you should also augment your responses by using the tool outputs.\nYou are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSay hi to the world. Use tools to do so.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n[greet_everyone(url=\"world\")]<|eot_id|><|start_header_id|>ipython<|end_header_id|>\n\nHello, world!<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
"options": {
"temperature": 0.0
},
"stream": true
},
"endpoint": "/api/generate",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": [
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.663224Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "How",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.706706Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": " can",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.751075Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": " I",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.794187Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": " assist",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.837831Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": " you",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.879926Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": " further",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.92182Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "?",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.963339Z",
"done": true,
"done_reason": "stop",
"total_duration": 492973041,
"load_duration": 103979375,
"prompt_eval_count": 482,
"prompt_eval_duration": 87032041,
"eval_count": 8,
"eval_duration": 300586375,
"response": "",
"thinking": null,
"context": null
}
}
],
"is_streaming": true
}
}
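
These recording fixtures all share one envelope: "request" captures the exact provider call, and "response.body" holds either a single typed object or, when "is_streaming" is true, a list of typed chunks, each wrapping the provider's response type under "__type__" with its fields under "__data__". A minimal sketch of replaying one of the Ollama /api/generate recordings above (the file path is hypothetical):

import json


def replay_generate_recording(path):
    """Return the recorded prompt and the reassembled generated text."""
    with open(path) as f:
        rec = json.load(f)
    prompt = rec["request"]["body"]["prompt"]
    body = rec["response"]["body"]
    chunks = body if rec["response"]["is_streaming"] else [body]
    # For ollama._types.GenerateResponse the generated text lives in "response".
    text = "".join(chunk["__data__"]["response"] for chunk in chunks)
    return prompt, text


prompt, text = replay_generate_recording("recordings/ollama_generate.json")
print(text)  # -> "How can I assist you further?"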

View file

@ -0,0 +1,31 @@
{
"request": {
"method": "POST",
"url": "https://api.fireworks.ai/inference/v1/v1/embeddings",
"headers": {},
"body": {
"model": "accounts/fireworks/models/qwen3-embedding-8b",
"input": [],
"encoding_format": "float"
},
"endpoint": "/v1/embeddings",
"model": "accounts/fireworks/models/qwen3-embedding-8b"
},
"response": {
"body": {
"__type__": "openai.types.create_embedding_response.CreateEmbeddingResponse",
"__data__": {
"data": [],
"model": "accounts/fireworks/models/qwen3-embedding-8b",
"object": "list",
"usage": {
"prompt_tokens": 0,
"total_tokens": 0,
"completion_tokens": 0
},
"perf_metrics": null
}
},
"is_streaming": false
}
}

File diff suppressed because it is too large

View file

@ -0,0 +1,89 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant. Michael Jordan was born in 1963. He played basketball for the Chicago Bulls for 15 seasons."
},
{
"role": "user",
"content": "Please give me information about Michael Jordan."
}
],
"response_format": {
"type": "json_schema",
"json_schema": {
"name": "AnswerFormat",
"schema": {
"properties": {
"first_name": {
"title": "First Name",
"type": "string"
},
"last_name": {
"title": "Last Name",
"type": "string"
},
"year_of_birth": {
"title": "Year Of Birth",
"type": "integer"
}
},
"required": [
"first_name",
"last_name",
"year_of_birth"
],
"title": "AnswerFormat",
"type": "object"
}
}
},
"stream": false
},
"endpoint": "/v1/chat/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "chatcmpl-433",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "{\"first_name\": \"Michael\", \"last_name\": \"Jordan\", \"year_of_birth\": 1963}\n\n \t\t\t\t\t\t\t\t\t\t\t \t\t ",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 1758979490,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 31,
"prompt_tokens": 60,
"total_tokens": 91,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

View file

@ -0,0 +1,31 @@
{
"request": {
"method": "POST",
"url": "https://api.fireworks.ai/inference/v1/v1/embeddings",
"headers": {},
"body": {
"model": "accounts/fireworks/models/qwen3-embedding-8b",
"input": [],
"encoding_format": "base64"
},
"endpoint": "/v1/embeddings",
"model": "accounts/fireworks/models/qwen3-embedding-8b"
},
"response": {
"body": {
"__type__": "openai.types.create_embedding_response.CreateEmbeddingResponse",
"__data__": {
"data": [],
"model": "accounts/fireworks/models/qwen3-embedding-8b",
"object": "list",
"usage": {
"prompt_tokens": 0,
"total_tokens": 0,
"completion_tokens": 0
},
"perf_metrics": null
}
},
"is_streaming": false
}
}

View file

@ -0,0 +1,316 @@
{
"request": {
"method": "POST",
"url": "https://api.fireworks.ai/inference/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"messages": [
{
"role": "user",
"content": "What's the name of the Sun in latin?"
}
],
"stream": true
},
"endpoint": "/v1/chat/completions",
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct"
},
"response": {
"body": [
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "3745da23-2db2-45a1-8ea5-2a09bbdb6a33",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920389,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "3745da23-2db2-45a1-8ea5-2a09bbdb6a33",
"choices": [
{
"delta": {
"content": "The Latin",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920389,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "3745da23-2db2-45a1-8ea5-2a09bbdb6a33",
"choices": [
{
"delta": {
"content": " name",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920389,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "3745da23-2db2-45a1-8ea5-2a09bbdb6a33",
"choices": [
{
"delta": {
"content": " for",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920389,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "3745da23-2db2-45a1-8ea5-2a09bbdb6a33",
"choices": [
{
"delta": {
"content": " the",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920389,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "3745da23-2db2-45a1-8ea5-2a09bbdb6a33",
"choices": [
{
"delta": {
"content": " Sun",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920389,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "3745da23-2db2-45a1-8ea5-2a09bbdb6a33",
"choices": [
{
"delta": {
"content": " is",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920389,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "3745da23-2db2-45a1-8ea5-2a09bbdb6a33",
"choices": [
{
"delta": {
"content": " \"",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920389,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "3745da23-2db2-45a1-8ea5-2a09bbdb6a33",
"choices": [
{
"delta": {
"content": "Sol",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920389,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "3745da23-2db2-45a1-8ea5-2a09bbdb6a33",
"choices": [
{
"delta": {
"content": "\".",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920389,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "3745da23-2db2-45a1-8ea5-2a09bbdb6a33",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": "stop",
"index": 0,
"logprobs": null
}
],
"created": 1758920389,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 11,
"prompt_tokens": 20,
"total_tokens": 31,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
}
],
"is_streaming": true
}
}
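
The streaming chat-completion recordings use the same envelope but store OpenAI ChatCompletionChunk objects, so reassembling the assistant message means concatenating the choice deltas. A short sketch against the recording above (hypothetical file path):

import json

with open("recordings/fireworks_chat_stream.json") as f:
    rec = json.load(f)

parts = []
for chunk in rec["response"]["body"]:
    delta = chunk["__data__"]["choices"][0]["delta"]
    # The first chunk carries only the role and the last only finish_reason,
    # so skip chunks with no text content.
    if delta["content"]:
        parts.append(delta["content"])

print("".join(parts))  # -> The Latin name for the Sun is "Sol".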

View file

@ -0,0 +1,44 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"prompt": "Hello, world!",
"logprobs": false,
"stream": false,
"extra_body": {}
},
"endpoint": "/v1/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": {
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "cmpl-74",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"text": "Hello! How can I assist you today?"
}
],
"created": 1758975636,
"model": "llama3.2:3b-instruct-fp16",
"object": "text_completion",
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 10,
"prompt_tokens": 29,
"total_tokens": 39,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,92 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"messages": [
{
"role": "system",
"content": "Pretend you are a weather assistant."
},
{
"role": "user",
"content": "What's the weather like in San Francisco, CA?"
}
],
"stream": false,
"tool_choice": "auto",
"tools": [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state (both required), e.g. San Francisco, CA."
}
},
"required": [
"location"
]
}
}
}
]
},
"endpoint": "/v1/chat/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "chatcmpl-761",
"choices": [
{
"finish_reason": "tool_calls",
"index": 0,
"logprobs": null,
"message": {
"content": "",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": [
{
"id": "call_cj8ownwc",
"function": {
"arguments": "{\"location\":\"San Francisco, CA\"}",
"name": "get_weather"
},
"type": "function",
"index": 0
}
]
}
}
],
"created": 1758975113,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 18,
"prompt_tokens": 185,
"total_tokens": 203,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}
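
When a recording captures a tool call, the OpenAI chat-completion schema delivers the function arguments as a JSON-encoded string, so a consumer needs one more json.loads before dispatching. A sketch against the recording above (hypothetical file path):

import json

with open("recordings/ollama_tool_call.json") as f:
    rec = json.load(f)

message = rec["response"]["body"]["__data__"]["choices"][0]["message"]
call = message["tool_calls"][0]
name = call["function"]["name"]                   # "get_weather"
args = json.loads(call["function"]["arguments"])  # {"location": "San Francisco, CA"}
print(name, args)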

View file

@ -0,0 +1,44 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"prompt": "Hello, world!",
"logprobs": true,
"stream": false,
"extra_body": {}
},
"endpoint": "/v1/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": {
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "cmpl-809",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"text": "Hello! It's nice to meet you. Is there anything I can help you with or would you like to chat?"
}
],
"created": 1758975633,
"model": "llama3.2:3b-instruct-fp16",
"object": "text_completion",
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 25,
"prompt_tokens": 29,
"total_tokens": 54,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

View file

@ -0,0 +1,550 @@
{
"request": {
"method": "POST",
"url": "https://api.fireworks.ai/inference/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"messages": [
{
"role": "user",
"content": "What is the name of the US captial?"
}
],
"stream": true
},
"endpoint": "/v1/chat/completions",
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct"
},
"response": {
"body": [
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": "The name",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": " of",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": " the",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": " US",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": " capital",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": " is",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": " Washington",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": ",",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": " D",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": ".C",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": ".",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": " (",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": "short",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": " for",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": " District",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": " of",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": " Columbia",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": ").",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "f72b5be3-a677-4c38-b6ae-8c7e5cc4bf29",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": "stop",
"index": 0,
"logprobs": null
}
],
"created": 1758920398,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 20,
"prompt_tokens": 20,
"total_tokens": 40,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
}
],
"is_streaming": true
}
}

View file

@ -0,0 +1,60 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"messages": [
{
"role": "system",
"content": "Pretend you are a weather assistant."
},
{
"role": "user",
"content": "What's the weather like in San Francisco, CA?"
}
],
"stream": false
},
"endpoint": "/v1/chat/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "chatcmpl-123",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "Hello! As of my knowledge cutoff on December 15th, I have the latest information for you. However, please note that my data may not be entirely up-to-date.\n\nCurrently, and based on historical climate patterns, it appears to be a partly cloudy day with mild temperatures in San Francisco, CA. Expect a temperature range of around 48\u00b0F (9\u00b0C) to 54\u00b0F (12\u00b0C). It's likely to be a breezy day, with winds blowing at about 13 mph (21 km/h).\n\nHowever, if I were to look into more recent weather patterns or forecasts, I would recommend checking the latest conditions directly from reliable sources such as the National Weather Service or local news outlets for more accurate and up-to-date information.\n\nPlease let me know how I can further assist you.",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 1758978071,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 163,
"prompt_tokens": 45,
"total_tokens": 208,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

File diff suppressed because it is too large

View file

@ -0,0 +1,56 @@
{
"request": {
"method": "POST",
"url": "https://api.fireworks.ai/inference/v1/v1/embeddings",
"headers": {},
"body": {
"model": "accounts/fireworks/models/qwen3-embedding-8b",
"input": "Test dimensions parameter",
"encoding_format": "float",
"dimensions": 16
},
"endpoint": "/v1/embeddings",
"model": "accounts/fireworks/models/qwen3-embedding-8b"
},
"response": {
"body": {
"__type__": "openai.types.create_embedding_response.CreateEmbeddingResponse",
"__data__": {
"data": [
{
"embedding": [
-0.9296875,
5.1875,
-2.140625,
0.171875,
-2.25,
-0.8359375,
-0.828125,
1.15625,
2.328125,
-1.0078125,
-3.0,
4.09375,
0.8359375,
0.1015625,
2.015625,
-1.0859375
],
"index": 0,
"object": "embedding",
"raw_output": null
}
],
"model": "accounts/fireworks/models/qwen3-embedding-8b",
"object": "list",
"usage": {
"prompt_tokens": 5,
"total_tokens": 5,
"completion_tokens": 0
},
"perf_metrics": null
}
},
"is_streaming": false
}
}

View file

@ -0,0 +1,39 @@
{
"request": {
"method": "POST",
"url": "https://api.fireworks.ai/inference/v1/v1/embeddings",
"headers": {},
"body": {
"model": "accounts/fireworks/models/qwen3-embedding-8b",
"input": "Test dimensions parameter",
"encoding_format": "base64",
"dimensions": 16
},
"endpoint": "/v1/embeddings",
"model": "accounts/fireworks/models/qwen3-embedding-8b"
},
"response": {
"body": {
"__type__": "openai.types.create_embedding_response.CreateEmbeddingResponse",
"__data__": {
"data": [
{
"embedding": "AABuvwAApkAAAAnAAAAwPgAAEMAAAFa/AABUvwAAlD8AABVAAACBvwAAQMAAAINAAABWPwAA0D0AAAFAAACLvw==",
"index": 0,
"object": "embedding",
"raw_output": null
}
],
"model": "accounts/fireworks/models/qwen3-embedding-8b",
"object": "list",
"usage": {
"prompt_tokens": 5,
"total_tokens": 5,
"completion_tokens": 0
},
"perf_metrics": null
}
},
"is_streaming": false
}
}
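
With encoding_format="base64", OpenAI-compatible embedding servers pack the vector as little-endian float32 bytes, so the string above decodes to the same 16 values as the float-format recording. A small sketch (standard library only):

import base64
import struct

b64 = "AABuvwAApkAAAAnAAAAwPgAAEMAAAFa/AABUvwAAlD8AABVAAACBvwAAQMAAAINAAABWPwAA0D0AAAFAAACLvw=="
raw = base64.b64decode(b64)
vec = struct.unpack(f"<{len(raw) // 4}f", raw)
print(len(vec), vec[:3])  # -> 16 (-0.9296875, 5.1875, -2.140625)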

View file

@ -0,0 +1,56 @@
{
"request": {
"method": "POST",
"url": "https://api.fireworks.ai/inference/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"messages": [
{
"role": "user",
"content": "Hello, world!"
}
],
"stream": false
},
"endpoint": "/v1/chat/completions",
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "1d64ff81-b7c4-40c6-9509-cca71759da3e",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "Hello! It's nice to meet you. Is there something I can help you with, or would you like to chat?",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 1758920401,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 26,
"prompt_tokens": 14,
"total_tokens": 40,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

View file

@ -0,0 +1,347 @@
{
"request": {
"method": "POST",
"url": "http://localhost:11434/api/generate",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"raw": true,
"prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant. You have access to functions, but you should only use them if they are required.\nYou are an expert in composing functions. You are given a question and a set of possible functions.\nBased on the question, you may or may not need to make one function/tool call to achieve the purpose.\n\nIf you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]\nIf you decide to invoke a function, you SHOULD NOT include any other text in the response. besides the function call in the above format.\nFor a boolean parameter, be sure to use `True` or `False` (capitalized) for the value.\n\n\nHere is a list of functions in JSON format that you can invoke.\n\n[\n {\n \"name\": \"greet_everyone\",\n \"description\": \"\",\n \"parameters\": {\n \"type\": \"dict\",\n \"required\": [\"url\"],\n \"properties\": {\n \"url\": {\n \"type\": \"string\",\n \"description\": \"\"\n }\n }\n }\n },\n {\n \"name\": \"get_boiling_point\",\n \"description\": \"\n Returns the boiling point of a liquid in Celsius or Fahrenheit.\n\n :param liquid_name: The name of the liquid\n :param celsius: Whether to return the boiling point in Celsius\n :return: The boiling point of the liquid in Celcius or Fahrenheit\n \",\n \"parameters\": {\n \"type\": \"dict\",\n \"required\": [\"liquid_name\"],\n \"properties\": {\n \"liquid_name\": {\n \"type\": \"string\",\n \"description\": \"\"\n },\n \"celsius\": {\n \"type\": \"boolean\",\n \"description\": \"\",\n \"default\": \"True\"\n }\n }\n }\n }\n]\n\nYou can answer general questions or invoke tools when necessary.\nIn addition to tool calls, you should also augment your responses by using the tool outputs.\nYou are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSay hi to the world. Use tools to do so.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n[greet_everyone(url=\"world\")]<|eot_id|><|start_header_id|>ipython<|end_header_id|>\n\nHello, world!<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHow can I assist you further?<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the boiling point of polyjuice? Use tools to answer.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
"options": {
"temperature": 0.0
},
"stream": true
},
"endpoint": "/api/generate",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": [
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.177453Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "[",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.220271Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "get",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.261232Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "_bo",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.302818Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "iling",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.344343Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "_point",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.386025Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "(",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.42778Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "liquid",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.469673Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "_name",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.512543Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "='",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.554479Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "poly",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.597092Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "ju",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.639581Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "ice",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.683223Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "',",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.72556Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": " c",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.768012Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "elsius",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.8098Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "=True",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.851578Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": ")]",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:57.893693Z",
"done": true,
"done_reason": "stop",
"total_duration": 885274541,
"load_duration": 99578333,
"prompt_eval_count": 514,
"prompt_eval_duration": 67915875,
"eval_count": 18,
"eval_duration": 717086791,
"response": "",
"thinking": null,
"context": null
}
}
],
"is_streaming": true
}
}
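
These recordings serialize each provider response as a `__type__`/`__data__` pair so the replay layer can reconstruct the original typed object (here `ollama._types.GenerateResponse`). A minimal decoding sketch, assuming pydantic-v2-style response models; the helper name `rebuild` is illustrative, not the stack's actual API:

```python
import importlib
from typing import Any


def rebuild(obj: Any) -> Any:
    """Hypothetical decoder for the __type__/__data__ convention in these fixtures."""
    if isinstance(obj, dict) and "__type__" in obj and "__data__" in obj:
        module_name, _, class_name = obj["__type__"].rpartition(".")
        cls = getattr(importlib.import_module(module_name), class_name)
        # Both ollama._types and openai.types payloads are pydantic models,
        # so the stored dict can be validated back into a typed instance.
        return cls.model_validate(obj["__data__"])
    if isinstance(obj, list):
        return [rebuild(item) for item in obj]
    return obj
```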


@@ -0,0 +1,74 @@
{
"request": {
"method": "POST",
"url": "https://api.fireworks.ai/inference/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"messages": [
{
"role": "user",
"content": "What's the weather in Tokyo? Use the get_weather function to get the weather."
}
],
"stream": false,
"tools": [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the weather in a given city",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to get the weather for"
}
}
}
}
}
]
},
"endpoint": "/v1/chat/completions",
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "0fe94e7d-f25b-4843-ba0a-e402e0764830",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "I can\u2019t help with that. If you're looking for current weather information, I recommend checking a weather website or app, such as AccuWeather or Weather.com. Is there anything else I can help you with?",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 1758920402,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 45,
"prompt_tokens": 27,
"total_tokens": 72,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

File diff suppressed because it is too large


@@ -0,0 +1,55 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"messages": [
{
"role": "user",
"content": "Test trace 0"
}
]
},
"endpoint": "/v1/chat/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "chatcmpl-272",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "I'm happy to help you with a test. Since we are in the middle of a text-based conversation, I'll do my best to simulate a simple test tracing process.\n\n**Trace Test Results**\n\nTo perform this test, please follow these steps:\n\n1. Type \"test\" on command mode.\n2. Press Enter.\n\nNow, let's start tracing...\n\nTest Tracing Results:\nTest Case: General Functions\nTest Case Result: PASS\n\nSystem Response:\n\n```\n# System Boot Time: 2023-10-13T14:30:00\n# CPU Temperature: 35\u00b0C\n# Disk Space Available: 80%\n```\n\nNext Steps?\n\nType 'done' to exit the test, or 'run' for more tests.",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 1758978134,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 152,
"prompt_tokens": 29,
"total_tokens": 181,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}


@@ -0,0 +1,44 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"prompt": "Return the exact same sentence and don't add additional words): Michael Jordan was born in the year of 1963",
"stop": "1963",
"stream": false,
"extra_body": {}
},
"endpoint": "/v1/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": {
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "cmpl-183",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"text": "Michael Jordan was born in the year of "
}
],
"created": 1758978053,
"model": "llama3.2:3b-instruct-fp16",
"object": "text_completion",
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 11,
"prompt_tokens": 48,
"total_tokens": 59,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

File diff suppressed because it is too large


@@ -0,0 +1,112 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"messages": [
{
"role": "system",
"content": "Pretend you are a weather assistant."
},
{
"role": "user",
"content": "What's the weather like in San Francisco, CA?"
}
],
"stream": true,
"tool_choice": "auto",
"tools": [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state (both required), e.g. San Francisco, CA."
}
},
"required": [
"location"
]
}
}
}
]
},
"endpoint": "/v1/chat/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": [
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-634",
"choices": [
{
"delta": {
"content": "",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": [
{
"index": 0,
"id": "call_wubm4yax",
"function": {
"arguments": "{\"location\":\"San Francisco, CA\"}",
"name": "get_weather"
},
"type": "function"
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758975115,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-634",
"choices": [
{
"delta": {
"content": "",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": "tool_calls",
"index": 0,
"logprobs": null
}
],
"created": 1758975115,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
}
],
"is_streaming": true
}
}
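
In the streamed tool-call recording above, the entire call arrives in the first chunk's delta, and the second chunk only carries `finish_reason: "tool_calls"`. A consumer would pull the name and JSON-encoded arguments out of the delta roughly as follows; this is a sketch against the reconstructed chunk objects, not a fixed API of the test harness:

```python
import json


def first_tool_call(chunks):
    """Return (name, arguments) from the first streamed tool-call delta."""
    for chunk in chunks:
        tool_calls = chunk.choices[0].delta.tool_calls
        if tool_calls:
            call = tool_calls[0]
            return call.function.name, json.loads(call.function.arguments)
    return None


# For the recording above: ("get_weather", {"location": "San Francisco, CA"})
```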


@@ -0,0 +1,47 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"prompt": "Return the exact same sentence and don't add additional words): Michael Jordan was born in the year of 1963",
"stop": [
"blathering",
"1963"
],
"stream": false,
"extra_body": {}
},
"endpoint": "/v1/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": {
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "cmpl-381",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"text": "Michael Jordan was born in the year of "
}
],
"created": 1758978056,
"model": "llama3.2:3b-instruct-fp16",
"object": "text_completion",
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 11,
"prompt_tokens": 48,
"total_tokens": 59,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}
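
The two `/v1/completions` recordings above exercise the `stop` parameter as a bare string and as a list; in both cases generation halts before the stop sequence is emitted, so the text ends at "...the year of " and `finish_reason` is `stop`. Reproducing the list form against the same local endpoint might look like this (the client wiring and placeholder API key are assumptions, not part of the recording):

```python
from openai import OpenAI

# The fixtures point at a local Ollama server exposing the OpenAI-compatible API.
client = OpenAI(base_url="http://0.0.0.0:11434/v1", api_key="not-needed")

completion = client.completions.create(
    model="llama3.2:3b-instruct-fp16",
    prompt="Return the exact same sentence and don't add additional words): "
    "Michael Jordan was born in the year of 1963",
    stop=["blathering", "1963"],  # generation stops before emitting either sequence
)
print(completion.choices[0].text)  # "Michael Jordan was born in the year of "
```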

File diff suppressed because it is too large


@@ -0,0 +1,55 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"messages": [
{
"role": "user",
"content": "Test trace 1"
}
]
},
"endpoint": "/v1/chat/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "chatcmpl-122",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "It appears you're trying to initiate a conversation or test the functionality of this AI system. I'm happy to chat with you!\n\nWould you like to:\nA) Ask me a question on a specific topic\nB) Engage in a conversational dialogue on a topic of your choice\nC) Play a text-based game\nD) Test my language understanding capabilities\n\nPlease respond with the letter of your preferred activity.",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 1758978142,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 85,
"prompt_tokens": 29,
"total_tokens": 114,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}


@@ -13,22 +13,23 @@
       "__data__": {
         "models": [
           {
-            "model": "llama3.2-vision:11b",
-            "name": "llama3.2-vision:11b",
-            "digest": "6f2f9757ae97e8a3f8ea33d6adb2b11d93d9a35bef277cd2c0b1b5af8e8d0b1e",
-            "expires_at": "2025-09-03T11:51:35.966409-07:00",
-            "size": 12401209008,
-            "size_vram": 12401209008,
+            "model": "llama3.2:3b",
+            "name": "llama3.2:3b",
+            "digest": "a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72",
+            "expires_at": "2025-09-27T11:54:56.718552-07:00",
+            "size": 3367856128,
+            "size_vram": 3367856128,
             "details": {
               "parent_model": "",
               "format": "gguf",
-              "family": "mllama",
+              "family": "llama",
               "families": [
-                "mllama"
+                "llama"
               ],
-              "parameter_size": "10.7B",
+              "parameter_size": "3.2B",
               "quantization_level": "Q4_K_M"
-            }
+            },
+            "context_length": 4096
           }
         ]
       }

File diff suppressed because it is too large


@@ -0,0 +1,43 @@
{
"request": {
"method": "POST",
"url": "https://api.fireworks.ai/inference/v1/v1/completions",
"headers": {},
"body": {
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"prompt": "Respond to this question and explain your answer. Complete the sentence using one word: Roses are red, violets are ",
"stream": false,
"extra_body": {}
},
"endpoint": "/v1/completions",
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct"
},
"response": {
"body": {
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "1bbb8db5-63e5-40cd-8ffe-59e0e88bf8f0",
"choices": [
{
"finish_reason": "length",
"index": 0,
"logprobs": null,
"text": "4. At the beginning of the year, a woman has $5,000"
}
],
"created": 1758920353,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": {
"completion_tokens": 16,
"prompt_tokens": 25,
"total_tokens": 41,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large


@@ -0,0 +1,56 @@
{
"request": {
"method": "POST",
"url": "https://api.fireworks.ai/inference/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"messages": [
{
"role": "user",
"content": "Which planet has rings around it with a name starting with letter S?"
}
],
"stream": false
},
"endpoint": "/v1/chat/completions",
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "26632ea9-3481-419d-bc0d-83c177257bc4",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "There are two planets in our solar system with ring systems that have names starting with the letter S:\n\n1. **Saturn** - Its ring system is one of the most iconic and well-known in our solar system. The rings are made up of ice and rock particles that range in size from tiny dust grains to massive boulders.\n2. **Saturn's moon** - The ring system of **Saturn's moon, Rhea**, is sometimes referred to as a \"ring system\" even though it's much smaller and less prominent than Saturn's. However, it's worth noting that Rhea's ring system is not as well-known as Saturn's.\n\nIf you're looking for a planet with a ring system that starts with the letter S and is not a moon, then the answer is Saturn!",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 1758920397,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 164,
"prompt_tokens": 24,
"total_tokens": 188,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}


@@ -0,0 +1,185 @@
{
"request": {
"method": "POST",
"url": "http://localhost:11434/api/generate",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"raw": true,
"prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant. You have access to functions, but you should only use them if they are required.\nYou are an expert in composing functions. You are given a question and a set of possible functions.\nBased on the question, you may or may not need to make one function/tool call to achieve the purpose.\n\nIf you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]\nIf you decide to invoke a function, you SHOULD NOT include any other text in the response. besides the function call in the above format.\nFor a boolean parameter, be sure to use `True` or `False` (capitalized) for the value.\n\n\nHere is a list of functions in JSON format that you can invoke.\n\n[\n {\n \"name\": \"greet_everyone\",\n \"description\": \"\",\n \"parameters\": {\n \"type\": \"dict\",\n \"required\": [\"url\"],\n \"properties\": {\n \"url\": {\n \"type\": \"string\",\n \"description\": \"\"\n }\n }\n }\n },\n {\n \"name\": \"get_boiling_point\",\n \"description\": \"\n Returns the boiling point of a liquid in Celsius or Fahrenheit.\n\n :param liquid_name: The name of the liquid\n :param celsius: Whether to return the boiling point in Celsius\n :return: The boiling point of the liquid in Celcius or Fahrenheit\n \",\n \"parameters\": {\n \"type\": \"dict\",\n \"required\": [\"liquid_name\"],\n \"properties\": {\n \"liquid_name\": {\n \"type\": \"string\",\n \"description\": \"\"\n },\n \"celsius\": {\n \"type\": \"boolean\",\n \"description\": \"\",\n \"default\": \"True\"\n }\n }\n }\n }\n]\n\nYou can answer general questions or invoke tools when necessary.\nIn addition to tool calls, you should also augment your responses by using the tool outputs.\nYou are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSay hi to the world. Use tools to do so.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
"options": {
"temperature": 0.0
},
"stream": true
},
"endpoint": "/api/generate",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": [
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.034121Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "[g",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.07569Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "reet",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.116927Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "_every",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.159755Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "one",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.201675Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "(url",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.243056Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "=\"",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.284651Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "world",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.326276Z",
"done": false,
"done_reason": null,
"total_duration": null,
"load_duration": null,
"prompt_eval_count": null,
"prompt_eval_duration": null,
"eval_count": null,
"eval_duration": null,
"response": "\")]",
"thinking": null,
"context": null
}
},
{
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-09-27T18:05:56.367959Z",
"done": true,
"done_reason": "stop",
"total_duration": 5381441291,
"load_duration": 4112439791,
"prompt_eval_count": 459,
"prompt_eval_duration": 932587833,
"eval_count": 9,
"eval_duration": 334328250,
"response": "",
"thinking": null,
"context": null
}
}
],
"is_streaming": true
}
}


@@ -0,0 +1,706 @@
{
"request": {
"method": "POST",
"url": "https://api.fireworks.ai/inference/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"messages": [
{
"role": "user",
"content": "Hello, world!"
}
],
"stream": true
},
"endpoint": "/v1/chat/completions",
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct"
},
"response": {
"body": [
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": "Hello!",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " It",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": "'s",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " nice",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " to",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " meet",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " you",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": ".",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " Is",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " there",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " something",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " I",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " can",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " help",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " you",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " with",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": ",",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " or",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " would",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " you",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " like",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " to",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": " chat",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": "?",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "d583f66e-de11-4210-8153-54be000a2783",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": "stop",
"index": 0,
"logprobs": null
}
],
"created": 1758920391,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 26,
"prompt_tokens": 14,
"total_tokens": 40,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
}
],
"is_streaming": true
}
}
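
A streamed chat recording stores every chunk in order; the final chunk carries `finish_reason: "stop"` plus usage, with `content` set to null. Reassembling the assistant message is just a concatenation of the non-empty deltas, as in this sketch:

```python
def join_stream(chunks) -> str:
    """Concatenate streamed delta content into the full assistant message."""
    parts = []
    for chunk in chunks:
        if chunk.choices and chunk.choices[0].delta.content:
            parts.append(chunk.choices[0].delta.content)
    return "".join(parts)


# For the recording above this yields:
# "Hello! It's nice to meet you. Is there something I can help you with, or would you like to chat?"
```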


@@ -0,0 +1,996 @@
{
"request": {
"method": "POST",
"url": "https://api.fireworks.ai/inference/v1/v1/completions",
"headers": {},
"body": {
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"prompt": "Respond to this question and explain your answer. Complete the sentence using one word: Roses are red, violets are ",
"max_tokens": 50,
"stream": true,
"extra_body": {}
},
"endpoint": "/v1/completions",
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct"
},
"response": {
"body": [
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " a"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " type"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " of"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " __________________"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": "_____"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": ".\n\n"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": "##"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " Step"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " "
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": "1"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": ":"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " Identify"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " the"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " type"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " of"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " flower"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " mentioned"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " in"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " the"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " sentence"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": ".\n"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": "The"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " sentence"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " mentions"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " \""
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": "vio"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": "lets"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": ".\"\n\n"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": "##"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " Step"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " "
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": "2"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": ":"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " Determine"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " the"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " type"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " of"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " flower"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " v"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": "io"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": "lets"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " are"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": ".\n"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": "V"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": "io"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": "lets"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " are"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " a"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " type"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": null,
"index": 0,
"logprobs": null,
"text": " of"
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "c9c1f727-afe7-430a-b759-df1dc392266c",
"choices": [
{
"finish_reason": "length",
"index": 0,
"logprobs": null,
"text": ""
}
],
"created": 1758920354,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "text_completion",
"system_fingerprint": null,
"usage": {
"completion_tokens": 50,
"prompt_tokens": 25,
"total_tokens": 75,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
}
],
"is_streaming": true
}
}

File diff suppressed because it is too large


@@ -0,0 +1,56 @@
{
"request": {
"method": "POST",
"url": "https://api.fireworks.ai/inference/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"messages": [
{
"role": "user",
"content": "Which planet do humans live on?"
}
],
"stream": false
},
"endpoint": "/v1/chat/completions",
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "0fd60cd7-dc72-45b7-808c-4da91de80093",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "Humans live on a planet called Earth.",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 1758920388,
"model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 9,
"prompt_tokens": 17,
"total_tokens": 26,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}


@@ -0,0 +1,527 @@
{
"request": {
"method": "POST",
"url": "https://api.fireworks.ai/inference/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/flux-1-dev-fp8",
"created": 1729532889,
"object": "model",
"owned_by": "fireworks",
"kind": "FLUMINA_BASE_MODEL",
"supports_chat": false,
"supports_image_input": false,
"supports_tools": false
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/tvergho-87e44d/models/debatecards-70b-ft-3epoch-dpo-v2",
"created": 1743381121,
"object": "model",
"owned_by": "tvergho-87e44d",
"kind": "HF_PEFT_ADDON",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": false
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/flux-kontext-max",
"created": 1750714611,
"object": "model",
"owned_by": "fireworks",
"kind": "FLUMINA_BASE_MODEL",
"supports_chat": true,
"supports_image_input": true,
"supports_tools": false
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/flux-kontext-pro",
"created": 1750488264,
"object": "model",
"owned_by": "fireworks",
"kind": "FLUMINA_BASE_MODEL",
"supports_chat": true,
"supports_image_input": true,
"supports_tools": false
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/sentientfoundation-serverless/models/dobby-mini-unhinged-plus-llama-3-1-8b",
"created": 1748467427,
"object": "model",
"owned_by": "sentientfoundation-serverless",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": false,
"context_length": 131072
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/deepseek-v3",
"created": 1735576668,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 131072
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/sentientfoundation/models/dobby-unhinged-llama-3-3-70b-new",
"created": 1739563474,
"object": "model",
"owned_by": "sentientfoundation",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": false,
"context_length": 131072
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/gpt-oss-120b",
"created": 1754345600,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 131072
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/qwen3-coder-480b-a35b-instruct",
"created": 1753211090,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 262144
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/qwen3-30b-a3b-thinking-2507",
"created": 1753916446,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": false
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/qwen3-235b-a22b-instruct-2507",
"created": 1753124424,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 262144
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/qwen3-235b-a22b-thinking-2507",
"created": 1753455434,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": false,
"context_length": 262144
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/qwen3-embedding-8b",
"created": 1755707090,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": false,
"context_length": 40960
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/deepseek-v3-0324",
"created": 1742827220,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 163840
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/deepseek-v3p1-terminus",
"created": 1758586241,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 163840
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/kimi-k2-instruct",
"created": 1752259096,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 131072
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/gpt-oss-20b",
"created": 1754345466,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": false,
"context_length": 131072
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/llama4-maverick-instruct-basic",
"created": 1743878495,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": true,
"supports_tools": true,
"context_length": 1048576
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/qwen3-coder-30b-a3b-instruct",
"created": 1754063588,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": false,
"context_length": 262144
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/llama-v3p3-70b-instruct",
"created": 1733442103,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": false,
"context_length": 131072
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/qwen2p5-vl-32b-instruct",
"created": 1743392739,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": true,
"supports_tools": false,
"context_length": 128000
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/qwen3-235b-a22b",
"created": 1745885249,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 131072
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/glm-4p5-air",
"created": 1754089426,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 131072
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/deepseek-r1",
"created": 1737397673,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": false,
"context_length": 163840
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/llama-v3p1-8b-instruct",
"created": 1721692808,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": false,
"context_length": 131072
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/deepseek-r1-basic",
"created": 1742306746,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": false,
"context_length": 163840
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/deepseek-v3p1",
"created": 1755758988,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 163840
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/flux-1-schnell-fp8",
"created": 1729535376,
"object": "model",
"owned_by": "fireworks",
"kind": "FLUMINA_BASE_MODEL",
"supports_chat": false,
"supports_image_input": false,
"supports_tools": false
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/glm-4p5",
"created": 1753809636,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 131072
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/kimi-k2-instruct-0905",
"created": 1757018994,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 262144
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/llama-v3p1-405b-instruct",
"created": 1721428386,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 131072
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/llama4-scout-instruct-basic",
"created": 1743878279,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": true,
"supports_tools": true,
"context_length": 1048576
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/qwen3-30b-a3b",
"created": 1745878133,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 131072
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/llama-v3p1-70b-instruct",
"created": 1721287357,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 131072
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/deepseek-r1-0528",
"created": 1748456377,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 163840
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/mixtral-8x22b-instruct",
"created": 1713375508,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": true,
"context_length": 65536
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "accounts/fireworks/models/qwen3-30b-a3b-instruct-2507",
"created": 1753808388,
"object": "model",
"owned_by": "fireworks",
"kind": "HF_BASE_MODEL",
"supports_chat": true,
"supports_image_input": false,
"supports_tools": false,
"context_length": 262144
}
}
],
"is_streaming": false
}
}
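
These recorded fixtures wrap every response object in a `__type__`/`__data__` envelope naming the client class that produced it. Below is a minimal sketch of how such an entry could be rehydrated, assuming the client types are pydantic v2 models (the `openai` package's types are); the loader function and the file name are illustrative, not part of llama_stack:

```python
import importlib
import json

def load_recorded_value(obj):
    """Rehydrate one __type__/__data__ envelope into its client object."""
    if isinstance(obj, dict) and "__type__" in obj and "__data__" in obj:
        module_path, _, class_name = obj["__type__"].rpartition(".")
        cls = getattr(importlib.import_module(module_path), class_name)
        # e.g. openai.types.model.Model is a pydantic model, so this validates
        return cls.model_validate(obj["__data__"])
    return obj

with open("models_recording.json") as f:  # hypothetical file name
    recording = json.load(f)

models = [load_recorded_value(item) for item in recording["response"]["body"]]
print(models[0].id, models[0].owned_by)
```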

View file

@ -0,0 +1,834 @@
{
"request": {
"method": "POST",
"url": "https://api.openai.com/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4-0613",
"created": 1686588896,
"object": "model",
"owned_by": "openai"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4",
"created": 1687882411,
"object": "model",
"owned_by": "openai"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-3.5-turbo",
"created": 1677610602,
"object": "model",
"owned_by": "openai"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-5-codex",
"created": 1757527818,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-audio-2025-08-28",
"created": 1756256146,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-realtime",
"created": 1756271701,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-realtime-2025-08-28",
"created": 1756271773,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-audio",
"created": 1756339249,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "davinci-002",
"created": 1692634301,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "babbage-002",
"created": 1692634615,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-3.5-turbo-instruct",
"created": 1692901427,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-3.5-turbo-instruct-0914",
"created": 1694122472,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "dall-e-3",
"created": 1698785189,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "dall-e-2",
"created": 1698798177,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4-1106-preview",
"created": 1698957206,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-3.5-turbo-1106",
"created": 1698959748,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "tts-1-hd",
"created": 1699046015,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "tts-1-1106",
"created": 1699053241,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "tts-1-hd-1106",
"created": 1699053533,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "text-embedding-3-small",
"created": 1705948997,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "text-embedding-3-large",
"created": 1705953180,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4-0125-preview",
"created": 1706037612,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4-turbo-preview",
"created": 1706037777,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-3.5-turbo-0125",
"created": 1706048358,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4-turbo",
"created": 1712361441,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4-turbo-2024-04-09",
"created": 1712601677,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o",
"created": 1715367049,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-2024-05-13",
"created": 1715368132,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-mini-2024-07-18",
"created": 1721172717,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-mini",
"created": 1721172741,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-2024-08-06",
"created": 1722814719,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "chatgpt-4o-latest",
"created": 1723515131,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o1-mini-2024-09-12",
"created": 1725648979,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o1-mini",
"created": 1725649008,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-realtime-preview-2024-10-01",
"created": 1727131766,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-audio-preview-2024-10-01",
"created": 1727389042,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-audio-preview",
"created": 1727460443,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-realtime-preview",
"created": 1727659998,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "omni-moderation-latest",
"created": 1731689265,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "omni-moderation-2024-09-26",
"created": 1732734466,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-realtime-preview-2024-12-17",
"created": 1733945430,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-audio-preview-2024-12-17",
"created": 1734034239,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-mini-realtime-preview-2024-12-17",
"created": 1734112601,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-mini-audio-preview-2024-12-17",
"created": 1734115920,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o1-2024-12-17",
"created": 1734326976,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o1",
"created": 1734375816,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-mini-realtime-preview",
"created": 1734387380,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-mini-audio-preview",
"created": 1734387424,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o3-mini",
"created": 1737146383,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o3-mini-2025-01-31",
"created": 1738010200,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-2024-11-20",
"created": 1739331543,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-search-preview-2025-03-11",
"created": 1741388170,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-search-preview",
"created": 1741388720,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-mini-search-preview-2025-03-11",
"created": 1741390858,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-mini-search-preview",
"created": 1741391161,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-transcribe",
"created": 1742068463,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-mini-transcribe",
"created": 1742068596,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o1-pro-2025-03-19",
"created": 1742251504,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o1-pro",
"created": 1742251791,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-mini-tts",
"created": 1742403959,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o3-2025-04-16",
"created": 1744133301,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o4-mini-2025-04-16",
"created": 1744133506,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o3",
"created": 1744225308,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o4-mini",
"created": 1744225351,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4.1-2025-04-14",
"created": 1744315746,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4.1",
"created": 1744316542,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4.1-mini-2025-04-14",
"created": 1744317547,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4.1-mini",
"created": 1744318173,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4.1-nano-2025-04-14",
"created": 1744321025,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4.1-nano",
"created": 1744321707,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-image-1",
"created": 1745517030,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "codex-mini-latest",
"created": 1746673257,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o3-pro",
"created": 1748475349,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-realtime-preview-2025-06-03",
"created": 1748907838,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-4o-audio-preview-2025-06-03",
"created": 1748908498,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o3-pro-2025-06-10",
"created": 1749166761,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o4-mini-deep-research",
"created": 1749685485,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o3-deep-research",
"created": 1749840121,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o3-deep-research-2025-06-26",
"created": 1750865219,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "o4-mini-deep-research-2025-06-26",
"created": 1750866121,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-5-chat-latest",
"created": 1754073306,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-5-2025-08-07",
"created": 1754075360,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-5",
"created": 1754425777,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-5-mini-2025-08-07",
"created": 1754425867,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-5-mini",
"created": 1754425928,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-5-nano-2025-08-07",
"created": 1754426303,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-5-nano",
"created": 1754426384,
"object": "model",
"owned_by": "system"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "gpt-3.5-turbo-16k",
"created": 1683758102,
"object": "model",
"owned_by": "openai-internal"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "tts-1",
"created": 1681940951,
"object": "model",
"owned_by": "openai-internal"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "whisper-1",
"created": 1677532384,
"object": "model",
"owned_by": "openai-internal"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "text-embedding-ada-002",
"created": 1671217299,
"object": "model",
"owned_by": "openai-internal"
}
}
],
"is_streaming": false
}
}

View file

@ -0,0 +1,96 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "nomic-embed-text:latest",
"created": 1756922046,
"object": "model",
"owned_by": "library"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "all-minilm:l6-v2",
"created": 1756919946,
"object": "model",
"owned_by": "library"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "llama3.2-vision:11b",
"created": 1753926302,
"object": "model",
"owned_by": "library"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "llama3.2-vision:latest",
"created": 1753845527,
"object": "model",
"owned_by": "library"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "llama-guard3:1b",
"created": 1753479584,
"object": "model",
"owned_by": "library"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "llama3.2:1b",
"created": 1752814944,
"object": "model",
"owned_by": "library"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "all-minilm:latest",
"created": 1748994610,
"object": "model",
"owned_by": "library"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "llama3.2:3b",
"created": 1746123323,
"object": "model",
"owned_by": "library"
}
},
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "llama3.2:3b-instruct-fp16",
"created": 1746052428,
"object": "model",
"owned_by": "library"
}
}
],
"is_streaming": false
}
}

View file

@ -127,9 +127,8 @@ SETUP_DEFINITIONS: dict[str, Setup] = {
        name="fireworks",
        description="Fireworks provider with a text model",
        defaults={
-            "text_model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
-            "vision_model": "accounts/fireworks/models/llama-v3p2-90b-vision-instruct",
-            "embedding_model": "nomic-ai/nomic-embed-text-v1.5",
+            "text_model": "fireworks/accounts/fireworks/models/llama-v3p1-8b-instruct",
+            "embedding_model": "fireworks/accounts/fireworks/models/qwen3-embedding-8b",
        },
    ),
}

View file

@ -32,8 +32,8 @@ def setup_telemetry_data(llama_stack_client, text_model_id):
    )

    for i in range(2):
-        llama_stack_client.inference.chat_completion(
-            model_id=text_model_id, messages=[{"role": "user", "content": f"Test trace {i}"}]
+        llama_stack_client.chat.completions.create(
+            model=text_model_id, messages=[{"role": "user", "content": f"Test trace {i}"}]
        )

    start_time = time.time()
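
This hunk is part of the migration off the deprecated `inference.chat_completion` surface. For context, a minimal sketch of the OpenAI-compatible call it switches to, with response access following the standard OpenAI shape (the model id here is illustrative):

```python
response = llama_stack_client.chat.completions.create(
    model="llama3.2:3b",  # illustrative model id
    messages=[{"role": "user", "content": "Test trace 0"}],
)
print(response.choices[0].message.content)  # standard OpenAI response shape
```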

View file

@ -83,13 +83,20 @@
      ],
      "tools": [
        {
-          "tool_name": "get_weather",
+          "type": "function",
+          "function": {
+            "name": "get_weather",
            "description": "Get the current weather",
            "parameters": {
+              "type": "object",
+              "properties": {
                "location": {
-                  "param_type": "string",
+                  "type": "string",
                  "description": "The city and state (both required), e.g. San Francisco, CA."
                }
+              },
+              "required": ["location"]
+            }
          }
        }
      ],
@ -116,13 +123,20 @@
      ],
      "tools": [
        {
-          "tool_name": "get_weather",
+          "type": "function",
+          "function": {
+            "name": "get_weather",
            "description": "Get the current weather",
            "parameters": {
+              "type": "object",
+              "properties": {
                "location": {
-                  "param_type": "string",
+                  "type": "string",
                  "description": "The city and state (both required), e.g. San Francisco, CA."
                }
+              },
+              "required": ["location"]
+            }
          }
        }
      ],
@ -162,13 +176,20 @@
      ],
      "tools": [
        {
-          "tool_name": "get_weather",
+          "type": "function",
+          "function": {
+            "name": "get_weather",
            "description": "Get the current weather",
            "parameters": {
+              "type": "object",
+              "properties": {
                "location": {
-                  "param_type": "string",
+                  "type": "string",
                  "description": "The city and state (both required), e.g. San Francisco, CA."
                }
+              },
+              "required": ["location"]
+            }
          }
        }
      ],
@ -192,66 +213,6 @@
      ]
    }
  },
-  "array_parameter": {
-    "data": {
-      "messages": [
-        [
-          {
-            "role": "user",
-            "content": "Please add a new product with name 'Widget', price 19.99, in stock, and tags ['new', 'sale'] and give me the product id."
-          }
-        ]
-      ],
-      "tools": [
-        {
-          "tool_name": "addProduct",
-          "description": "Get the current weather",
-          "parameters": {
-            "name": {
-              "param_type": "string",
-              "description": "Name of the product"
-            },
-            "price": {
-              "param_type": "number",
-              "description": "Price of the product"
-            },
-            "inStock": {
-              "param_type": "boolean",
-              "description": "Availability status of the product."
-            },
-            "tags": {
-              "param_type": "list[str]",
-              "description": "List of product tags"
-            }
-          }
-        }
-      ],
-      "tool_responses": [
-        {
-          "response": "{'response': 'Successfully added product with id: 123'}"
-        }
-      ],
-      "expected": [
-        {
-          "num_tool_calls": 1,
-          "tool_name": "addProduct",
-          "tool_arguments": {
-            "name": "Widget",
-            "price": 19.99,
-            "inStock": true,
-            "tags": [
-              "new",
-              "sale"
-            ]
-          }
-        },
-        {
-          "num_tool_calls": 0,
-          "answer": "123"
-        }
-      ]
-    }
-  },
  "sample_messages_tool_calling": {
    "data": {
      "messages": [
@ -270,13 +231,19 @@
      ],
      "tools": [
        {
-          "tool_name": "get_weather",
+          "type": "function",
+          "function": {
+            "name": "get_weather",
            "description": "Get the current weather",
            "parameters": {
+              "type": "object",
+              "properties": {
                "location": {
-                  "param_type": "string",
-                  "description": "The city and state, e.g. San Francisco, CA",
-                  "required": true
+                  "type": "string",
+                  "description": "The city and state (both required), e.g. San Francisco, CA."
                }
+              },
+              "required": ["location"]
+            }
          }
        }
@ -343,18 +310,23 @@
      ],
      "tools": [
        {
-          "tool_name": "get_object_namespace_list",
+          "type": "function",
+          "function": {
+            "name": "get_object_namespace_list",
            "description": "Get the list of objects in a namespace",
            "parameters": {
+              "type": "object",
+              "properties": {
                "kind": {
-                  "param_type": "string",
-                  "description": "the type of object",
-                  "required": true
+                  "type": "string",
+                  "description": "the type of object"
                },
                "namespace": {
-                  "param_type": "string",
-                  "description": "the name of the namespace",
-                  "required": true
+                  "type": "string",
+                  "description": "the name of the namespace"
                }
+              },
+              "required": ["kind", "namespace"]
+            }
          }
        }
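
After this change, every fixture declares its tools in the standard OpenAI function-calling schema. For reference, the complete converted shape of the `get_weather` tool, assembled from the hunks above:

```python
get_weather_tool = {
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "The city and state (both required), e.g. San Francisco, CA.",
                }
            },
            "required": ["location"],
        },
    },
}
```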

View file

@ -31,6 +31,11 @@ def test_mcp_invocation(llama_stack_client, text_model_id, mcp_server):
    uri = mcp_server["server_url"]

    # registering should not raise an error anymore even if you don't specify the auth token
+    try:
+        llama_stack_client.toolgroups.unregister(toolgroup_id=test_toolgroup_id)
+    except Exception:
+        pass
+
    llama_stack_client.toolgroups.register(
        toolgroup_id=test_toolgroup_id,
        provider_id="model-context-protocol",

View file

@ -107,14 +107,34 @@ async def test_get_raw_document_text_deprecated_text_yaml_with_text_content_item
        assert "text/yaml" in str(w[0].message)


+async def test_get_raw_document_text_supports_json_mime_type():
+    """Test that the function accepts application/json mime type."""
+    json_content = '{"name": "test", "version": "1.0", "items": ["item1", "item2"]}'
+
+    document = Document(content=json_content, mime_type="application/json")
+    result = await get_raw_document_text(document)
+    assert result == json_content
+
+
+async def test_get_raw_document_text_with_json_text_content_item():
+    """Test that the function handles JSON TextContentItem correctly."""
+    json_content = '{"key": "value", "nested": {"array": [1, 2, 3]}}'
+
+    document = Document(content=TextContentItem(text=json_content), mime_type="application/json")
+    result = await get_raw_document_text(document)
+    assert result == json_content
+
+
async def test_get_raw_document_text_rejects_unsupported_mime_types():
    """Test that the function rejects unsupported mime types."""
    document = Document(
        content="Some content",
-        mime_type="application/json",  # Not supported
+        mime_type="application/pdf",  # Not supported
    )

-    with pytest.raises(ValueError, match="Unexpected document mime type: application/json"):
+    with pytest.raises(ValueError, match="Unexpected document mime type: application/pdf"):
        await get_raw_document_text(document)
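
Taken together, these tests pin down the mime-type gate: `application/json` is now accepted alongside the text types, while binary types such as `application/pdf` are rejected. A hypothetical sketch of that gate follows; the names and the exact accepted set are illustrative, not the real llama_stack implementation:

```python
# Hypothetical sketch only; the actual implementation and full set of
# accepted mime types live in llama_stack and may differ.
ACCEPTED_MIME_TYPES = {"text/plain", "text/yaml", "application/json"}

async def get_raw_document_text_sketch(document) -> str:
    if document.mime_type not in ACCEPTED_MIME_TYPES:
        raise ValueError(f"Unexpected document mime type: {document.mime_type}")
    content = document.content
    # Document.content may be a plain string or a TextContentItem
    return content.text if hasattr(content, "text") else content
```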

View file

@ -16,9 +16,11 @@ from llama_stack.apis.agents import (
)
from llama_stack.apis.common.responses import PaginatedResponse
from llama_stack.apis.inference import Inference
+from llama_stack.apis.resource import ResourceType
from llama_stack.apis.safety import Safety
-from llama_stack.apis.tools import ToolGroups, ToolRuntime
+from llama_stack.apis.tools import ListToolsResponse, Tool, ToolGroups, ToolParameter, ToolRuntime
from llama_stack.apis.vector_io import VectorIO
+from llama_stack.providers.inline.agents.meta_reference.agent_instance import ChatAgent
from llama_stack.providers.inline.agents.meta_reference.agents import MetaReferenceAgentsImpl
from llama_stack.providers.inline.agents.meta_reference.config import MetaReferenceAgentsImplConfig
from llama_stack.providers.inline.agents.meta_reference.persistence import AgentInfo
@ -75,11 +77,11 @@ def sample_agent_config():
        },
        input_shields=["string"],
        output_shields=["string"],
-        toolgroups=["string"],
+        toolgroups=["mcp::my_mcp_server"],
        client_tools=[
            {
-                "name": "string",
-                "description": "string",
+                "name": "client_tool",
+                "description": "Client Tool",
                "parameters": [
                    {
                        "name": "string",
@ -226,3 +228,83 @@ async def test_delete_agent(agents_impl, sample_agent_config):
    # Verify the agent was deleted
    with pytest.raises(ValueError):
        await agents_impl.get_agent(agent_id)
+
+
+async def test__initialize_tools(agents_impl, sample_agent_config):
+    # Mock tool_groups_api.list_tools()
+    agents_impl.tool_groups_api.list_tools.return_value = ListToolsResponse(
+        data=[
+            Tool(
+                identifier="story_maker",
+                provider_id="model-context-protocol",
+                type=ResourceType.tool,
+                toolgroup_id="mcp::my_mcp_server",
+                description="Make a story",
+                parameters=[
+                    ToolParameter(
+                        name="story_title",
+                        parameter_type="string",
+                        description="Title of the story",
+                        required=True,
+                        title="Story Title",
+                    ),
+                    ToolParameter(
+                        name="input_words",
+                        parameter_type="array",
+                        description="Input words",
+                        required=False,
+                        items={"type": "string"},
+                        title="Input Words",
+                        default=[],
+                    ),
+                ],
+            )
+        ]
+    )
+
+    create_response = await agents_impl.create_agent(sample_agent_config)
+    agent_id = create_response.agent_id
+
+    # Get an instance of ChatAgent
+    chat_agent = await agents_impl._get_agent_impl(agent_id)
+    assert chat_agent is not None
+    assert isinstance(chat_agent, ChatAgent)
+
+    # Initialize tool definitions
+    await chat_agent._initialize_tools()
+    assert len(chat_agent.tool_defs) == 2
+
+    # Verify the first tool, which is a client tool
+    first_tool = chat_agent.tool_defs[0]
+    assert first_tool.tool_name == "client_tool"
+    assert first_tool.description == "Client Tool"
+
+    # Verify the second tool, which is an MCP tool that has an array-type property
+    second_tool = chat_agent.tool_defs[1]
+    assert second_tool.tool_name == "story_maker"
+    assert second_tool.description == "Make a story"
+    parameters = second_tool.parameters
+    assert len(parameters) == 2
+
+    # Verify a string property
+    story_title = parameters.get("story_title")
+    assert story_title is not None
+    assert story_title.param_type == "string"
+    assert story_title.description == "Title of the story"
+    assert story_title.required
+    assert story_title.items is None
+    assert story_title.title == "Story Title"
+    assert story_title.default is None
+
+    # Verify an array property
+    input_words = parameters.get("input_words")
+    assert input_words is not None
+    assert input_words.param_type == "array"
+    assert input_words.description == "Input words"
+    assert not input_words.required
+    assert input_words.items is not None
+    assert len(input_words.items) == 1
+    assert input_words.items.get("type") == "string"
+    assert input_words.title == "Input Words"
+    assert input_words.default == []

View file

@ -3,5 +3,3 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
-
-from .batch_inference import *

View file

@ -0,0 +1,147 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""
Unit tests for MCP tool parameter conversion in streaming responses.
This tests the fix for handling array-type parameters with 'items' field
when converting MCP tool definitions to OpenAI format.
"""
from llama_stack.apis.tools import ToolDef, ToolParameter
from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool
def test_mcp_tool_conversion_with_array_items():
"""
Test that MCP tool parameters with array type and items field are properly converted.
This is a regression test for the bug where array parameters without 'items'
caused OpenAI API validation errors like:
"Invalid schema for function 'pods_exec': In context=('properties', 'command'),
array schema missing items."
"""
# Create a tool parameter with array type and items specification
# This mimics what kubernetes-mcp-server's pods_exec tool has
tool_param = ToolParameter(
name="command",
parameter_type="array",
description="Command to execute in the pod",
required=True,
items={"type": "string"}, # This is the crucial field
)
# Convert to ToolDefinition format (as done in streaming.py)
tool_def = ToolDefinition(
tool_name="test_tool",
description="Test tool with array parameter",
parameters={
"command": ToolParamDefinition(
param_type=tool_param.parameter_type,
description=tool_param.description,
required=tool_param.required,
default=tool_param.default,
items=tool_param.items, # The fix: ensure items is passed through
)
},
)
# Convert to OpenAI format
openai_tool = convert_tooldef_to_openai_tool(tool_def)
# Verify the conversion includes the items field
assert openai_tool["type"] == "function"
assert openai_tool["function"]["name"] == "test_tool"
assert "parameters" in openai_tool["function"]
parameters = openai_tool["function"]["parameters"]
assert "properties" in parameters
assert "command" in parameters["properties"]
command_param = parameters["properties"]["command"]
assert command_param["type"] == "array"
assert "items" in command_param, "Array parameter must have 'items' field for OpenAI API"
assert command_param["items"] == {"type": "string"}
def test_mcp_tool_conversion_without_array():
"""Test that non-array parameters work correctly without items field."""
tool_param = ToolParameter(
name="name",
parameter_type="string",
description="Name parameter",
required=True,
)
tool_def = ToolDefinition(
tool_name="test_tool",
description="Test tool with string parameter",
parameters={
"name": ToolParamDefinition(
param_type=tool_param.parameter_type,
description=tool_param.description,
required=tool_param.required,
items=tool_param.items, # Will be None for non-array types
)
},
)
openai_tool = convert_tooldef_to_openai_tool(tool_def)
# Verify basic structure
assert openai_tool["type"] == "function"
parameters = openai_tool["function"]["parameters"]
assert "name" in parameters["properties"]
name_param = parameters["properties"]["name"]
assert name_param["type"] == "string"
# items should not be present for non-array types
assert "items" not in name_param or name_param.get("items") is None
def test_mcp_tool_conversion_complex_array_items():
"""Test array parameter with complex items schema (object type)."""
tool_param = ToolParameter(
name="configs",
parameter_type="array",
description="Array of configuration objects",
required=False,
items={
"type": "object",
"properties": {
"key": {"type": "string"},
"value": {"type": "string"},
},
"required": ["key"],
},
)
tool_def = ToolDefinition(
tool_name="test_tool",
description="Test tool with complex array parameter",
parameters={
"configs": ToolParamDefinition(
param_type=tool_param.parameter_type,
description=tool_param.description,
required=tool_param.required,
items=tool_param.items,
)
},
)
openai_tool = convert_tooldef_to_openai_tool(tool_def)
# Verify complex items schema is preserved
parameters = openai_tool["function"]["parameters"]
configs_param = parameters["properties"]["configs"]
assert configs_param["type"] == "array"
assert "items" in configs_param
assert configs_param["items"]["type"] == "object"
assert "properties" in configs_param["items"]
assert "key" in configs_param["items"]["properties"]
assert "value" in configs_param["items"]["properties"]

View file

@ -4,11 +4,11 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

-from unittest.mock import MagicMock, PropertyMock, patch
+from unittest.mock import AsyncMock, MagicMock, PropertyMock, patch

import pytest

-from llama_stack.apis.inference import Model
+from llama_stack.apis.inference import Model, OpenAIUserMessageParam
from llama_stack.apis.models import ModelType
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
@ -43,8 +43,17 @@ class OpenAIMixinWithEmbeddingsImpl(OpenAIMixin):
@pytest.fixture
def mixin():
-    """Create a test instance of OpenAIMixin"""
-    return OpenAIMixinImpl()
+    """Create a test instance of OpenAIMixin with mocked model_store"""
+    mixin_instance = OpenAIMixinImpl()
+
+    # just enough to satisfy _get_provider_model_id calls
+    mock_model_store = MagicMock()
+    mock_model = MagicMock()
+    mock_model.provider_resource_id = "test-provider-resource-id"
+    mock_model_store.get_model = AsyncMock(return_value=mock_model)
+    mixin_instance.model_store = mock_model_store
+
+    return mixin_instance


@pytest.fixture
@ -205,6 +214,74 @@ class TestOpenAIMixinCacheBehavior:
        assert "final-mock-model-id" in mixin._model_cache


+class TestOpenAIMixinImagePreprocessing:
+    """Test cases for image preprocessing functionality"""
+
+    async def test_openai_chat_completion_with_image_preprocessing_enabled(self, mixin):
+        """Test that image URLs are converted to base64 when download_images is True"""
+        mixin.download_images = True
+
+        message = OpenAIUserMessageParam(
+            role="user",
+            content=[
+                {"type": "text", "text": "What's in this image?"},
+                {"type": "image_url", "image_url": {"url": "http://example.com/image.jpg"}},
+            ],
+        )
+
+        mock_client = MagicMock()
+        mock_response = MagicMock()
+        mock_client.chat.completions.create = AsyncMock(return_value=mock_response)
+
+        with patch.object(type(mixin), "client", new_callable=PropertyMock, return_value=mock_client):
+            with patch("llama_stack.providers.utils.inference.openai_mixin.localize_image_content") as mock_localize:
+                mock_localize.return_value = (b"fake_image_data", "jpeg")
+
+                await mixin.openai_chat_completion(model="test-model", messages=[message])
+
+        mock_localize.assert_called_once_with("http://example.com/image.jpg")
+
+        mock_client.chat.completions.create.assert_called_once()
+        call_args = mock_client.chat.completions.create.call_args
+        processed_messages = call_args[1]["messages"]
+        assert len(processed_messages) == 1
+        content = processed_messages[0]["content"]
+        assert len(content) == 2
+        assert content[0]["type"] == "text"
+        assert content[1]["type"] == "image_url"
+        assert content[1]["image_url"]["url"] == "data:image/jpeg;base64,ZmFrZV9pbWFnZV9kYXRh"
+
+    async def test_openai_chat_completion_with_image_preprocessing_disabled(self, mixin):
+        """Test that image URLs are not modified when download_images is False"""
+        mixin.download_images = False  # explicitly set to False
+
+        message = OpenAIUserMessageParam(
+            role="user",
+            content=[
+                {"type": "text", "text": "What's in this image?"},
+                {"type": "image_url", "image_url": {"url": "http://example.com/image.jpg"}},
+            ],
+        )
+
+        mock_client = MagicMock()
+        mock_response = MagicMock()
+        mock_client.chat.completions.create = AsyncMock(return_value=mock_response)
+
+        with patch.object(type(mixin), "client", new_callable=PropertyMock, return_value=mock_client):
+            with patch("llama_stack.providers.utils.inference.openai_mixin.localize_image_content") as mock_localize:
+                await mixin.openai_chat_completion(model="test-model", messages=[message])
+
+                mock_localize.assert_not_called()
+
+        mock_client.chat.completions.create.assert_called_once()
+        call_args = mock_client.chat.completions.create.call_args
+        processed_messages = call_args[1]["messages"]
+        assert len(processed_messages) == 1
+        content = processed_messages[0]["content"]
+        assert len(content) == 2
+        assert content[1]["image_url"]["url"] == "http://example.com/image.jpg"

class TestOpenAIMixinEmbeddingModelMetadata:
    """Test cases for embedding_model_metadata attribute functionality"""
"""Test cases for embedding_model_metadata attribute functionality""" """Test cases for embedding_model_metadata attribute functionality"""