Merge 33d27393f4 into sapling-pr-archive-ehhuang

2025-12-04 02:03:44 +00:00 · 2025-10-15 14:19:34 -07:00 · 2025-10-15 14:19:34 -07:00 · 460097bd7b
commit 460097bd7b
parent 980e46d1f7 33d27393f4
37 changed files with 10737 additions and 1079 deletions
--- a/docs/docs/building_applications/telemetry.mdx
+++ b/docs/docs/building_applications/telemetry.mdx
@ -10,58 +10,8 @@ import TabItem from '@theme/TabItem';

 # Telemetry

-The Llama Stack telemetry system provides comprehensive tracing, metrics, and logging capabilities. It supports multiple sink types including OpenTelemetry, SQLite, and Console output for complete observability of your AI applications.
+The Llama Stack uses OpenTelemetry to provide comprehensive tracing, metrics, and logging capabilities.

-## Event Types
-
-The telemetry system supports three main types of events:
-
-<Tabs>
-<TabItem value="unstructured" label="Unstructured Logs">
-
-Free-form log messages with severity levels for general application logging:
-
-```python
-unstructured_log_event = UnstructuredLogEvent(
-    message="This is a log message",
-    severity=LogSeverity.INFO
-)
-```
-
-</TabItem>
-<TabItem value="metrics" label="Metric Events">
-
-Numerical measurements with units for tracking performance and usage:
-
-```python
-metric_event = MetricEvent(
-    metric="my_metric",
-    value=10,
-    unit="count"
-)
-```
-
-</TabItem>
-<TabItem value="structured" label="Structured Logs">
-
-System events like span start/end that provide structured operation tracking:
-
-```python
-structured_log_event = SpanStartPayload(
-    name="my_span",
-    parent_span_id="parent_span_id"
-)
-```
-
-</TabItem>
-</Tabs>
-
-## Spans and Traces
-
- **Spans**: Represent individual operations with timing information and hierarchical relationships
- **Traces**: Collections of related spans that form a complete request flow across your application
-
-This hierarchical structure allows you to understand the complete execution path of requests through your Llama Stack application.

 ## Automatic Metrics Generation

@ -129,21 +79,6 @@ Send events to an OpenTelemetry Collector for integration with observability pla
 - Compatible with all OpenTelemetry collectors
 - Supports both traces and metrics

-</TabItem>
-<TabItem value="sqlite" label="SQLite">
-
-Store events in a local SQLite database for direct querying:
-
-**Use Cases:**
- Local development and debugging
- Custom analytics and reporting
- Offline analysis of application behavior
-
-**Features:**
- Direct SQL querying capabilities
- Persistent local storage
- No external dependencies
-
 </TabItem>
 <TabItem value="console" label="Console">

@ -174,9 +109,8 @@ telemetry:
    provider_type: inline::meta-reference
    config:
      service_name: "llama-stack-service"
-      sinks: ['console', 'sqlite', 'otel_trace', 'otel_metric']
+      sinks: ['console', 'otel_trace', 'otel_metric']
      otel_exporter_otlp_endpoint: "http://localhost:4318"
-      sqlite_db_path: "/path/to/telemetry.db"
 ```

 ### Environment Variables
@ -185,7 +119,7 @@ Configure telemetry behavior using environment variables:

 - **`OTEL_EXPORTER_OTLP_ENDPOINT`**: OpenTelemetry Collector endpoint (default: `http://localhost:4318`)
 - **`OTEL_SERVICE_NAME`**: Service name for telemetry (default: empty string)
- **`TELEMETRY_SINKS`**: Comma-separated list of sinks (default: `console,sqlite`)
+- **`TELEMETRY_SINKS`**: Comma-separated list of sinks, e.g. `console,otel_trace` (default: unset, which enables no sinks)

 ### Quick Setup: Complete Telemetry Stack

@ -248,37 +182,10 @@ Forward metrics to other observability systems:
 </TabItem>
 </Tabs>

-## SQLite Querying
-
-The `sqlite` sink allows you to query traces without an external system. This is particularly useful for development and custom analytics.
-
-### Example Queries
-
-```sql
-- Query recent traces
-SELECT * FROM traces WHERE timestamp > datetime('now', '-1 hour');
-
-- Analyze span durations
-SELECT name, AVG(duration_ms) as avg_duration
-FROM spans
-GROUP BY name
-ORDER BY avg_duration DESC;
-
-- Find slow operations
-SELECT * FROM spans
-WHERE duration_ms > 1000
-ORDER BY duration_ms DESC;
-```
-
-:::tip[Advanced Analytics]
-Refer to the [Getting Started notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb) for more examples on querying traces and spans programmatically.
-:::
-
 ## Best Practices

 ### 🔍 **Monitoring Strategy**
 - Use OpenTelemetry for production environments
- Combine multiple sinks for development (console + SQLite)
 - Set up alerts on key metrics like token usage and error rates

 ### 📊 **Metrics Analysis**
@ -293,45 +200,8 @@ Refer to the [Getting Started notebook](https://github.com/meta-llama/llama-stac

 ### 🔧 **Configuration Management**
 - Use environment variables for flexible deployment
- Configure appropriate retention policies for SQLite
 - Ensure proper network access to OpenTelemetry collectors

-## Integration Examples
-
-### Basic Telemetry Setup
-
-```python
-from llama_stack_client import LlamaStackClient
-
-# Client with telemetry headers
-client = LlamaStackClient(
-    base_url="http://localhost:8000",
-    extra_headers={
-        "X-Telemetry-Service": "my-ai-app",
-        "X-Telemetry-Version": "1.0.0"
-    }
-)
-
-# All API calls will be automatically traced
-response = client.chat.completions.create(
-    model="meta-llama/Llama-3.2-3B-Instruct",
-    messages=[{"role": "user", "content": "Hello!"}]
-)
-```
-
-### Custom Telemetry Context
-
-```python
-# Add custom span attributes for better tracking
-with tracer.start_as_current_span("custom_operation") as span:
-    span.set_attribute("user_id", "user123")
-    span.set_attribute("operation_type", "chat_completion")
-
-    response = client.chat.completions.create(
-        model="meta-llama/Llama-3.2-3B-Instruct",
-        messages=[{"role": "user", "content": "Hello!"}]
-    )
-```

 ## Related Resources

--- a/docs/docs/distributions/self_hosted_distro/starter.md
+++ b/docs/docs/distributions/self_hosted_distro/starter.md
@ -119,7 +119,7 @@ The following environment variables can be configured:

 ### Telemetry Configuration
 - `OTEL_SERVICE_NAME`: OpenTelemetry service name
- `TELEMETRY_SINKS`: Telemetry sinks (default: `console,sqlite`)
+- `TELEMETRY_SINKS`: Comma-separated list of telemetry sinks (default: unset, which enables no sinks)

 ## Enabling Providers

@ -216,7 +216,6 @@ The starter distribution uses SQLite for local storage of various components:
 - **Files metadata**: `~/.llama/distributions/starter/files_metadata.db`
 - **Agents store**: `~/.llama/distributions/starter/agents_store.db`
 - **Responses store**: `~/.llama/distributions/starter/responses_store.db`
- **Trace store**: `~/.llama/distributions/starter/trace_store.db`
 - **Evaluation store**: `~/.llama/distributions/starter/meta_reference_eval.db`
 - **Dataset I/O stores**: Various HuggingFace and local filesystem stores

--- a/docs/docs/providers/telemetry/inline_meta-reference.mdx
+++ b/docs/docs/providers/telemetry/inline_meta-reference.mdx
@ -16,14 +16,12 @@ Meta's reference implementation of telemetry and observability using OpenTelemet
 |-------|------|----------|---------|-------------|
 | `otel_exporter_otlp_endpoint` | `str \| None` | No |  | The OpenTelemetry collector endpoint URL (base URL for traces, metrics, and logs). If not set, the SDK will use OTEL_EXPORTER_OTLP_ENDPOINT environment variable. |
 | `service_name` | `<class 'str'>` | No |  | The service name to use for telemetry |
-| `sinks` | `list[inline.telemetry.meta_reference.config.TelemetrySink` | No | [&lt;TelemetrySink.SQLITE: 'sqlite'&gt;] | List of telemetry sinks to enable (possible values: otel_trace, otel_metric, sqlite, console) |
-| `sqlite_db_path` | `<class 'str'>` | No | ~/.llama/runtime/trace_store.db | The path to the SQLite database to use for storing traces |
+| `sinks` | `list[inline.telemetry.meta_reference.config.TelemetrySink]` | No | [] | List of telemetry sinks to enable (possible values: otel_trace, otel_metric, console) |

 ## Sample Configuration

 ```yaml
 service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-sinks: ${env.TELEMETRY_SINKS:=sqlite}
-sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/trace_store.db
+sinks: ${env.TELEMETRY_SINKS:=}
 otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}
 ```
--- a/llama_stack/apis/telemetry/telemetry.py
+++ b/llama_stack/apis/telemetry/telemetry.py
@ -421,104 +421,3 @@ class Telemetry(Protocol):
        :param ttl_seconds: The time to live of the event.
        """
        ...
-
-    async def query_traces(
-        self,
-        attribute_filters: list[QueryCondition] | None = None,
-        limit: int | None = 100,
-        offset: int | None = 0,
-        order_by: list[str] | None = None,
-    ) -> QueryTracesResponse:
-        """Query traces.
-
-        :param attribute_filters: The attribute filters to apply to the traces.
-        :param limit: The limit of traces to return.
-        :param offset: The offset of the traces to return.
-        :param order_by: The order by of the traces to return.
-        :returns: A QueryTracesResponse.
-        """
-        ...
-
-    async def get_trace(self, trace_id: str) -> Trace:
-        """Get a trace by its ID.
-
-        :param trace_id: The ID of the trace to get.
-        :returns: A Trace.
-        """
-        ...
-
-    async def get_span(self, trace_id: str, span_id: str) -> Span:
-        """Get a span by its ID.
-
-        :param trace_id: The ID of the trace to get the span from.
-        :param span_id: The ID of the span to get.
-        :returns: A Span.
-        """
-        ...
-
-    async def get_span_tree(
-        self,
-        span_id: str,
-        attributes_to_return: list[str] | None = None,
-        max_depth: int | None = None,
-    ) -> QuerySpanTreeResponse:
-        """Get a span tree by its ID.
-
-        :param span_id: The ID of the span to get the tree from.
-        :param attributes_to_return: The attributes to return in the tree.
-        :param max_depth: The maximum depth of the tree.
-        :returns: A QuerySpanTreeResponse.
-        """
-        ...
-
-    async def query_spans(
-        self,
-        attribute_filters: list[QueryCondition],
-        attributes_to_return: list[str],
-        max_depth: int | None = None,
-    ) -> QuerySpansResponse:
-        """Query spans.
-
-        :param attribute_filters: The attribute filters to apply to the spans.
-        :param attributes_to_return: The attributes to return in the spans.
-        :param max_depth: The maximum depth of the tree.
-        :returns: A QuerySpansResponse.
-        """
-        ...
-
-    async def save_spans_to_dataset(
-        self,
-        attribute_filters: list[QueryCondition],
-        attributes_to_save: list[str],
-        dataset_id: str,
-        max_depth: int | None = None,
-    ) -> None:
-        """Save spans to a dataset.
-
-        :param attribute_filters: The attribute filters to apply to the spans.
-        :param attributes_to_save: The attributes to save to the dataset.
-        :param dataset_id: The ID of the dataset to save the spans to.
-        :param max_depth: The maximum depth of the tree.
-        """
-        ...
-
-    async def query_metrics(
-        self,
-        metric_name: str,
-        start_time: int,
-        end_time: int | None = None,
-        granularity: str | None = None,
-        query_type: MetricQueryType = MetricQueryType.RANGE,
-        label_matchers: list[MetricLabelMatcher] | None = None,
-    ) -> QueryMetricsResponse:
-        """Query metrics.
-
-        :param metric_name: The name of the metric to query.
-        :param start_time: The start time of the metric to query.
-        :param end_time: The end time of the metric to query.
-        :param granularity: The granularity of the metric to query.
-        :param query_type: The type of query to perform.
-        :param label_matchers: The label matchers to apply to the metric.
-        :returns: A QueryMetricsResponse.
-        """
-        ...
--- a/llama_stack/apis/vector_io/vector_io.py
+++ b/llama_stack/apis/vector_io/vector_io.py
@ -93,6 +93,22 @@ class Chunk(BaseModel):

        return generate_chunk_id(str(uuid.uuid4()), str(self.content))

+    @property
+    def document_id(self) -> str | None:
+        """Returns the document_id from either metadata or chunk_metadata, with metadata taking precedence."""
+        # Check metadata first (takes precedence)
+        doc_id = self.metadata.get("document_id")
+        if doc_id is not None:
+            if not isinstance(doc_id, str):
+                raise TypeError(f"metadata['document_id'] must be a string, got {type(doc_id).__name__}: {doc_id!r}")
+            return doc_id
+
+        # Fall back to chunk_metadata if available (Pydantic ensures type safety)
+        if self.chunk_metadata is not None:
+            return self.chunk_metadata.document_id
+
+        return None
+

@json_schema_type
 class QueryChunksResponse(BaseModel):
--- a/llama_stack/core/library_client.py
+++ b/llama_stack/core/library_client.py
@ -207,8 +207,9 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
        super().__init__()
        # when using the library client, we should not log to console since many
        # of our logs are intended for server-side usage
-        current_sinks = os.environ.get("TELEMETRY_SINKS", "sqlite").split(",")
-        os.environ["TELEMETRY_SINKS"] = ",".join(sink for sink in current_sinks if sink != "console")
+        if sinks_from_env := os.environ.get("TELEMETRY_SINKS", None):
+            current_sinks = sinks_from_env.strip().lower().split(",")
+            os.environ["TELEMETRY_SINKS"] = ",".join(sink for sink in current_sinks if sink != "console")

        if in_notebook():
            import nest_asyncio
--- a/llama_stack/core/routers/vector_io.py
+++ b/llama_stack/core/routers/vector_io.py
@ -93,8 +93,10 @@ class VectorIORouter(VectorIO):
        chunks: list[Chunk],
        ttl_seconds: int | None = None,
    ) -> None:
+        doc_ids = [chunk.document_id for chunk in chunks[:3]]
        logger.debug(
-            f"VectorIORouter.insert_chunks: {vector_db_id}, {len(chunks)} chunks, ttl_seconds={ttl_seconds}, chunk_ids={[chunk.metadata['document_id'] for chunk in chunks[:3]]}{' and more...' if len(chunks) > 3 else ''}",
+            f"VectorIORouter.insert_chunks: {vector_db_id}, {len(chunks)} chunks, "
+            f"ttl_seconds={ttl_seconds}, chunk_ids={doc_ids}{' and more...' if len(chunks) > 3 else ''}"
        )
        provider = await self.routing_table.get_provider_impl(vector_db_id)
        return await provider.insert_chunks(vector_db_id, chunks, ttl_seconds)
--- a/llama_stack/distributions/ci-tests/run.yaml
+++ b/llama_stack/distributions/ci-tests/run.yaml
@ -159,8 +159,7 @@ providers:
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/trace_store.db
+      sinks: ${env.TELEMETRY_SINKS:=}
      otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}
  post_training:
  - provider_id: torchtune-cpu
--- a/llama_stack/distributions/dell/run-with-safety.yaml
+++ b/llama_stack/distributions/dell/run-with-safety.yaml
@ -50,8 +50,7 @@ providers:
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/trace_store.db
+      sinks: ${env.TELEMETRY_SINKS:=}
      otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}
  eval:
  - provider_id: meta-reference
--- a/llama_stack/distributions/dell/run.yaml
+++ b/llama_stack/distributions/dell/run.yaml
@ -46,8 +46,7 @@ providers:
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/trace_store.db
+      sinks: ${env.TELEMETRY_SINKS:=}
      otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}
  eval:
  - provider_id: meta-reference
--- a/llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml
+++ b/llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml
@ -61,8 +61,7 @@ providers:
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/meta-reference-gpu}/trace_store.db
+      sinks: ${env.TELEMETRY_SINKS:=}
      otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}
  eval:
  - provider_id: meta-reference
--- a/llama_stack/distributions/meta-reference-gpu/run.yaml
+++ b/llama_stack/distributions/meta-reference-gpu/run.yaml
@ -51,8 +51,7 @@ providers:
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/meta-reference-gpu}/trace_store.db
+      sinks: ${env.TELEMETRY_SINKS:=}
      otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}
  eval:
  - provider_id: meta-reference
--- a/llama_stack/distributions/nvidia/run-with-safety.yaml
+++ b/llama_stack/distributions/nvidia/run-with-safety.yaml
@ -53,8 +53,7 @@ providers:
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/trace_store.db
+      sinks: ${env.TELEMETRY_SINKS:=}
      otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}
  eval:
  - provider_id: nvidia
--- a/llama_stack/distributions/nvidia/run.yaml
+++ b/llama_stack/distributions/nvidia/run.yaml
@ -48,8 +48,7 @@ providers:
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/trace_store.db
+      sinks: ${env.TELEMETRY_SINKS:=}
      otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}
  eval:
  - provider_id: nvidia
--- a/llama_stack/distributions/open-benchmark/run.yaml
+++ b/llama_stack/distributions/open-benchmark/run.yaml
@ -81,8 +81,7 @@ providers:
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/open-benchmark}/trace_store.db
+      sinks: ${env.TELEMETRY_SINKS:=}
      otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}
  eval:
  - provider_id: meta-reference
--- a/llama_stack/distributions/starter-gpu/run.yaml
+++ b/llama_stack/distributions/starter-gpu/run.yaml
@ -159,8 +159,7 @@ providers:
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/trace_store.db
+      sinks: ${env.TELEMETRY_SINKS:=}
      otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}
  post_training:
  - provider_id: huggingface-gpu
--- a/llama_stack/distributions/starter/run.yaml
+++ b/llama_stack/distributions/starter/run.yaml
@ -159,8 +159,7 @@ providers:
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/trace_store.db
+      sinks: ${env.TELEMETRY_SINKS:=}
      otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}
  post_training:
  - provider_id: torchtune-cpu
--- a/llama_stack/distributions/watsonx/run.yaml
+++ b/llama_stack/distributions/watsonx/run.yaml
@ -46,8 +46,7 @@ providers:
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/watsonx}/trace_store.db
+      sinks: ${env.TELEMETRY_SINKS:=}
      otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}
  eval:
  - provider_id: meta-reference
--- a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
+++ b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
@ -44,8 +44,11 @@ from llama_stack.apis.agents.openai_responses import (
    OpenAIResponseObjectStreamResponseRefusalDone,
    OpenAIResponseOutput,
    OpenAIResponseOutputMessageContentOutputText,
+    OpenAIResponseOutputMessageFileSearchToolCall,
    OpenAIResponseOutputMessageFunctionToolCall,
+    OpenAIResponseOutputMessageMCPCall,
    OpenAIResponseOutputMessageMCPListTools,
+    OpenAIResponseOutputMessageWebSearchToolCall,
    OpenAIResponseText,
    OpenAIResponseUsage,
    OpenAIResponseUsageInputTokensDetails,
@ -177,6 +180,7 @@ class StreamingResponseOrchestrator:
                # (some providers don't support non-empty response_format when tools are present)
                response_format = None if self.ctx.response_format.type == "text" else self.ctx.response_format
                logger.debug(f"calling openai_chat_completion with tools: {self.ctx.chat_tools}")
+
                params = OpenAIChatCompletionRequestWithExtraBody(
                    model=self.ctx.model,
                    messages=messages,
@ -613,19 +617,22 @@ class StreamingResponseOrchestrator:

                            # Emit output_item.added event for the new function call
                            self.sequence_number += 1
-                            function_call_item = OpenAIResponseOutputMessageFunctionToolCall(
-                                arguments="",  # Will be filled incrementally via delta events
-                                call_id=tool_call.id or "",
-                                name=tool_call.function.name if tool_call.function else "",
-                                id=tool_call_item_id,
-                                status="in_progress",
-                            )
-                            yield OpenAIResponseObjectStreamResponseOutputItemAdded(
-                                response_id=self.response_id,
-                                item=function_call_item,
-                                output_index=len(output_messages),
-                                sequence_number=self.sequence_number,
-                            )
+                            is_mcp_tool = tool_call.function.name and tool_call.function.name in self.mcp_tool_to_server
+                            if not is_mcp_tool and tool_call.function.name not in ["web_search", "knowledge_search"]:
+                                # for MCP tools (and even other non-function tools) we emit an output message item later
+                                function_call_item = OpenAIResponseOutputMessageFunctionToolCall(
+                                    arguments="",  # Will be filled incrementally via delta events
+                                    call_id=tool_call.id or "",
+                                    name=tool_call.function.name if tool_call.function else "",
+                                    id=tool_call_item_id,
+                                    status="in_progress",
+                                )
+                                yield OpenAIResponseObjectStreamResponseOutputItemAdded(
+                                    response_id=self.response_id,
+                                    item=function_call_item,
+                                    output_index=len(output_messages),
+                                    sequence_number=self.sequence_number,
+                                )

                        # Stream tool call arguments as they arrive (differentiate between MCP and function calls)
                        if tool_call.function and tool_call.function.arguments:
@ -806,6 +813,35 @@ class StreamingResponseOrchestrator:
            if not matching_item_id:
                matching_item_id = f"tc_{uuid.uuid4()}"

+            self.sequence_number += 1
+            if tool_call.function.name and tool_call.function.name in self.mcp_tool_to_server:
+                item = OpenAIResponseOutputMessageMCPCall(
+                    arguments="",
+                    name=tool_call.function.name,
+                    id=matching_item_id,
+                    server_label=self.mcp_tool_to_server[tool_call.function.name].server_label,
+                    status="in_progress",
+                )
+            elif tool_call.function.name == "web_search":
+                item = OpenAIResponseOutputMessageWebSearchToolCall(
+                    id=matching_item_id,
+                    status="in_progress",
+                )
+            elif tool_call.function.name == "knowledge_search":
+                item = OpenAIResponseOutputMessageFileSearchToolCall(
+                    id=matching_item_id,
+                    status="in_progress",
+                )
+            else:
+                raise ValueError(f"Unsupported tool call: {tool_call.function.name}")
+
+            yield OpenAIResponseObjectStreamResponseOutputItemAdded(
+                response_id=self.response_id,
+                item=item,
+                output_index=len(output_messages),
+                sequence_number=self.sequence_number,
+            )
+
            # Execute tool call with streaming
            tool_call_log = None
            tool_response_message = None
@ -1064,7 +1100,11 @@ class StreamingResponseOrchestrator:
        self.sequence_number += 1
        yield OpenAIResponseObjectStreamResponseOutputItemAdded(
            response_id=self.response_id,
-            item=mcp_list_message,
+            item=OpenAIResponseOutputMessageMCPListTools(
+                id=mcp_list_message.id,
+                server_label=mcp_list_message.server_label,
+                tools=[],
+            ),
            output_index=len(output_messages) - 1,
            sequence_number=self.sequence_number,
        )
--- a/llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py
+++ b/llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py
@ -93,7 +93,7 @@ class ToolExecutor:

        # Build result messages from tool execution
        output_message, input_message = await self._build_result_messages(
-            function, tool_call_id, tool_kwargs, ctx, error_exc, result, has_error, mcp_tool_to_server
+            function, tool_call_id, item_id, tool_kwargs, ctx, error_exc, result, has_error, mcp_tool_to_server
        )

        # Yield the final result
@ -356,6 +356,7 @@ class ToolExecutor:
        self,
        function,
        tool_call_id: str,
+        item_id: str,
        tool_kwargs: dict,
        ctx: ChatCompletionContext,
        error_exc: Exception | None,
@ -375,7 +376,7 @@ class ToolExecutor:
            )

            message = OpenAIResponseOutputMessageMCPCall(
-                id=tool_call_id,
+                id=item_id,
                arguments=function.arguments,
                name=function.name,
                server_label=mcp_tool_to_server[function.name].server_label,
@ -389,14 +390,14 @@ class ToolExecutor:
        else:
            if function.name == "web_search":
                message = OpenAIResponseOutputMessageWebSearchToolCall(
-                    id=tool_call_id,
+                    id=item_id,
                    status="completed",
                )
                if has_error:
                    message.status = "failed"
            elif function.name == "knowledge_search":
                message = OpenAIResponseOutputMessageFileSearchToolCall(
-                    id=tool_call_id,
+                    id=item_id,
                    queries=[tool_kwargs.get("query", "")],
                    status="completed",
                )
--- a/llama_stack/providers/inline/telemetry/meta_reference/config.py
+++ b/llama_stack/providers/inline/telemetry/meta_reference/config.py
@ -9,13 +9,10 @@ from typing import Any

 from pydantic import BaseModel, Field, field_validator

-from llama_stack.core.utils.config_dirs import RUNTIME_BASE_DIR
-

 class TelemetrySink(StrEnum):
    OTEL_TRACE = "otel_trace"
    OTEL_METRIC = "otel_metric"
-    SQLITE = "sqlite"
    CONSOLE = "console"


@ -30,12 +27,8 @@ class TelemetryConfig(BaseModel):
        description="The service name to use for telemetry",
    )
    sinks: list[TelemetrySink] = Field(
-        default=[TelemetrySink.SQLITE],
-        description="List of telemetry sinks to enable (possible values: otel_trace, otel_metric, sqlite, console)",
-    )
-    sqlite_db_path: str = Field(
-        default_factory=lambda: (RUNTIME_BASE_DIR / "trace_store.db").as_posix(),
-        description="The path to the SQLite database to use for storing traces",
+        default_factory=list,
+        description="List of telemetry sinks to enable (possible values: otel_trace, otel_metric, console)",
    )

    @field_validator("sinks", mode="before")
@ -43,13 +36,12 @@ class TelemetryConfig(BaseModel):
    def validate_sinks(cls, v):
        if isinstance(v, str):
            return [TelemetrySink(sink.strip()) for sink in v.split(",")]
-        return v
+        return v or []

    @classmethod
-    def sample_run_config(cls, __distro_dir__: str, db_name: str = "trace_store.db") -> dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str) -> dict[str, Any]:
        return {
            "service_name": "${env.OTEL_SERVICE_NAME:=\u200b}",
-            "sinks": "${env.TELEMETRY_SINKS:=sqlite}",
-            "sqlite_db_path": "${env.SQLITE_STORE_DIR:=" + __distro_dir__ + "}/" + db_name,
+            "sinks": "${env.TELEMETRY_SINKS:=}",
            "otel_exporter_otlp_endpoint": "${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}",
        }
--- a/llama_stack/providers/inline/telemetry/meta_reference/sqlite_span_processor.py
+++ b/llama_stack/providers/inline/telemetry/meta_reference/sqlite_span_processor.py
@ -1,190 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import json
-import os
-import sqlite3
-import threading
-from datetime import UTC, datetime
-
-from opentelemetry.sdk.trace import SpanProcessor
-from opentelemetry.trace import Span
-from opentelemetry.trace.span import format_span_id, format_trace_id
-
-from llama_stack.providers.utils.telemetry.tracing import LOCAL_ROOT_SPAN_MARKER
-
-
-class SQLiteSpanProcessor(SpanProcessor):
-    def __init__(self, conn_string):
-        """Initialize the SQLite span processor with a connection string."""
-        self.conn_string = conn_string
-        self._local = threading.local()  # Thread-local storage for connections
-        self.setup_database()
-
-    def _get_connection(self):
-        """Get a thread-local database connection."""
-        if not hasattr(self._local, "conn"):
-            try:
-                self._local.conn = sqlite3.connect(self.conn_string)
-            except Exception as e:
-                print(f"Error connecting to SQLite database: {e}")
-                raise
-        return self._local.conn
-
-    def setup_database(self):
-        """Create the necessary tables if they don't exist."""
-        # Create directory if it doesn't exist
-        os.makedirs(os.path.dirname(self.conn_string), exist_ok=True)
-
-        conn = self._get_connection()
-        cursor = conn.cursor()
-
-        cursor.execute(
-            """
-            CREATE TABLE IF NOT EXISTS traces (
-                trace_id TEXT PRIMARY KEY,
-                service_name TEXT,
-                root_span_id TEXT,
-                start_time TIMESTAMP,
-                end_time TIMESTAMP,
-                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
-            )
-        """
-        )
-
-        cursor.execute(
-            """
-            CREATE TABLE IF NOT EXISTS spans (
-                span_id TEXT PRIMARY KEY,
-                trace_id TEXT REFERENCES traces(trace_id),
-                parent_span_id TEXT,
-                name TEXT,
-                start_time TIMESTAMP,
-                end_time TIMESTAMP,
-                attributes TEXT,
-                status TEXT,
-                kind TEXT
-            )
-        """
-        )
-
-        cursor.execute(
-            """
-            CREATE TABLE IF NOT EXISTS span_events (
-                id INTEGER PRIMARY KEY AUTOINCREMENT,
-                span_id TEXT REFERENCES spans(span_id),
-                name TEXT,
-                timestamp TIMESTAMP,
-                attributes TEXT
-            )
-        """
-        )
-
-        cursor.execute(
-            """
-            CREATE INDEX IF NOT EXISTS idx_traces_created_at
-            ON traces(created_at)
-        """
-        )
-
-        conn.commit()
-        cursor.close()
-
-    def on_start(self, span: Span, parent_context=None):
-        """Called when a span starts."""
-        pass
-
-    def on_end(self, span: Span):
-        """Called when a span ends. Export the span data to SQLite."""
-        try:
-            conn = self._get_connection()
-            cursor = conn.cursor()
-
-            trace_id = format_trace_id(span.get_span_context().trace_id)
-            span_id = format_span_id(span.get_span_context().span_id)
-            service_name = span.resource.attributes.get("service.name", "unknown")
-
-            parent_span_id = None
-            parent_context = span.parent
-            if parent_context:
-                parent_span_id = format_span_id(parent_context.span_id)
-
-            # Insert into traces
-            cursor.execute(
-                """
-                INSERT INTO traces (
-                    trace_id, service_name, root_span_id, start_time, end_time
-                ) VALUES (?, ?, ?, ?, ?)
-                ON CONFLICT(trace_id) DO UPDATE SET
-                    root_span_id = COALESCE(root_span_id, excluded.root_span_id),
-                    start_time = MIN(excluded.start_time, start_time),
-                    end_time = MAX(excluded.end_time, end_time)
-            """,
-                (
-                    trace_id,
-                    service_name,
-                    (span_id if span.attributes.get(LOCAL_ROOT_SPAN_MARKER) else None),
-                    datetime.fromtimestamp(span.start_time / 1e9, UTC).isoformat(),
-                    datetime.fromtimestamp(span.end_time / 1e9, UTC).isoformat(),
-                ),
-            )
-
-            # Insert into spans
-            cursor.execute(
-                """
-                INSERT INTO spans (
-                    span_id, trace_id, parent_span_id, name,
-                    start_time, end_time, attributes, status,
-                    kind
-                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
-            """,
-                (
-                    span_id,
-                    trace_id,
-                    parent_span_id,
-                    span.name,
-                    datetime.fromtimestamp(span.start_time / 1e9, UTC).isoformat(),
-                    datetime.fromtimestamp(span.end_time / 1e9, UTC).isoformat(),
-                    json.dumps(dict(span.attributes)),
-                    span.status.status_code.name,
-                    span.kind.name,
-                ),
-            )
-
-            for event in span.events:
-                cursor.execute(
-                    """
-                    INSERT INTO span_events (
-                        span_id, name, timestamp, attributes
-                    ) VALUES (?, ?, ?, ?)
-                """,
-                    (
-                        span_id,
-                        event.name,
-                        datetime.fromtimestamp(event.timestamp / 1e9, UTC).isoformat(),
-                        json.dumps(dict(event.attributes)),
-                    ),
-                )
-
-            conn.commit()
-            cursor.close()
-        except Exception as e:
-            print(f"Error exporting span to SQLite: {e}")
-
-    def shutdown(self):
-        """Cleanup any resources."""
-        # We can't access other threads' connections, so we just close our own
-        if hasattr(self._local, "conn"):
-            try:
-                self._local.conn.close()
-            except Exception as e:
-                print(f"Error closing SQLite connection: {e}")
-            finally:
-                del self._local.conn
-
-    def force_flush(self, timeout_millis=30000):
-        """Force export of spans."""
-        pass
--- a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py
+++ b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py
@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-import datetime
 import threading
 from typing import Any

@ -22,19 +21,11 @@ from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapProp
 from llama_stack.apis.telemetry import (
    Event,
    MetricEvent,
-    MetricLabelMatcher,
-    MetricQueryType,
-    QueryCondition,
-    QueryMetricsResponse,
-    QuerySpanTreeResponse,
-    QueryTracesResponse,
-    Span,
    SpanEndPayload,
    SpanStartPayload,
    SpanStatus,
    StructuredLogEvent,
    Telemetry,
-    Trace,
    UnstructuredLogEvent,
 )
 from llama_stack.core.datatypes import Api
@ -42,11 +33,6 @@ from llama_stack.log import get_logger
 from llama_stack.providers.inline.telemetry.meta_reference.console_span_processor import (
    ConsoleSpanProcessor,
 )
-from llama_stack.providers.inline.telemetry.meta_reference.sqlite_span_processor import (
-    SQLiteSpanProcessor,
-)
-from llama_stack.providers.utils.telemetry.dataset_mixin import TelemetryDatasetMixin
-from llama_stack.providers.utils.telemetry.sqlite_trace_store import SQLiteTraceStore
 from llama_stack.providers.utils.telemetry.tracing import ROOT_SPAN_MARKERS

 from .config import TelemetryConfig, TelemetrySink
@ -68,7 +54,7 @@ def is_tracing_enabled(tracer):
        return span.is_recording()


-class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
+class TelemetryAdapter(Telemetry):
    def __init__(self, config: TelemetryConfig, deps: dict[Api, Any]) -> None:
        self.config = config
        self.datasetio_api = deps.get(Api.datasetio)
@ -111,15 +97,11 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
                    metric_provider = MeterProvider(resource=resource, metric_readers=[metric_reader])
                    metrics.set_meter_provider(metric_provider)

-            if TelemetrySink.SQLITE in self.config.sinks:
-                trace.get_tracer_provider().add_span_processor(SQLiteSpanProcessor(self.config.sqlite_db_path))
            if TelemetrySink.CONSOLE in self.config.sinks:
                trace.get_tracer_provider().add_span_processor(ConsoleSpanProcessor(print_attributes=True))

        if TelemetrySink.OTEL_METRIC in self.config.sinks:
            self.meter = metrics.get_meter(__name__)
-        if TelemetrySink.SQLITE in self.config.sinks:
-            self.trace_store = SQLiteTraceStore(self.config.sqlite_db_path)

        self._lock = _global_lock

@ -139,47 +121,6 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
        else:
            raise ValueError(f"Unknown event type: {event}")

-    async def query_metrics(
-        self,
-        metric_name: str,
-        start_time: int,
-        end_time: int | None = None,
-        granularity: str | None = None,
-        query_type: MetricQueryType = MetricQueryType.RANGE,
-        label_matchers: list[MetricLabelMatcher] | None = None,
-    ) -> QueryMetricsResponse:
-        """Query metrics from the telemetry store.
-
-        Args:
-            metric_name: The name of the metric to query (e.g., "prompt_tokens")
-            start_time: Start time as Unix timestamp
-            end_time: End time as Unix timestamp (defaults to now if None)
-            granularity: Time granularity for aggregation
-            query_type: Type of query (RANGE or INSTANT)
-            label_matchers: Label filters to apply
-
-        Returns:
-            QueryMetricsResponse with metric time series data
-        """
-        # Convert timestamps to datetime objects
-        start_dt = datetime.datetime.fromtimestamp(start_time, datetime.UTC)
-        end_dt = datetime.datetime.fromtimestamp(end_time, datetime.UTC) if end_time else None
-
-        # Use SQLite trace store if available
-        if hasattr(self, "trace_store") and self.trace_store:
-            return await self.trace_store.query_metrics(
-                metric_name=metric_name,
-                start_time=start_dt,
-                end_time=end_dt,
-                granularity=granularity,
-                query_type=query_type,
-                label_matchers=label_matchers,
-            )
-        else:
-            raise ValueError(
-                f"In order to query_metrics, you must have {TelemetrySink.SQLITE} set in your telemetry sinks"
-            )
-
    def _log_unstructured(self, event: UnstructuredLogEvent, ttl_seconds: int) -> None:
        with self._lock:
            # Use global storage instead of instance storage
@ -326,39 +267,3 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
                    _GLOBAL_STORAGE["active_spans"].pop(span_id, None)
            else:
                raise ValueError(f"Unknown structured log event: {event}")
-
-    async def query_traces(
-        self,
-        attribute_filters: list[QueryCondition] | None = None,
-        limit: int | None = 100,
-        offset: int | None = 0,
-        order_by: list[str] | None = None,
-    ) -> QueryTracesResponse:
-        return QueryTracesResponse(
-            data=await self.trace_store.query_traces(
-                attribute_filters=attribute_filters,
-                limit=limit,
-                offset=offset,
-                order_by=order_by,
-            )
-        )
-
-    async def get_trace(self, trace_id: str) -> Trace:
-        return await self.trace_store.get_trace(trace_id)
-
-    async def get_span(self, trace_id: str, span_id: str) -> Span:
-        return await self.trace_store.get_span(trace_id, span_id)
-
-    async def get_span_tree(
-        self,
-        span_id: str,
-        attributes_to_return: list[str] | None = None,
-        max_depth: int | None = None,
-    ) -> QuerySpanTreeResponse:
-        return QuerySpanTreeResponse(
-            data=await self.trace_store.get_span_tree(
-                span_id=span_id,
-                attributes_to_return=attributes_to_return,
-                max_depth=max_depth,
-            )
-        )
--- a/llama_stack/providers/inline/tool_runtime/rag/memory.py
+++ b/llama_stack/providers/inline/tool_runtime/rag/memory.py
@ -272,7 +272,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
        return RAGQueryResult(
            content=picked,
            metadata={
-                "document_ids": [c.metadata["document_id"] for c in chunks[: len(picked)]],
+                "document_ids": [c.document_id for c in chunks[: len(picked)]],
                "chunks": [c.content for c in chunks[: len(picked)]],
                "scores": scores[: len(picked)],
                "vector_db_ids": [c.metadata["vector_db_id"] for c in chunks[: len(picked)]],
--- a/llama_stack/providers/utils/telemetry/dataset_mixin.py
+++ b/llama_stack/providers/utils/telemetry/dataset_mixin.py
@ -1,80 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-from llama_stack.apis.datasetio import DatasetIO
-from llama_stack.apis.telemetry import QueryCondition, QuerySpansResponse, Span
-
-
-class TelemetryDatasetMixin:
-    """Mixin class that provides dataset-related functionality for telemetry providers."""
-
-    datasetio_api: DatasetIO | None
-
-    async def save_spans_to_dataset(
-        self,
-        attribute_filters: list[QueryCondition],
-        attributes_to_save: list[str],
-        dataset_id: str,
-        max_depth: int | None = None,
-    ) -> None:
-        if self.datasetio_api is None:
-            raise RuntimeError("DatasetIO API not available")
-
-        spans = await self.query_spans(
-            attribute_filters=attribute_filters,
-            attributes_to_return=attributes_to_save,
-            max_depth=max_depth,
-        )
-
-        rows = [
-            {
-                "trace_id": span.trace_id,
-                "span_id": span.span_id,
-                "parent_span_id": span.parent_span_id,
-                "name": span.name,
-                "start_time": span.start_time,
-                "end_time": span.end_time,
-                **{attr: span.attributes.get(attr) for attr in attributes_to_save},
-            }
-            for span in spans
-        ]
-
-        await self.datasetio_api.append_rows(dataset_id=dataset_id, rows=rows)
-
-    async def query_spans(
-        self,
-        attribute_filters: list[QueryCondition],
-        attributes_to_return: list[str],
-        max_depth: int | None = None,
-    ) -> QuerySpansResponse:
-        traces = await self.query_traces(attribute_filters=attribute_filters)
-        spans = []
-
-        for trace in traces.data:
-            spans_by_id_resp = await self.get_span_tree(
-                span_id=trace.root_span_id,
-                attributes_to_return=attributes_to_return,
-                max_depth=max_depth,
-            )
-
-            for span in spans_by_id_resp.data.values():
-                if span.attributes and all(
-                    attr in span.attributes and span.attributes[attr] is not None for attr in attributes_to_return
-                ):
-                    spans.append(
-                        Span(
-                            trace_id=trace.root_span_id,
-                            span_id=span.span_id,
-                            parent_span_id=span.parent_span_id,
-                            name=span.name,
-                            start_time=span.start_time,
-                            end_time=span.end_time,
-                            attributes=span.attributes,
-                        )
-                    )
-
-        return QuerySpansResponse(data=spans)
--- a/llama_stack/providers/utils/telemetry/sqlite_trace_store.py
+++ b/llama_stack/providers/utils/telemetry/sqlite_trace_store.py
@ -1,383 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import json
-from datetime import UTC, datetime
-from typing import Protocol
-
-import aiosqlite
-
-from llama_stack.apis.telemetry import (
-    MetricDataPoint,
-    MetricLabel,
-    MetricLabelMatcher,
-    MetricQueryType,
-    MetricSeries,
-    QueryCondition,
-    QueryMetricsResponse,
-    Span,
-    SpanWithStatus,
-    Trace,
-)
-
-
-class TraceStore(Protocol):
-    async def query_traces(
-        self,
-        attribute_filters: list[QueryCondition] | None = None,
-        limit: int | None = 100,
-        offset: int | None = 0,
-        order_by: list[str] | None = None,
-    ) -> list[Trace]: ...
-
-    async def get_span_tree(
-        self,
-        span_id: str,
-        attributes_to_return: list[str] | None = None,
-        max_depth: int | None = None,
-    ) -> dict[str, SpanWithStatus]: ...
-
-    async def query_metrics(
-        self,
-        metric_name: str,
-        start_time: datetime,
-        end_time: datetime | None = None,
-        granularity: str | None = "1d",
-        query_type: MetricQueryType = MetricQueryType.RANGE,
-        label_matchers: list[MetricLabelMatcher] | None = None,
-    ) -> QueryMetricsResponse: ...
-
-
-class SQLiteTraceStore(TraceStore):
-    def __init__(self, conn_string: str):
-        self.conn_string = conn_string
-
-    async def query_metrics(
-        self,
-        metric_name: str,
-        start_time: datetime,
-        end_time: datetime | None = None,
-        granularity: str | None = None,
-        query_type: MetricQueryType = MetricQueryType.RANGE,
-        label_matchers: list[MetricLabelMatcher] | None = None,
-    ) -> QueryMetricsResponse:
-        if end_time is None:
-            end_time = datetime.now(UTC)
-
-        # Build base query
-        if query_type == MetricQueryType.INSTANT:
-            query = """
-                SELECT
-                    se.name,
-                    SUM(CAST(json_extract(se.attributes, '$.value') AS REAL)) as value,
-                    json_extract(se.attributes, '$.unit') as unit,
-                    se.attributes
-                FROM span_events se
-                WHERE se.name = ?
-                AND se.timestamp BETWEEN ? AND ?
-            """
-        else:
-            if granularity:
-                time_format = self._get_time_format_for_granularity(granularity)
-                query = f"""
-                    SELECT
-                        se.name,
-                        SUM(CAST(json_extract(se.attributes, '$.value') AS REAL)) as value,
-                        json_extract(se.attributes, '$.unit') as unit,
-                        se.attributes,
-                        strftime('{time_format}', se.timestamp) as bucket_start
-                    FROM span_events se
-                    WHERE se.name = ?
-                    AND se.timestamp BETWEEN ? AND ?
-                """
-            else:
-                query = """
-                    SELECT
-                        se.name,
-                        json_extract(se.attributes, '$.value') as value,
-                        json_extract(se.attributes, '$.unit') as unit,
-                        se.attributes,
-                        se.timestamp
-                    FROM span_events se
-                    WHERE se.name = ?
-                    AND se.timestamp BETWEEN ? AND ?
-                """
-
-        params = [f"metric.{metric_name}", start_time.isoformat(), end_time.isoformat()]
-
-        # Labels that will be attached to the MetricSeries (preserve matcher labels)
-        all_labels: list[MetricLabel] = []
-        matcher_label_names = set()
-        if label_matchers:
-            for matcher in label_matchers:
-                json_path = f"$.{matcher.name}"
-                if matcher.operator == "=":
-                    query += f" AND json_extract(se.attributes, '{json_path}') = ?"
-                    params.append(matcher.value)
-                elif matcher.operator == "!=":
-                    query += f" AND json_extract(se.attributes, '{json_path}') != ?"
-                    params.append(matcher.value)
-                elif matcher.operator == "=~":
-                    query += f" AND json_extract(se.attributes, '{json_path}') LIKE ?"
-                    params.append(f"%{matcher.value}%")
-                elif matcher.operator == "!~":
-                    query += f" AND json_extract(se.attributes, '{json_path}') NOT LIKE ?"
-                    params.append(f"%{matcher.value}%")
-                # Preserve filter context in output
-                all_labels.append(MetricLabel(name=matcher.name, value=str(matcher.value)))
-                matcher_label_names.add(matcher.name)
-
-        # GROUP BY / ORDER BY logic
-        if query_type == MetricQueryType.RANGE and granularity:
-            group_time_format = self._get_time_format_for_granularity(granularity)
-            query += f" GROUP BY strftime('{group_time_format}', se.timestamp), json_extract(se.attributes, '$.unit')"
-            query += " ORDER BY bucket_start"
-        elif query_type == MetricQueryType.INSTANT:
-            query += " GROUP BY json_extract(se.attributes, '$.unit')"
-        else:
-            query += " ORDER BY se.timestamp"
-
-        # Execute query
-        async with aiosqlite.connect(self.conn_string) as conn:
-            conn.row_factory = aiosqlite.Row
-            async with conn.execute(query, params) as cursor:
-                rows = await cursor.fetchall()
-
-                if not rows:
-                    return QueryMetricsResponse(data=[])
-
-                data_points = []
-                # We want to add attribute labels, but only those not already present as matcher labels.
-                attr_label_names = set()
-                for row in rows:
-                    # Parse JSON attributes safely, if there are no attributes (weird), just don't add the labels to the result.
-                    try:
-                        attributes = json.loads(row["attributes"] or "{}")
-                    except (TypeError, json.JSONDecodeError):
-                        attributes = {}
-
-                    value = row["value"]
-                    unit = row["unit"] or ""
-
-                    # Add labels from attributes without duplicating matcher labels, if we don't do this, there will be a lot of duplicate label in the result.
-                    for k, v in attributes.items():
-                        if k not in ["value", "unit"] and k not in matcher_label_names and k not in attr_label_names:
-                            all_labels.append(MetricLabel(name=k, value=str(v)))
-                            attr_label_names.add(k)
-
-                    # Determine timestamp
-                    if query_type == MetricQueryType.RANGE and granularity:
-                        try:
-                            bucket_start_raw = row["bucket_start"]
-                        except KeyError as e:
-                            raise ValueError(
-                                "DB did not have a bucket_start time in row when using granularity, this indicates improper formatting"
-                            ) from e
-                        # this value could also be there, but be NULL, I think.
-                        if bucket_start_raw is None:
-                            raise ValueError("bucket_start is None check time format and data")
-                        bucket_start = datetime.fromisoformat(bucket_start_raw)
-                        timestamp = int(bucket_start.timestamp())
-                    elif query_type == MetricQueryType.INSTANT:
-                        timestamp = int(datetime.now(UTC).timestamp())
-                    else:
-                        try:
-                            timestamp_raw = row["timestamp"]
-                        except KeyError as e:
-                            raise ValueError(
-                                "DB did not have a timestamp in row, this indicates improper formatting"
-                            ) from e
-                        # this value could also be there, but be NULL, I think.
-                        if timestamp_raw is None:
-                            raise ValueError("timestamp is None check time format and data")
-                        timestamp_iso = datetime.fromisoformat(timestamp_raw)
-                        timestamp = int(timestamp_iso.timestamp())
-
-                    data_points.append(
-                        MetricDataPoint(
-                            timestamp=timestamp,
-                            value=value,
-                            unit=unit,
-                        )
-                    )
-
-                metric_series = [MetricSeries(metric=metric_name, labels=all_labels, values=data_points)]
-                return QueryMetricsResponse(data=metric_series)
-
-    def _get_time_format_for_granularity(self, granularity: str | None) -> str:
-        """Get the SQLite strftime format string for a given granularity.
-        Args:
-            granularity: Granularity string (e.g., "1m", "5m", "1h", "1d")
-        Returns:
-            SQLite strftime format string for the granularity
-        """
-        if granularity is None:
-            raise ValueError("granularity cannot be None for this method - use separate logic for no aggregation")
-
-        if granularity.endswith("d"):
-            return "%Y-%m-%d 00:00:00"
-        elif granularity.endswith("h"):
-            return "%Y-%m-%d %H:00:00"
-        elif granularity.endswith("m"):
-            return "%Y-%m-%d %H:%M:00"
-        else:
-            return "%Y-%m-%d %H:%M:00"  # Default to most granular which will give us the most timestamps.
-
-    async def query_traces(
-        self,
-        attribute_filters: list[QueryCondition] | None = None,
-        limit: int | None = 100,
-        offset: int | None = 0,
-        order_by: list[str] | None = None,
-    ) -> list[Trace]:
-        def build_where_clause() -> tuple[str, list]:
-            if not attribute_filters:
-                return "", []
-
-            ops_map = {"eq": "=", "ne": "!=", "gt": ">", "lt": "<"}
-
-            conditions = [
-                f"json_extract(s.attributes, '$.{condition.key}') {ops_map[condition.op.value]} ?"
-                for condition in attribute_filters
-            ]
-            params = [condition.value for condition in attribute_filters]
-            where_clause = " WHERE " + " AND ".join(conditions)
-            return where_clause, params
-
-        def build_order_clause() -> str:
-            if not order_by:
-                return ""
-
-            order_clauses = []
-            for field in order_by:
-                desc = field.startswith("-")
-                clean_field = field[1:] if desc else field
-                order_clauses.append(f"t.{clean_field} {'DESC' if desc else 'ASC'}")
-            return " ORDER BY " + ", ".join(order_clauses)
-
-        # Build the main query
-        base_query = """
-            WITH matching_traces AS (
-                SELECT DISTINCT t.trace_id
-                FROM traces t
-                JOIN spans s ON t.trace_id = s.trace_id
-                {where_clause}
-            ),
-            filtered_traces AS (
-                SELECT t.trace_id, t.root_span_id, t.start_time, t.end_time
-                FROM matching_traces mt
-                JOIN traces t ON mt.trace_id = t.trace_id
-                LEFT JOIN spans s ON t.trace_id = s.trace_id
-                {order_clause}
-            )
-            SELECT DISTINCT trace_id, root_span_id, start_time, end_time
-            FROM filtered_traces
-            WHERE root_span_id IS NOT NULL
-            LIMIT {limit} OFFSET {offset}
-        """
-
-        where_clause, params = build_where_clause()
-        query = base_query.format(
-            where_clause=where_clause,
-            order_clause=build_order_clause(),
-            limit=limit,
-            offset=offset,
-        )
-
-        # Execute query and return results
-        async with aiosqlite.connect(self.conn_string) as conn:
-            conn.row_factory = aiosqlite.Row
-            async with conn.execute(query, params) as cursor:
-                rows = await cursor.fetchall()
-                return [
-                    Trace(
-                        trace_id=row["trace_id"],
-                        root_span_id=row["root_span_id"],
-                        start_time=datetime.fromisoformat(row["start_time"]),
-                        end_time=datetime.fromisoformat(row["end_time"]),
-                    )
-                    for row in rows
-                ]
-
-    async def get_span_tree(
-        self,
-        span_id: str,
-        attributes_to_return: list[str] | None = None,
-        max_depth: int | None = None,
-    ) -> dict[str, SpanWithStatus]:
-        # Build the attributes selection
-        attributes_select = "s.attributes"
-        if attributes_to_return:
-            json_object = ", ".join(f"'{key}', json_extract(s.attributes, '$.{key}')" for key in attributes_to_return)
-            attributes_select = f"json_object({json_object})"
-
-        # SQLite CTE query with filtered attributes
-        query = f"""
-        WITH RECURSIVE span_tree AS (
-            SELECT s.*, 1 as depth, {attributes_select} as filtered_attributes
-            FROM spans s
-            WHERE s.span_id = ?
-
-            UNION ALL
-
-            SELECT s.*, st.depth + 1, {attributes_select} as filtered_attributes
-            FROM spans s
-            JOIN span_tree st ON s.parent_span_id = st.span_id
-            WHERE (? IS NULL OR st.depth < ?)
-        )
-        SELECT *
-        FROM span_tree
-        ORDER BY depth, start_time
-        """
-
-        spans_by_id = {}
-        async with aiosqlite.connect(self.conn_string) as conn:
-            conn.row_factory = aiosqlite.Row
-            async with conn.execute(query, (span_id, max_depth, max_depth)) as cursor:
-                rows = await cursor.fetchall()
-
-                if not rows:
-                    raise ValueError(f"Span {span_id} not found")
-
-                for row in rows:
-                    span = SpanWithStatus(
-                        span_id=row["span_id"],
-                        trace_id=row["trace_id"],
-                        parent_span_id=row["parent_span_id"],
-                        name=row["name"],
-                        start_time=datetime.fromisoformat(row["start_time"]),
-                        end_time=datetime.fromisoformat(row["end_time"]),
-                        attributes=json.loads(row["filtered_attributes"]),
-                        status=row["status"].lower(),
-                    )
-
-                    spans_by_id[span.span_id] = span
-
-                return spans_by_id
-
-    async def get_trace(self, trace_id: str) -> Trace:
-        query = """
-            SELECT *
-            FROM traces t
-            WHERE t.trace_id = ?
-        """
-        async with aiosqlite.connect(self.conn_string) as conn:
-            conn.row_factory = aiosqlite.Row
-            async with conn.execute(query, (trace_id,)) as cursor:
-                row = await cursor.fetchone()
-                if row is None:
-                    raise ValueError(f"Trace {trace_id} not found")
-                return Trace(**row)
-
-    async def get_span(self, trace_id: str, span_id: str) -> Span:
-        query = "SELECT * FROM spans WHERE trace_id = ? AND span_id = ?"
-        async with aiosqlite.connect(self.conn_string) as conn:
-            conn.row_factory = aiosqlite.Row
-            async with conn.execute(query, (trace_id, span_id)) as cursor:
-                row = await cursor.fetchone()
-                if row is None:
-                    raise ValueError(f"Span {span_id} not found")
-                return Span(**row)
--- a/tests/integration/common/recordings/models-64a2277c90f0f42576f60c1030e3a020403d34a95f56931b792d5939f4cebc57-6a62c598.json
+++ b/tests/integration/common/recordings/models-64a2277c90f0f42576f60c1030e3a020403d34a95f56931b792d5939f4cebc57-6a62c598.json
@ -0,0 +1,881 @@
+{
+  "test_id": null,
+  "request": {
+    "method": "POST",
+    "url": "https://api.openai.com/v1/v1/models",
+    "headers": {},
+    "body": {},
+    "endpoint": "/v1/models",
+    "model": ""
+  },
+  "response": {
+    "body": [
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4-0613",
+          "created": 1686588896,
+          "object": "model",
+          "owned_by": "openai"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4",
+          "created": 1687882411,
+          "object": "model",
+          "owned_by": "openai"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-3.5-turbo",
+          "created": 1677610602,
+          "object": "model",
+          "owned_by": "openai"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-5-search-api-2025-10-14",
+          "created": 1760043960,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-realtime-mini",
+          "created": 1759517133,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-realtime-mini-2025-10-06",
+          "created": 1759517175,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "sora-2",
+          "created": 1759708615,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "sora-2-pro",
+          "created": 1759708663,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "davinci-002",
+          "created": 1692634301,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "babbage-002",
+          "created": 1692634615,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-3.5-turbo-instruct",
+          "created": 1692901427,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-3.5-turbo-instruct-0914",
+          "created": 1694122472,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "dall-e-3",
+          "created": 1698785189,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "dall-e-2",
+          "created": 1698798177,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4-1106-preview",
+          "created": 1698957206,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-3.5-turbo-1106",
+          "created": 1698959748,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "tts-1-hd",
+          "created": 1699046015,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "tts-1-1106",
+          "created": 1699053241,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "tts-1-hd-1106",
+          "created": 1699053533,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "text-embedding-3-small",
+          "created": 1705948997,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "text-embedding-3-large",
+          "created": 1705953180,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4-0125-preview",
+          "created": 1706037612,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4-turbo-preview",
+          "created": 1706037777,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-3.5-turbo-0125",
+          "created": 1706048358,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4-turbo",
+          "created": 1712361441,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4-turbo-2024-04-09",
+          "created": 1712601677,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o",
+          "created": 1715367049,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-2024-05-13",
+          "created": 1715368132,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini-2024-07-18",
+          "created": 1721172717,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini",
+          "created": 1721172741,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-2024-08-06",
+          "created": 1722814719,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "chatgpt-4o-latest",
+          "created": 1723515131,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o1-mini-2024-09-12",
+          "created": 1725648979,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o1-mini",
+          "created": 1725649008,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-realtime-preview-2024-10-01",
+          "created": 1727131766,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-audio-preview-2024-10-01",
+          "created": 1727389042,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-audio-preview",
+          "created": 1727460443,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-realtime-preview",
+          "created": 1727659998,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "omni-moderation-latest",
+          "created": 1731689265,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "omni-moderation-2024-09-26",
+          "created": 1732734466,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-realtime-preview-2024-12-17",
+          "created": 1733945430,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-audio-preview-2024-12-17",
+          "created": 1734034239,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini-realtime-preview-2024-12-17",
+          "created": 1734112601,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini-audio-preview-2024-12-17",
+          "created": 1734115920,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o1-2024-12-17",
+          "created": 1734326976,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o1",
+          "created": 1734375816,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini-realtime-preview",
+          "created": 1734387380,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini-audio-preview",
+          "created": 1734387424,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o3-mini",
+          "created": 1737146383,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o3-mini-2025-01-31",
+          "created": 1738010200,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-2024-11-20",
+          "created": 1739331543,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-search-preview-2025-03-11",
+          "created": 1741388170,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-search-preview",
+          "created": 1741388720,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini-search-preview-2025-03-11",
+          "created": 1741390858,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini-search-preview",
+          "created": 1741391161,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-transcribe",
+          "created": 1742068463,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini-transcribe",
+          "created": 1742068596,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o1-pro-2025-03-19",
+          "created": 1742251504,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o1-pro",
+          "created": 1742251791,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini-tts",
+          "created": 1742403959,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o3-2025-04-16",
+          "created": 1744133301,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o4-mini-2025-04-16",
+          "created": 1744133506,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o3",
+          "created": 1744225308,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o4-mini",
+          "created": 1744225351,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4.1-2025-04-14",
+          "created": 1744315746,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4.1",
+          "created": 1744316542,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4.1-mini-2025-04-14",
+          "created": 1744317547,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4.1-mini",
+          "created": 1744318173,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4.1-nano-2025-04-14",
+          "created": 1744321025,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4.1-nano",
+          "created": 1744321707,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-image-1",
+          "created": 1745517030,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-realtime-preview-2025-06-03",
+          "created": 1748907838,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-audio-preview-2025-06-03",
+          "created": 1748908498,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-transcribe-diarize",
+          "created": 1750798887,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-5-chat-latest",
+          "created": 1754073306,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-5-2025-08-07",
+          "created": 1754075360,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-5",
+          "created": 1754425777,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-5-mini-2025-08-07",
+          "created": 1754425867,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-5-mini",
+          "created": 1754425928,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-5-nano-2025-08-07",
+          "created": 1754426303,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-5-nano",
+          "created": 1754426384,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-audio-2025-08-28",
+          "created": 1756256146,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-realtime",
+          "created": 1756271701,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-realtime-2025-08-28",
+          "created": 1756271773,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-audio",
+          "created": 1756339249,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-5-codex",
+          "created": 1757527818,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-image-1-mini",
+          "created": 1758845821,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-5-pro-2025-10-06",
+          "created": 1759469707,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-5-pro",
+          "created": 1759469822,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-audio-mini",
+          "created": 1759512027,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-audio-mini-2025-10-06",
+          "created": 1759512137,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-5-search-api",
+          "created": 1759514629,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-3.5-turbo-16k",
+          "created": 1683758102,
+          "object": "model",
+          "owned_by": "openai-internal"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "tts-1",
+          "created": 1681940951,
+          "object": "model",
+          "owned_by": "openai-internal"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "whisper-1",
+          "created": 1677532384,
+          "object": "model",
+          "owned_by": "openai-internal"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "text-embedding-ada-002",
+          "created": 1671217299,
+          "object": "model",
+          "owned_by": "openai-internal"
+        }
+      }
+    ],
+    "is_streaming": false
+  },
+  "id_normalization_mapping": {}
+}
--- a/tests/integration/tool_runtime/recordings/0b29d8fc2fee6f3086820af7b7db15b91bbde548bb9d9531a9e0c9cbbc2c66df.json
+++ b/tests/integration/tool_runtime/recordings/0b29d8fc2fee6f3086820af7b7db15b91bbde548bb9d9531a9e0c9cbbc2c66df.json
@ -0,0 +1,203 @@
+{
+  "test_id": "tests/integration/tool_runtime/test_mcp.py::test_mcp_invocation[txt=ollama/llama3.2:3b-instruct-fp16]",
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "llama3.2:3b-instruct-fp16",
+      "messages": [
+        {
+          "role": "system",
+          "content": "You are a helpful assistant."
+        },
+        {
+          "role": "user",
+          "content": [
+            {
+              "type": "text",
+              "text": "Say hi to the world. Use tools to do so."
+            }
+          ]
+        },
+        {
+          "role": "assistant",
+          "content": "",
+          "tool_calls": [
+            {
+              "index": 0,
+              "id": "call_c1tlwvxc",
+              "type": "function",
+              "function": {
+                "name": "greet_everyone",
+                "arguments": "{\"url\":\"world\"}"
+              }
+            }
+          ]
+        },
+        {
+          "role": "tool",
+          "tool_call_id": "call_c1tlwvxc",
+          "content": [
+            {
+              "type": "text",
+              "text": "Hello, world!"
+            }
+          ]
+        },
+        {
+          "role": "assistant",
+          "content": "I apologize, it seems like my previous response from the tool did not translate to a clear greeting.\n\nLet me try again using a different approach. I can use a text-to-speech conversion tool to say hello in various languages:\n\n\"Hello\"\n\n(or)\n\n\"Pong\"\n\n(in multiple languages such as Spanish: \"Hola\", French: \"Bonjour\", Chinese:, Japanese:, Korean:, or many others)"
+        },
+        {
+          "role": "user",
+          "content": [
+            {
+              "type": "text",
+              "text": "What is the boiling point of polyjuice? Use tools to answer."
+            }
+          ]
+        }
+      ],
+      "stream": true,
+      "stream_options": {
+        "include_usage": true
+      },
+      "tools": [
+        {
+          "type": "function",
+          "function": {
+            "name": "greet_everyone",
+            "parameters": {
+              "properties": {
+                "url": {
+                  "title": "Url",
+                  "type": "string"
+                }
+              },
+              "required": [
+                "url"
+              ],
+              "title": "greet_everyoneArguments",
+              "type": "object"
+            }
+          }
+        },
+        {
+          "type": "function",
+          "function": {
+            "name": "get_boiling_point",
+            "description": "\n        Returns the boiling point of a liquid in Celsius or Fahrenheit.\n\n        :param liquid_name: The name of the liquid\n        :param celsius: Whether to return the boiling point in Celsius\n        :return: The boiling point of the liquid in Celcius or Fahrenheit\n        ",
+            "parameters": {
+              "properties": {
+                "liquid_name": {
+                  "title": "Liquid Name",
+                  "type": "string"
+                },
+                "celsius": {
+                  "default": true,
+                  "title": "Celsius",
+                  "type": "boolean"
+                }
+              },
+              "required": [
+                "liquid_name"
+              ],
+              "title": "get_boiling_pointArguments",
+              "type": "object"
+            }
+          }
+        }
+      ]
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "llama3.2:3b-instruct-fp16"
+  },
+  "response": {
+    "body": [
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-0b29d8fc2fee",
+          "choices": [
+            {
+              "delta": {
+                "content": "",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": [
+                  {
+                    "index": 0,
+                    "id": "call_h8atvnz3",
+                    "function": {
+                      "arguments": "{\"celsius\":\"true\",\"liquid_name\":\"polyjuice\"}",
+                      "name": "get_boiling_point"
+                    },
+                    "type": "function"
+                  }
+                ]
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-0b29d8fc2fee",
+          "choices": [
+            {
+              "delta": {
+                "content": "",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": "tool_calls",
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-0b29d8fc2fee",
+          "choices": [],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": {
+            "completion_tokens": 29,
+            "prompt_tokens": 414,
+            "total_tokens": 443,
+            "completion_tokens_details": null,
+            "prompt_tokens_details": null
+          }
+        }
+      }
+    ],
+    "is_streaming": true
+  },
+  "id_normalization_mapping": {}
+}
--- a/tests/integration/tool_runtime/recordings/522bf95412d73568d38c20303aba38543e717c9b270584bbf8134708f0600869.json
+++ b/tests/integration/tool_runtime/recordings/522bf95412d73568d38c20303aba38543e717c9b270584bbf8134708f0600869.json
--- a/tests/integration/tool_runtime/recordings/8850db6a210daca8a8c12ffd70edc7eefa75ac231676f7ad584de5d75883cab9.json
+++ b/tests/integration/tool_runtime/recordings/8850db6a210daca8a8c12ffd70edc7eefa75ac231676f7ad584de5d75883cab9.json
--- a/tests/integration/tool_runtime/recordings/9ec84ac5a886b5652089fa72c24572b3f8045cfebea8f3ae8678b304288e5a62.json
+++ b/tests/integration/tool_runtime/recordings/9ec84ac5a886b5652089fa72c24572b3f8045cfebea8f3ae8678b304288e5a62.json
@ -0,0 +1,570 @@
+{
+  "test_id": "tests/integration/tool_runtime/test_mcp.py::test_mcp_invocation[txt=ollama/llama3.2:3b-instruct-fp16]",
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "llama3.2:3b-instruct-fp16",
+      "messages": [
+        {
+          "role": "system",
+          "content": "You are a helpful assistant."
+        },
+        {
+          "role": "user",
+          "content": [
+            {
+              "type": "text",
+              "text": "Say hi to the world. Use tools to do so."
+            }
+          ]
+        },
+        {
+          "role": "assistant",
+          "content": "",
+          "tool_calls": [
+            {
+              "index": 0,
+              "id": "call_81wznfre",
+              "type": "function",
+              "function": {
+                "name": "greet_everyone",
+                "arguments": "{\"url\":\"world\"}"
+              }
+            }
+          ]
+        },
+        {
+          "role": "tool",
+          "tool_call_id": "call_81wznfre",
+          "content": [
+            {
+              "type": "text",
+              "text": "Hello, world!"
+            }
+          ]
+        }
+      ],
+      "stream": true,
+      "stream_options": {
+        "include_usage": true
+      },
+      "tools": [
+        {
+          "type": "function",
+          "function": {
+            "name": "greet_everyone",
+            "parameters": {
+              "properties": {
+                "url": {
+                  "title": "Url",
+                  "type": "string"
+                }
+              },
+              "required": [
+                "url"
+              ],
+              "title": "greet_everyoneArguments",
+              "type": "object"
+            }
+          }
+        },
+        {
+          "type": "function",
+          "function": {
+            "name": "get_boiling_point",
+            "description": "\n        Returns the boiling point of a liquid in Celsius or Fahrenheit.\n\n        :param liquid_name: The name of the liquid\n        :param celsius: Whether to return the boiling point in Celsius\n        :return: The boiling point of the liquid in Celcius or Fahrenheit\n        ",
+            "parameters": {
+              "properties": {
+                "liquid_name": {
+                  "title": "Liquid Name",
+                  "type": "string"
+                },
+                "celsius": {
+                  "default": true,
+                  "title": "Celsius",
+                  "type": "boolean"
+                }
+              },
+              "required": [
+                "liquid_name"
+              ],
+              "title": "get_boiling_pointArguments",
+              "type": "object"
+            }
+          }
+        }
+      ]
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "llama3.2:3b-instruct-fp16"
+  },
+  "response": {
+    "body": [
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9ec84ac5a886",
+          "choices": [
+            {
+              "delta": {
+                "content": "<|python_tag|>",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9ec84ac5a886",
+          "choices": [
+            {
+              "delta": {
+                "content": "{\"",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9ec84ac5a886",
+          "choices": [
+            {
+              "delta": {
+                "content": "type",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9ec84ac5a886",
+          "choices": [
+            {
+              "delta": {
+                "content": "\":",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9ec84ac5a886",
+          "choices": [
+            {
+              "delta": {
+                "content": " \"",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9ec84ac5a886",
+          "choices": [
+            {
+              "delta": {
+                "content": "message",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9ec84ac5a886",
+          "choices": [
+            {
+              "delta": {
+                "content": "\",",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9ec84ac5a886",
+          "choices": [
+            {
+              "delta": {
+                "content": " \"",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9ec84ac5a886",
+          "choices": [
+            {
+              "delta": {
+                "content": "text",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9ec84ac5a886",
+          "choices": [
+            {
+              "delta": {
+                "content": "\":",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9ec84ac5a886",
+          "choices": [
+            {
+              "delta": {
+                "content": " \"",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9ec84ac5a886",
+          "choices": [
+            {
+              "delta": {
+                "content": "Hello",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9ec84ac5a886",
+          "choices": [
+            {
+              "delta": {
+                "content": ",",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9ec84ac5a886",
+          "choices": [
+            {
+              "delta": {
+                "content": " world",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9ec84ac5a886",
+          "choices": [
+            {
+              "delta": {
+                "content": "!\"",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9ec84ac5a886",
+          "choices": [
+            {
+              "delta": {
+                "content": "}",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9ec84ac5a886",
+          "choices": [
+            {
+              "delta": {
+                "content": "",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": "stop",
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9ec84ac5a886",
+          "choices": [],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": {
+            "completion_tokens": 17,
+            "prompt_tokens": 106,
+            "total_tokens": 123,
+            "completion_tokens_details": null,
+            "prompt_tokens_details": null
+          }
+        }
+      }
+    ],
+    "is_streaming": true
+  },
+  "id_normalization_mapping": {}
+}
--- a/tests/integration/tool_runtime/recordings/bf91c47b07a1b97255474e7f219efd79684231000d7a025deea75e0378d5bf45.json
+++ b/tests/integration/tool_runtime/recordings/bf91c47b07a1b97255474e7f219efd79684231000d7a025deea75e0378d5bf45.json
--- a/tests/integration/tool_runtime/recordings/d1b1773d2f2a6dea3ed84a136a5e379edfdbeee01b2f19a40fa5b983a631a036.json
+++ b/tests/integration/tool_runtime/recordings/d1b1773d2f2a6dea3ed84a136a5e379edfdbeee01b2f19a40fa5b983a631a036.json
--- a/tests/integration/tool_runtime/recordings/d516b8d54a42c57c19cbc0a6b90aed82af458af76db2b58b0369e75ad9fb4f3f.json
+++ b/tests/integration/tool_runtime/recordings/d516b8d54a42c57c19cbc0a6b90aed82af458af76db2b58b0369e75ad9fb4f3f.json
@ -0,0 +1,165 @@
+{
+  "test_id": "tests/integration/tool_runtime/test_mcp.py::test_mcp_invocation[txt=ollama/llama3.2:3b-instruct-fp16]",
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "llama3.2:3b-instruct-fp16",
+      "messages": [
+        {
+          "role": "system",
+          "content": "You are a helpful assistant."
+        },
+        {
+          "role": "user",
+          "content": [
+            {
+              "type": "text",
+              "text": "Say hi to the world. Use tools to do so."
+            }
+          ]
+        }
+      ],
+      "stream": true,
+      "stream_options": {
+        "include_usage": true
+      },
+      "tools": [
+        {
+          "type": "function",
+          "function": {
+            "name": "greet_everyone",
+            "parameters": {
+              "properties": {
+                "url": {
+                  "title": "Url",
+                  "type": "string"
+                }
+              },
+              "required": [
+                "url"
+              ],
+              "title": "greet_everyoneArguments",
+              "type": "object"
+            }
+          }
+        },
+        {
+          "type": "function",
+          "function": {
+            "name": "get_boiling_point",
+            "description": "\n        Returns the boiling point of a liquid in Celsius or Fahrenheit.\n\n        :param liquid_name: The name of the liquid\n        :param celsius: Whether to return the boiling point in Celsius\n        :return: The boiling point of the liquid in Celcius or Fahrenheit\n        ",
+            "parameters": {
+              "properties": {
+                "liquid_name": {
+                  "title": "Liquid Name",
+                  "type": "string"
+                },
+                "celsius": {
+                  "default": true,
+                  "title": "Celsius",
+                  "type": "boolean"
+                }
+              },
+              "required": [
+                "liquid_name"
+              ],
+              "title": "get_boiling_pointArguments",
+              "type": "object"
+            }
+          }
+        }
+      ]
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "llama3.2:3b-instruct-fp16"
+  },
+  "response": {
+    "body": [
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-d516b8d54a42",
+          "choices": [
+            {
+              "delta": {
+                "content": "",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": [
+                  {
+                    "index": 0,
+                    "id": "call_c1tlwvxc",
+                    "function": {
+                      "arguments": "{\"url\":\"world\"}",
+                      "name": "greet_everyone"
+                    },
+                    "type": "function"
+                  }
+                ]
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-d516b8d54a42",
+          "choices": [
+            {
+              "delta": {
+                "content": "",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": "tool_calls",
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-d516b8d54a42",
+          "choices": [],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": {
+            "completion_tokens": 19,
+            "prompt_tokens": 278,
+            "total_tokens": 297,
+            "completion_tokens_details": null,
+            "prompt_tokens_details": null
+          }
+        }
+      }
+    ],
+    "is_streaming": true
+  },
+  "id_normalization_mapping": {}
+}
--- a/tests/integration/tool_runtime/recordings/dd48d347f0abca853227d4eae1bab11dfba06a8db74241a3f8cb6f6a13e8a2e8.json
+++ b/tests/integration/tool_runtime/recordings/dd48d347f0abca853227d4eae1bab11dfba06a8db74241a3f8cb6f6a13e8a2e8.json
@ -0,0 +1,934 @@
+{
+  "test_id": "tests/integration/tool_runtime/test_mcp.py::test_mcp_invocation[txt=ollama/llama3.2:3b-instruct-fp16]",
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "llama3.2:3b-instruct-fp16",
+      "messages": [
+        {
+          "role": "system",
+          "content": "You are a helpful assistant."
+        },
+        {
+          "role": "user",
+          "content": [
+            {
+              "type": "text",
+              "text": "Say hi to the world. Use tools to do so."
+            }
+          ]
+        },
+        {
+          "role": "assistant",
+          "content": "",
+          "tool_calls": [
+            {
+              "index": 0,
+              "id": "call_j1l3nnv3",
+              "type": "function",
+              "function": {
+                "name": "greet_everyone",
+                "arguments": "{\"url\":\"world\"}"
+              }
+            }
+          ]
+        },
+        {
+          "role": "tool",
+          "tool_call_id": "call_j1l3nnv3",
+          "content": [
+            {
+              "type": "text",
+              "text": "Hello, world!"
+            }
+          ]
+        }
+      ],
+      "stream": true,
+      "stream_options": {
+        "include_usage": true
+      },
+      "tools": [
+        {
+          "type": "function",
+          "function": {
+            "name": "greet_everyone",
+            "parameters": {
+              "properties": {
+                "url": {
+                  "title": "Url",
+                  "type": "string"
+                }
+              },
+              "required": [
+                "url"
+              ],
+              "title": "greet_everyoneArguments",
+              "type": "object"
+            }
+          }
+        },
+        {
+          "type": "function",
+          "function": {
+            "name": "get_boiling_point",
+            "description": "\n        Returns the boiling point of a liquid in Celsius or Fahrenheit.\n\n        :param liquid_name: The name of the liquid\n        :param celsius: Whether to return the boiling point in Celsius\n        :return: The boiling point of the liquid in Celcius or Fahrenheit\n        ",
+            "parameters": {
+              "properties": {
+                "liquid_name": {
+                  "title": "Liquid Name",
+                  "type": "string"
+                },
+                "celsius": {
+                  "default": true,
+                  "title": "Celsius",
+                  "type": "boolean"
+                }
+              },
+              "required": [
+                "liquid_name"
+              ],
+              "title": "get_boiling_pointArguments",
+              "type": "object"
+            }
+          }
+        }
+      ]
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "llama3.2:3b-instruct-fp16"
+  },
+  "response": {
+    "body": [
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": "<|python_tag|>",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": "{\"",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": "message",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": "\":",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": " \"",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": "I",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": "'m",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": " sorry",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": " I",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": " couldn",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": "'t",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": " greet",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": " the",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": " world",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": ".\",",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": " \"",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": "exception",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": "\":",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": " \"",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": "No",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": " public",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": " method",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": " named",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": " greet",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": "_every",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": "one",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": " with",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": " two",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": " parameters",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": "\"}",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [
+            {
+              "delta": {
+                "content": "",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": "stop",
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-dd48d347f0ab",
+          "choices": [],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": {
+            "completion_tokens": 31,
+            "prompt_tokens": 106,
+            "total_tokens": 137,
+            "completion_tokens_details": null,
+            "prompt_tokens_details": null
+          }
+        }
+      }
+    ],
+    "is_streaming": true
+  },
+  "id_normalization_mapping": {}
+}
--- a/tests/integration/tool_runtime/test_mcp.py
+++ b/tests/integration/tool_runtime/test_mcp.py
@ -11,7 +11,6 @@ from llama_stack_client.lib.agents.agent import Agent
 from llama_stack_client.lib.agents.turn_events import StepCompleted, StepProgress, ToolCallIssuedDelta

 from llama_stack import LlamaStackAsLibraryClient
-from llama_stack.core.datatypes import AuthenticationRequiredError

 AUTH_TOKEN = "test-token"

@ -82,9 +81,11 @@ def test_mcp_invocation(llama_stack_client, text_model_id, mcp_server):
            "server_label": test_toolgroup_id,
            "require_approval": "never",
            "allowed_tools": [tool.name for tool in tools_list],
+            "headers": {
+                "Authorization": f"Bearer {AUTH_TOKEN}",
+            },
        }
    ]
-
    agent = Agent(
        client=llama_stack_client,
        model=text_model_id,
@ -111,47 +112,22 @@ def test_mcp_invocation(llama_stack_client, text_model_id, mcp_server):
            extra_headers=auth_headers,
        )
    )
-
    events = [chunk.event for chunk in chunks]
+
    final_response = next((chunk.response for chunk in reversed(chunks) if chunk.response), None)
    assert final_response is not None

    issued_calls = [
        event for event in events if isinstance(event, StepProgress) and isinstance(event.delta, ToolCallIssuedDelta)
    ]
-    assert issued_calls and issued_calls[0].delta.tool_name == "greet_everyone"
+    assert issued_calls
+
+    assert issued_calls[-1].delta.tool_name == "greet_everyone"

    tool_events = [
        event for event in events if isinstance(event, StepCompleted) and event.step_type == "tool_execution"
    ]
-    assert tool_events and tool_events[0].result.tool_calls[0].tool_name == "greet_everyone"
+    assert tool_events
+    assert tool_events[-1].result.tool_calls[0].tool_name == "greet_everyone"

    assert "hello" in final_response.output_text.lower()
-
-    # when streaming, we currently don't check auth headers upfront and fail the request
-    # early. but we should at least be generating a 401 later in the process.
-    response_stream = agent.create_turn(
-        session_id=session_id,
-        messages=[
-            {
-                "type": "message",
-                "role": "user",
-                "content": [
-                    {
-                        "type": "input_text",
-                        "text": "What is the boiling point of polyjuice? Use tools to answer.",
-                    }
-                ],
-            }
-        ],
-        stream=True,
-    )
-    if isinstance(llama_stack_client, LlamaStackAsLibraryClient):
-        with pytest.raises(AuthenticationRequiredError):
-            for _ in response_stream:
-                pass
-    else:
-        error_chunks = [chunk for chunk in response_stream if "error" in chunk.model_dump()]
-        assert len(error_chunks) == 1
-        chunk = error_chunks[0].model_dump()
-        assert "Unauthorized" in chunk["error"]["message"]
--- a/tests/unit/providers/vector_io/test_vector_io_openai_vector_stores.py
+++ b/tests/unit/providers/vector_io/test_vector_io_openai_vector_stores.py
@ -128,6 +128,37 @@ async def test_insert_chunks_missing_db_raises(vector_io_adapter):
        await vector_io_adapter.insert_chunks("db_not_exist", [])


+async def test_insert_chunks_with_missing_document_id(vector_io_adapter):
+    """Inserting chunks must not raise KeyError wherever document_id is stored, or when it is absent."""
+    from llama_stack.apis.vector_io import Chunk, ChunkMetadata
+
+    # Register a mocked index under "db1" in the adapter's cache.
+    stub_index = AsyncMock()
+    vector_io_adapter.cache["db1"] = stub_index
+
+    # One chunk per document_id placement: in metadata, nowhere, and in chunk_metadata.
+    in_metadata = Chunk(content="has doc_id in metadata", metadata={"document_id": "doc-1"})
+    absent = Chunk(content="no doc_id anywhere", metadata={"source": "test"})
+    in_chunk_meta = Chunk(content="doc_id in chunk_metadata", chunk_metadata=ChunkMetadata(document_id="doc-3"))
+    mixed_chunks = [in_metadata, absent, in_chunk_meta]
+
+    # Any KeyError raised by the insert would fail the test before the mock assertion runs.
+    await vector_io_adapter.insert_chunks("db1", mixed_chunks)
+    stub_index.insert_chunks.assert_awaited_once()
+
+
+async def test_document_id_with_invalid_type_raises_error():
+    """Reading document_id must raise TypeError when metadata holds a non-string value."""
+    from llama_stack.apis.vector_io import Chunk
+
+    # The chunk is built outside the raises block: the error is expected on attribute access.
+    bad_chunk = Chunk(content="test", metadata={"document_id": 12345})
+    with pytest.raises(TypeError) as raised:
+        _ = bad_chunk.document_id
+    message = str(raised.value)
+    assert "metadata['document_id'] must be a string" in message and "got int" in message
+
+
 async def test_query_chunks_calls_underlying_index_and_returns(vector_io_adapter):
    expected = QueryChunksResponse(chunks=[Chunk(content="c1")], scores=[0.1])
    fake_index = AsyncMock(query_chunks=AsyncMock(return_value=expected))