From 935b8e28de29400a4b42d8b54169341c5244fec7 Mon Sep 17 00:00:00 2001 From: slekkala1 Date: Wed, 10 Sep 2025 08:48:01 -0700 Subject: [PATCH 01/30] fix: Fireworks chat completion broken due to telemetry (#3392) # What does this PR do? Fix fireworks chat completion broken due to telemetry expecting response.usage Closes https://github.com/llamastack/llama-stack/issues/3391 ## Test Plan 1. `uv run --with llama-stack llama stack build --distro starter --image-type venv --run` Try ``` curl -X POST http://0.0.0.0:8321/v1/openai/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "fireworks/accounts/fireworks/models/llama-v3p1-8b-instruct", "messages": [{"role": "user", "content": "Hello!"}] }' ``` ``` {"id":"chatcmpl-ee922a08-0df0-4974-b0d3-b322113e8bc0","choices":[{"message":{"role":"assistant","content":"Hello! How can I assist you today?","name":null,"tool_calls":null},"finish_reason":"stop","index":0,"logprobs":null}],"object":"chat.completion","created":1757456375,"model":"fireworks/accounts/fireworks/models/llama-v3p1-8b-instruct"}% ``` Without fix fails as mentioned in https://github.com/llamastack/llama-stack/issues/3391 Co-authored-by: Francisco Arceo --- llama_stack/core/routers/inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py index 045093fe0..23972deb5 100644 --- a/llama_stack/core/routers/inference.py +++ b/llama_stack/core/routers/inference.py @@ -423,7 +423,7 @@ class InferenceRouter(Inference): # response_stream = await provider.openai_completion(**params) response = await provider.openai_completion(**params) - if self.telemetry: + if self.telemetry and getattr(response, "usage", None): metrics = self._construct_metrics( prompt_tokens=response.usage.prompt_tokens, completion_tokens=response.usage.completion_tokens, @@ -529,7 +529,7 @@ class InferenceRouter(Inference): if self.store: asyncio.create_task(self.store.store_chat_completion(response, messages)) - if self.telemetry: + if self.telemetry and getattr(response, "usage", None): metrics = self._construct_metrics( prompt_tokens=response.usage.prompt_tokens, completion_tokens=response.usage.completion_tokens, From f6bf36343df7c69c9f26ae5163cbfb6491ca7247 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Wed, 10 Sep 2025 11:52:23 -0700 Subject: [PATCH 02/30] chore: logging perf improvments (#3393) # What does this PR do? - Use BackgroundLogger when logging metric events. 
- Reuse event loop in BackgroundLogger ## Test Plan ``` cd /docs/source/distributions/k8s-benchmark # start mock server python openai-mock-server.py --port 8000 # start stack server LLAMA_STACK_LOGGING="all=WARNING" uv run --with llama-stack python -m llama_stack.core.server.server docs/source/distributions/k8s-benchmark/stack_run_config.yaml # run benchmark script uv run python3 benchmark.py --duration 120 --concurrent 50 --base-url=http://localhost:8321/v1/openai/v1 --model=vllm-inference/meta-llama/Llama-3.2-3B-Instruct ``` ### RPS from 57 -> 62 --- llama_stack/core/routers/inference.py | 14 ++++---- .../providers/utils/telemetry/tracing.py | 34 +++++++++++++------ 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py index 23972deb5..9593dd5b9 100644 --- a/llama_stack/core/routers/inference.py +++ b/llama_stack/core/routers/inference.py @@ -63,7 +63,7 @@ from llama_stack.models.llama.llama3.chat_format import ChatFormat from llama_stack.models.llama.llama3.tokenizer import Tokenizer from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable from llama_stack.providers.utils.inference.inference_store import InferenceStore -from llama_stack.providers.utils.telemetry.tracing import get_current_span +from llama_stack.providers.utils.telemetry.tracing import enqueue_event, get_current_span logger = get_logger(name=__name__, category="core::routers") @@ -160,7 +160,7 @@ class InferenceRouter(Inference): metrics = self._construct_metrics(prompt_tokens, completion_tokens, total_tokens, model) if self.telemetry: for metric in metrics: - await self.telemetry.log_event(metric) + enqueue_event(metric) return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in metrics] async def _count_tokens( @@ -431,7 +431,7 @@ class InferenceRouter(Inference): model=model_obj, ) for metric in metrics: - await self.telemetry.log_event(metric) + enqueue_event(metric) # these metrics will show up in the client response. response.metrics = ( @@ -537,7 +537,7 @@ class InferenceRouter(Inference): model=model_obj, ) for metric in metrics: - await self.telemetry.log_event(metric) + enqueue_event(metric) # these metrics will show up in the client response. 
response.metrics = ( metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics @@ -664,7 +664,7 @@ class InferenceRouter(Inference): "completion_tokens", "total_tokens", ]: # Only log completion and total tokens - await self.telemetry.log_event(metric) + enqueue_event(metric) # Return metrics in response async_metrics = [ @@ -710,7 +710,7 @@ class InferenceRouter(Inference): ) for metric in completion_metrics: if metric.metric in ["completion_tokens", "total_tokens"]: # Only log completion and total tokens - await self.telemetry.log_event(metric) + enqueue_event(metric) # Return metrics in response return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics] @@ -806,7 +806,7 @@ class InferenceRouter(Inference): model=model, ) for metric in metrics: - await self.telemetry.log_event(metric) + enqueue_event(metric) yield chunk finally: diff --git a/llama_stack/providers/utils/telemetry/tracing.py b/llama_stack/providers/utils/telemetry/tracing.py index 7694003b5..9969b1055 100644 --- a/llama_stack/providers/utils/telemetry/tracing.py +++ b/llama_stack/providers/utils/telemetry/tracing.py @@ -18,6 +18,7 @@ from functools import wraps from typing import Any from llama_stack.apis.telemetry import ( + Event, LogSeverity, Span, SpanEndPayload, @@ -98,7 +99,7 @@ class BackgroundLogger: def __init__(self, api: Telemetry, capacity: int = 100000): self.api = api self.log_queue: queue.Queue[Any] = queue.Queue(maxsize=capacity) - self.worker_thread = threading.Thread(target=self._process_logs, daemon=True) + self.worker_thread = threading.Thread(target=self._worker, daemon=True) self.worker_thread.start() self._last_queue_full_log_time: float = 0.0 self._dropped_since_last_notice: int = 0 @@ -118,12 +119,16 @@ class BackgroundLogger: self._last_queue_full_log_time = current_time self._dropped_since_last_notice = 0 - def _process_logs(self): + def _worker(self): + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + loop.run_until_complete(self._process_logs()) + + async def _process_logs(self): while True: try: event = self.log_queue.get() - # figure out how to use a thread's native loop - asyncio.run(self.api.log_event(event)) + await self.api.log_event(event) except Exception: import traceback @@ -136,6 +141,19 @@ class BackgroundLogger: self.log_queue.join() +def enqueue_event(event: Event) -> None: + """Enqueue a telemetry event to the background logger if available. + + This provides a non-blocking path for routers and other hot paths to + submit telemetry without awaiting the Telemetry API, reducing contention + with the main event loop. 
+ """ + global BACKGROUND_LOGGER + if BACKGROUND_LOGGER is None: + raise RuntimeError("Telemetry API not initialized") + BACKGROUND_LOGGER.log_event(event) + + class TraceContext: spans: list[Span] = [] @@ -256,11 +274,7 @@ class TelemetryHandler(logging.Handler): if record.module in ("asyncio", "selector_events"): return - global CURRENT_TRACE_CONTEXT, BACKGROUND_LOGGER - - if BACKGROUND_LOGGER is None: - raise RuntimeError("Telemetry API not initialized") - + global CURRENT_TRACE_CONTEXT context = CURRENT_TRACE_CONTEXT.get() if context is None: return @@ -269,7 +283,7 @@ class TelemetryHandler(logging.Handler): if span is None: return - BACKGROUND_LOGGER.log_event( + enqueue_event( UnstructuredLogEvent( trace_id=span.trace_id, span_id=span.span_id, From a6b1588dc612df097d4fecce317547515b281ec6 Mon Sep 17 00:00:00 2001 From: Francisco Arceo Date: Wed, 10 Sep 2025 12:53:38 -0600 Subject: [PATCH 03/30] revert: Fireworks chat completion broken due to telemetry (#3402) Reverts llamastack/llama-stack#3392 --- llama_stack/core/routers/inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py index 9593dd5b9..2ed2d0439 100644 --- a/llama_stack/core/routers/inference.py +++ b/llama_stack/core/routers/inference.py @@ -423,7 +423,7 @@ class InferenceRouter(Inference): # response_stream = await provider.openai_completion(**params) response = await provider.openai_completion(**params) - if self.telemetry and getattr(response, "usage", None): + if self.telemetry: metrics = self._construct_metrics( prompt_tokens=response.usage.prompt_tokens, completion_tokens=response.usage.completion_tokens, @@ -529,7 +529,7 @@ class InferenceRouter(Inference): if self.store: asyncio.create_task(self.store.store_chat_completion(response, messages)) - if self.telemetry and getattr(response, "usage", None): + if self.telemetry: metrics = self._construct_metrics( prompt_tokens=response.usage.prompt_tokens, completion_tokens=response.usage.completion_tokens, From e6edc1f93425032f35f4198a197ba31b5b11d8ee Mon Sep 17 00:00:00 2001 From: Derek Higgins Date: Wed, 10 Sep 2025 19:54:10 +0100 Subject: [PATCH 04/30] fix: unbound variable error in schedule-record-workflow.sh (#3401) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Initialize INPUTS variable to prevent 'unbound variable' error Fixes: ./scripts/github/schedule-record-workflow.sh: line 246: INPUTS: unbound variable │ --- scripts/github/schedule-record-workflow.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/github/schedule-record-workflow.sh b/scripts/github/schedule-record-workflow.sh index c292e53e6..44b0947b6 100755 --- a/scripts/github/schedule-record-workflow.sh +++ b/scripts/github/schedule-record-workflow.sh @@ -239,8 +239,9 @@ echo "Test pattern: ${TEST_PATTERN:-"(none)"}" echo "" # Prepare inputs for gh workflow run +INPUTS= if [[ -n "$TEST_SUBDIRS" ]]; then - INPUTS="-f subdirs='$TEST_SUBDIRS'" + INPUTS="$INPUTS -f subdirs='$TEST_SUBDIRS'" fi if [[ -n "$TEST_SETUP" ]]; then INPUTS="$INPUTS -f test-setup='$TEST_SETUP'" From e980436a2ed98dd725f76dfcec12235ed1d6cc82 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Wed, 10 Sep 2025 11:57:42 -0700 Subject: [PATCH 05/30] chore: introduce write queue for inference_store (#3383) # What does this PR do? Adds a write worker queue for writes to inference store. This avoids overwhelming request processing with slow inference writes. 
## Test Plan Benchmark: ``` cd /docs/source/distributions/k8s-benchmark # start mock server python openai-mock-server.py --port 8000 # start stack server LLAMA_STACK_LOGGING="all=WARNING" uv run --with llama-stack python -m llama_stack.core.server.server docs/source/distributions/k8s-benchmark/stack_run_config.yaml # run benchmark script uv run python3 benchmark.py --duration 120 --concurrent 50 --base-url=http://localhost:8321/v1/openai/v1 --model=vllm-inference/meta-llama/Llama-3.2-3B-Instruct ``` ## RPS from 21 -> 57 --- .../distributions/k8s-benchmark/benchmark.py | 19 ++-- .../k8s-benchmark/stack_run_config.yaml | 9 ++ llama_stack/core/datatypes.py | 13 ++- llama_stack/core/routers/__init__.py | 5 +- llama_stack/core/routers/inference.py | 5 + .../utils/inference/inference_store.py | 98 +++++++++++++++++-- .../utils/inference/test_inference_store.py | 12 +++ 7 files changed, 139 insertions(+), 22 deletions(-) diff --git a/docs/source/distributions/k8s-benchmark/benchmark.py b/docs/source/distributions/k8s-benchmark/benchmark.py index 3d0d18150..83ba9602a 100644 --- a/docs/source/distributions/k8s-benchmark/benchmark.py +++ b/docs/source/distributions/k8s-benchmark/benchmark.py @@ -58,14 +58,6 @@ class BenchmarkStats: print(f"\n{'='*60}") print(f"BENCHMARK RESULTS") - print(f"{'='*60}") - print(f"Total time: {total_time:.2f}s") - print(f"Concurrent users: {self.concurrent_users}") - print(f"Total requests: {self.total_requests}") - print(f"Successful requests: {self.success_count}") - print(f"Failed requests: {len(self.errors)}") - print(f"Success rate: {success_rate:.1f}%") - print(f"Requests per second: {self.success_count / total_time:.2f}") print(f"\nResponse Time Statistics:") print(f" Mean: {statistics.mean(self.response_times):.3f}s") @@ -106,6 +98,15 @@ class BenchmarkStats: print(f" Mean chunks per response: {statistics.mean(self.chunks_received):.1f}") print(f" Total chunks received: {sum(self.chunks_received)}") + print(f"{'='*60}") + print(f"Total time: {total_time:.2f}s") + print(f"Concurrent users: {self.concurrent_users}") + print(f"Total requests: {self.total_requests}") + print(f"Successful requests: {self.success_count}") + print(f"Failed requests: {len(self.errors)}") + print(f"Success rate: {success_rate:.1f}%") + print(f"Requests per second: {self.success_count / total_time:.2f}") + if self.errors: print(f"\nErrors (showing first 5):") for error in self.errors[:5]: @@ -215,7 +216,7 @@ class LlamaStackBenchmark: await asyncio.sleep(1) # Report every second if time.time() >= last_report_time + 10: # Report every 10 seconds elapsed = time.time() - stats.start_time - print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s") + print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s, RPS: {stats.total_requests / elapsed:.1f}") last_report_time = time.time() except asyncio.CancelledError: break diff --git a/docs/source/distributions/k8s-benchmark/stack_run_config.yaml b/docs/source/distributions/k8s-benchmark/stack_run_config.yaml index f8ff7811b..5a9e2ae4f 100644 --- a/docs/source/distributions/k8s-benchmark/stack_run_config.yaml +++ b/docs/source/distributions/k8s-benchmark/stack_run_config.yaml @@ -2,6 +2,7 @@ version: '2' image_name: kubernetes-benchmark-demo apis: - agents +- files - inference - files - safety @@ -20,6 +21,14 @@ providers: - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {} + files: + - provider_id: meta-reference-files + provider_type: inline::localfs + config: + storage_dir: 
${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files} + metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db vector_io: - provider_id: ${env.ENABLE_CHROMADB:+chromadb} provider_type: remote::chromadb diff --git a/llama_stack/core/datatypes.py b/llama_stack/core/datatypes.py index 0f348b067..faaeefd01 100644 --- a/llama_stack/core/datatypes.py +++ b/llama_stack/core/datatypes.py @@ -431,6 +431,12 @@ class ServerConfig(BaseModel): ) +class InferenceStoreConfig(BaseModel): + sql_store_config: SqlStoreConfig + max_write_queue_size: int = Field(default=10000, description="Max queued writes for inference store") + num_writers: int = Field(default=4, description="Number of concurrent background writers") + + class StackRunConfig(BaseModel): version: int = LLAMA_STACK_RUN_CONFIG_VERSION @@ -464,11 +470,12 @@ Configuration for the persistence store used by the distribution registry. If no a default SQLite store will be used.""", ) - inference_store: SqlStoreConfig | None = Field( + inference_store: InferenceStoreConfig | SqlStoreConfig | None = Field( default=None, description=""" -Configuration for the persistence store used by the inference API. If not specified, -a default SQLite store will be used.""", +Configuration for the persistence store used by the inference API. Can be either a +InferenceStoreConfig (with queue tuning parameters) or a SqlStoreConfig (deprecated). +If not specified, a default SQLite store will be used.""", ) # registry of "resources" in the distribution diff --git a/llama_stack/core/routers/__init__.py b/llama_stack/core/routers/__init__.py index 1faace34a..f129f8ede 100644 --- a/llama_stack/core/routers/__init__.py +++ b/llama_stack/core/routers/__init__.py @@ -78,7 +78,10 @@ async def get_auto_router_impl( # TODO: move pass configs to routers instead if api == Api.inference and run_config.inference_store: - inference_store = InferenceStore(run_config.inference_store, policy) + inference_store = InferenceStore( + config=run_config.inference_store, + policy=policy, + ) await inference_store.initialize() api_to_dep_impl["store"] = inference_store diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py index 2ed2d0439..762d7073e 100644 --- a/llama_stack/core/routers/inference.py +++ b/llama_stack/core/routers/inference.py @@ -90,6 +90,11 @@ class InferenceRouter(Inference): async def shutdown(self) -> None: logger.debug("InferenceRouter.shutdown") + if self.store: + try: + await self.store.shutdown() + except Exception as e: + logger.warning(f"Error during InferenceStore shutdown: {e}") async def register_model( self, diff --git a/llama_stack/providers/utils/inference/inference_store.py b/llama_stack/providers/utils/inference/inference_store.py index 43006cfd5..8c69b1683 100644 --- a/llama_stack/providers/utils/inference/inference_store.py +++ b/llama_stack/providers/utils/inference/inference_store.py @@ -3,6 +3,9 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+import asyncio +from typing import Any + from llama_stack.apis.inference import ( ListOpenAIChatCompletionResponse, OpenAIChatCompletion, @@ -10,24 +13,43 @@ from llama_stack.apis.inference import ( OpenAIMessageParam, Order, ) -from llama_stack.core.datatypes import AccessRule -from llama_stack.core.utils.config_dirs import RUNTIME_BASE_DIR +from llama_stack.core.datatypes import AccessRule, InferenceStoreConfig +from llama_stack.log import get_logger from ..sqlstore.api import ColumnDefinition, ColumnType from ..sqlstore.authorized_sqlstore import AuthorizedSqlStore -from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, sqlstore_impl +from ..sqlstore.sqlstore import SqlStoreConfig, SqlStoreType, sqlstore_impl + +logger = get_logger(name=__name__, category="inference_store") class InferenceStore: - def __init__(self, sql_store_config: SqlStoreConfig, policy: list[AccessRule]): - if not sql_store_config: - sql_store_config = SqliteSqlStoreConfig( - db_path=(RUNTIME_BASE_DIR / "sqlstore.db").as_posix(), + def __init__( + self, + config: InferenceStoreConfig | SqlStoreConfig, + policy: list[AccessRule], + ): + # Handle backward compatibility + if not isinstance(config, InferenceStoreConfig): + # Legacy: SqlStoreConfig passed directly as config + config = InferenceStoreConfig( + sql_store_config=config, ) - self.sql_store_config = sql_store_config + + self.config = config + self.sql_store_config = config.sql_store_config self.sql_store = None self.policy = policy + # Disable write queue for SQLite to avoid concurrency issues + self.enable_write_queue = self.sql_store_config.type != SqlStoreType.sqlite + + # Async write queue and worker control + self._queue: asyncio.Queue[tuple[OpenAIChatCompletion, list[OpenAIMessageParam]]] | None = None + self._worker_tasks: list[asyncio.Task[Any]] = [] + self._max_write_queue_size: int = config.max_write_queue_size + self._num_writers: int = max(1, config.num_writers) + async def initialize(self): """Create the necessary tables if they don't exist.""" self.sql_store = AuthorizedSqlStore(sqlstore_impl(self.sql_store_config)) @@ -42,10 +64,68 @@ class InferenceStore: }, ) + if self.enable_write_queue: + self._queue = asyncio.Queue(maxsize=self._max_write_queue_size) + for _ in range(self._num_writers): + self._worker_tasks.append(asyncio.create_task(self._worker_loop())) + else: + logger.info("Write queue disabled for SQLite to avoid concurrency issues") + + async def shutdown(self) -> None: + if not self._worker_tasks: + return + if self._queue is not None: + await self._queue.join() + for t in self._worker_tasks: + if not t.done(): + t.cancel() + for t in self._worker_tasks: + try: + await t + except asyncio.CancelledError: + pass + self._worker_tasks.clear() + + async def flush(self) -> None: + """Wait for all queued writes to complete. 
Useful for testing.""" + if self.enable_write_queue and self._queue is not None: + await self._queue.join() + async def store_chat_completion( self, chat_completion: OpenAIChatCompletion, input_messages: list[OpenAIMessageParam] ) -> None: - if not self.sql_store: + if self.enable_write_queue: + if self._queue is None: + raise ValueError("Inference store is not initialized") + try: + self._queue.put_nowait((chat_completion, input_messages)) + except asyncio.QueueFull: + logger.warning( + f"Write queue full; adding chat completion id={getattr(chat_completion, 'id', '')}" + ) + await self._queue.put((chat_completion, input_messages)) + else: + await self._write_chat_completion(chat_completion, input_messages) + + async def _worker_loop(self) -> None: + assert self._queue is not None + while True: + try: + item = await self._queue.get() + except asyncio.CancelledError: + break + chat_completion, input_messages = item + try: + await self._write_chat_completion(chat_completion, input_messages) + except Exception as e: # noqa: BLE001 + logger.error(f"Error writing chat completion: {e}") + finally: + self._queue.task_done() + + async def _write_chat_completion( + self, chat_completion: OpenAIChatCompletion, input_messages: list[OpenAIMessageParam] + ) -> None: + if self.sql_store is None: raise ValueError("Inference store is not initialized") data = chat_completion.model_dump() diff --git a/tests/unit/utils/inference/test_inference_store.py b/tests/unit/utils/inference/test_inference_store.py index 730f54a05..f6d63490a 100644 --- a/tests/unit/utils/inference/test_inference_store.py +++ b/tests/unit/utils/inference/test_inference_store.py @@ -65,6 +65,9 @@ async def test_inference_store_pagination_basic(): input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")] await store.store_chat_completion(completion, input_messages) + # Wait for all queued writes to complete + await store.flush() + # Test 1: First page with limit=2, descending order (default) result = await store.list_chat_completions(limit=2, order=Order.desc) assert len(result.data) == 2 @@ -108,6 +111,9 @@ async def test_inference_store_pagination_ascending(): input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")] await store.store_chat_completion(completion, input_messages) + # Wait for all queued writes to complete + await store.flush() + # Test ascending order pagination result = await store.list_chat_completions(limit=1, order=Order.asc) assert len(result.data) == 1 @@ -143,6 +149,9 @@ async def test_inference_store_pagination_with_model_filter(): input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")] await store.store_chat_completion(completion, input_messages) + # Wait for all queued writes to complete + await store.flush() + # Test pagination with model filter result = await store.list_chat_completions(limit=1, model="model-a", order=Order.desc) assert len(result.data) == 1 @@ -190,6 +199,9 @@ async def test_inference_store_pagination_no_limit(): input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")] await store.store_chat_completion(completion, input_messages) + # Wait for all queued writes to complete + await store.flush() + # Test without limit result = await store.list_chat_completions(order=Order.desc) assert len(result.data) == 2 From 7394828c7a84de2c3af0ca37546db17d6a703507 Mon Sep 17 00:00:00 2001 From: Alexey Rybak 
<50731695+reluctantfuturist@users.noreply.github.com> Date: Wed, 10 Sep 2025 12:43:36 -0700 Subject: [PATCH 06/30] docs: horizontal nav bar (#3407) # What does this PR do? * Adds a horizontal nav bar for easy access to the API reference and the Llama Stack Github repo image ## Test Plan * Built the docs and ran the local HTML server to verify changes --- docs/_static/css/my_theme.css | 101 ++++++++++++++++++++++++++++++ docs/_static/js/horizontal_nav.js | 44 +++++++++++++ docs/source/conf.py | 1 + 3 files changed, 146 insertions(+) create mode 100644 docs/_static/js/horizontal_nav.js diff --git a/docs/_static/css/my_theme.css b/docs/_static/css/my_theme.css index d078ec057..7dcd97c9b 100644 --- a/docs/_static/css/my_theme.css +++ b/docs/_static/css/my_theme.css @@ -1,5 +1,106 @@ @import url("theme.css"); +/* Horizontal Navigation Bar */ +.horizontal-nav { + background-color: #ffffff; + border-bottom: 1px solid #e5e5e5; + padding: 0; + position: fixed; + top: 0; + left: 0; + right: 0; + z-index: 1050; + height: 50px; + box-shadow: 0 2px 4px rgba(0,0,0,0.1); +} + +[data-theme="dark"] .horizontal-nav { + background-color: #1a1a1a; + border-bottom: 1px solid #333; +} + +.horizontal-nav .nav-container { + max-width: 1200px; + margin: 0 auto; + display: flex; + align-items: center; + justify-content: space-between; + padding: 0 20px; + height: 100%; +} + +.horizontal-nav .nav-brand { + font-size: 18px; + font-weight: 600; + color: #333; + text-decoration: none; +} + +[data-theme="dark"] .horizontal-nav .nav-brand { + color: #fff; +} + +.horizontal-nav .nav-links { + display: flex; + align-items: center; + gap: 30px; + list-style: none; + margin: 0; + padding: 0; +} + +.horizontal-nav .nav-links a { + color: #666; + text-decoration: none; + font-size: 14px; + font-weight: 500; + padding: 8px 12px; + border-radius: 6px; + transition: all 0.2s ease; +} + +.horizontal-nav .nav-links a:hover, +.horizontal-nav .nav-links a.active { + color: #333; + background-color: #f5f5f5; +} + +.horizontal-nav .nav-links a.active { + font-weight: 600; +} + +[data-theme="dark"] .horizontal-nav .nav-links a { + color: #ccc; +} + +[data-theme="dark"] .horizontal-nav .nav-links a:hover, +[data-theme="dark"] .horizontal-nav .nav-links a.active { + color: #fff; + background-color: #333; +} + +.horizontal-nav .nav-links .github-link { + display: flex; + align-items: center; + gap: 6px; +} + +.horizontal-nav .nav-links .github-icon { + width: 16px; + height: 16px; + fill: currentColor; +} + +/* Adjust main content to account for fixed nav */ +.wy-nav-side { + top: 50px; + height: calc(100vh - 50px); +} + +.wy-nav-content-wrap { + margin-top: 50px; +} + .wy-nav-content { max-width: 90%; } diff --git a/docs/_static/js/horizontal_nav.js b/docs/_static/js/horizontal_nav.js new file mode 100644 index 000000000..c2384f9d5 --- /dev/null +++ b/docs/_static/js/horizontal_nav.js @@ -0,0 +1,44 @@ +// Horizontal Navigation Bar for Llama Stack Documentation +document.addEventListener('DOMContentLoaded', function() { + // Create the horizontal navigation HTML + const navHTML = ` + + `; + + // Insert the navigation at the beginning of the body + document.body.insertAdjacentHTML('afterbegin', navHTML); + + // Update navigation links based on current page + updateActiveNav(); +}); + +function updateActiveNav() { + const currentPath = window.location.pathname; + const navLinks = document.querySelectorAll('.horizontal-nav .nav-links a'); + + navLinks.forEach(link => { + // Remove any existing active classes + link.classList.remove('active'); 
+ + // Add active class based on current path + if (currentPath === '/' && link.getAttribute('href') === '/') { + link.classList.add('active'); + } else if (currentPath.includes('/references/api_reference/') && link.getAttribute('href').includes('api_reference')) { + link.classList.add('active'); + } + }); +} diff --git a/docs/source/conf.py b/docs/source/conf.py index 3f84d1310..0cbddef31 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -131,6 +131,7 @@ html_static_path = ["../_static"] def setup(app): app.add_css_file("css/my_theme.css") app.add_js_file("js/detect_theme.js") + app.add_js_file("js/horizontal_nav.js") app.add_js_file("js/keyboard_shortcuts.js") def dockerhub_role(name, rawtext, text, lineno, inliner, options={}, content=[]): From a844c4f6e189395f99a6470552876d1ba6b807f1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 10 Sep 2025 13:17:02 -0700 Subject: [PATCH 07/30] chore(python-deps): bump pytest from 8.4.1 to 8.4.2 (#3359) Bumps [pytest](https://github.com/pytest-dev/pytest) from 8.4.1 to 8.4.2.
Release notes

Sourced from pytest's releases.

8.4.2

pytest 8.4.2 (2025-09-03)

Bug fixes

  • #13478: Fixed a crash when using `console_output_style` with `times` and a module is skipped.

  • #13530: Fixed a crash when using `pytest.approx` and `decimal.Decimal` instances with the `decimal.FloatOperation` trap set.

  • #13549: No longer evaluate type annotations in Python 3.14 when inspecting function signatures.

    This prevents crashes during module collection when modules do not explicitly use `from __future__ import annotations` and import types for annotations within an `if TYPE_CHECKING:` block.

  • #13559: Added missing `int` and `float` variants to the `Literal` type annotation of the `type` parameter in `pytest.Parser.addini`.

  • #13563: `pytest.approx` now only imports numpy if NumPy is already in `sys.modules`. This fixes unconditional import behavior introduced in 8.4.0.
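    To illustrate the #13563 change, the lazy-import guard is roughly of this shape (a simplified sketch, not pytest's actual code):

    ```python
    import sys

    def is_numpy_array(obj) -> bool:
        # Look numpy up in sys.modules instead of importing it, so the check
        # never pays the numpy import cost for users who haven't imported
        # numpy themselves -- the behavior #13563 restores.
        np = sys.modules.get("numpy")
        return np is not None and isinstance(obj, np.ndarray)
    ```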

Improved documentation

  • #13577: Clarify that pytest_generate_tests is discovered in test modules/classes; other hooks must be in conftest.py or plugins.

Contributor-facing changes

  • #13480: Self-testing: fixed a few test failures when run with `-Wdefault` or a similar override.
  • #13547: Self-testing: corrected expected message for `test_doctest_unexpected_exception` in Python 3.14.
  • #13684: Make pytest's own testsuite insensitive to the presence of the `CI` environment variable -- by @ogrisel.
Commits
  • bfae422 Prepare release version 8.4.2
  • 8990538 Fix passenv CI in tox ini and make tests insensitive to the presence of the C...
  • ca676bf Merge pull request #13687 from pytest-dev/patchback/backports/8.4.x/e63f6e51c...
  • 975a60a Merge pull request #13686 from pytest-dev/patchback/backports/8.4.x/12bde8af6...
  • 7723ce8 Merge pull request #13683 from even-even/fix_Exeption_to_Exception_in_errorMe...
  • b7f0568 Merge pull request #13685 from CoretexShadow/fix/docs-pytest-generate-tests
  • 2c94c4a add missing colon (#13640) (#13641)
  • c3d7684 Merge pull request #13606 from pytest-dev/patchback/backports/8.4.x/5f9938563...
  • dc6e3be Merge pull request #13605 from The-Compiler/training-update-2025-07
  • f87289c Fix crash with times output style and skipped module (#13573) (#13579)
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=pytest&package-manager=uv&previous-version=8.4.1&new-version=8.4.2)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- uv.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/uv.lock b/uv.lock index 2788c6fef..6f8ba7ad6 100644 --- a/uv.lock +++ b/uv.lock @@ -3540,7 +3540,7 @@ wheels = [ [[package]] name = "pytest" -version = "8.4.1" +version = "8.4.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "colorama", marker = "sys_platform == 'win32'" }, @@ -3549,9 +3549,9 @@ dependencies = [ { name = "pluggy" }, { name = "pygments" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/08/ba/45911d754e8eba3d5a841a5ce61a65a685ff1798421ac054f85aa8747dfb/pytest-8.4.1.tar.gz", hash = "sha256:7c67fd69174877359ed9371ec3af8a3d2b04741818c51e5e99cc1742251fa93c", size = 1517714, upload-time = "2025-06-18T05:48:06.109Z" } +sdist = { url = "https://files.pythonhosted.org/packages/a3/5c/00a0e072241553e1a7496d638deababa67c5058571567b92a7eaa258397c/pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01", size = 1519618, upload-time = "2025-09-04T14:34:22.711Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/29/16/c8a903f4c4dffe7a12843191437d7cd8e32751d5de349d45d3fe69544e87/pytest-8.4.1-py3-none-any.whl", hash = "sha256:539c70ba6fcead8e78eebbf1115e8b589e7565830d7d006a8723f19ac8a0afb7", size = 365474, upload-time = "2025-06-18T05:48:03.955Z" }, + { url = "https://files.pythonhosted.org/packages/a8/a4/20da314d277121d6534b3a980b29035dcd51e6744bd79075a6ce8fa4eb8d/pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79", size = 365750, upload-time = "2025-09-04T14:34:20.226Z" }, ] [[package]] From 369083c0699270d7a3fa4d10f4975a081fcc7acd Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 10 Sep 2025 13:17:28 -0700 Subject: [PATCH 08/30] chore(python-deps): bump locust from 2.39.1 to 2.40.1 (#3358) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [locust](https://github.com/locustio/locust) from 2.39.1 to 2.40.1.
Release notes

Sourced from locust's releases.

2.40.1

What's Changed

Full Changelog: https://github.com/locustio/locust/compare/2.40.0...2.40.1

2.40.0

What's Changed

New Contributors

Full Changelog: https://github.com/locustio/locust/compare/2.39.1...2.40.0

Changelog

Sourced from locust's changelog.

Detailed changelog

The most important changes can also be found in the documentation.

Commits
  • 5df19da Merge pull request #3205 from locustio/move-pytest-plugin-to-own-directory
  • d41141b Move pytest plugin to its own directory, to prevent accidental import of locu...
  • 6422848 mention that only one locustfile can be distributed
  • aa3da73 Merge pull request #3204 from locustio/delay-imports-in-pytest-plugin-to-avoi...
  • 12050de Pytest plugin: Delay imports to avoid monkey patching until someone actually ...
  • 488d1f8 docs
  • 439b7ab docs fix
  • fcd76a8 docs: rephrase
  • 70c7e9b docs: move pytest further up
  • 06dbf98 docs: fix link
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=locust&package-manager=uv&previous-version=2.39.1&new-version=2.40.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- uv.lock | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/uv.lock b/uv.lock index 6f8ba7ad6..df3a23e58 100644 --- a/uv.lock +++ b/uv.lock @@ -2023,7 +2023,7 @@ wheels = [ [[package]] name = "locust" -version = "2.39.1" +version = "2.40.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "configargparse" }, @@ -2035,6 +2035,7 @@ dependencies = [ { name = "locust-cloud" }, { name = "msgpack" }, { name = "psutil" }, + { name = "pytest" }, { name = "python-engineio" }, { name = "python-socketio", extra = ["client"] }, { name = "pywin32", marker = "sys_platform == 'win32'" }, @@ -2043,9 +2044,9 @@ dependencies = [ { name = "setuptools" }, { name = "werkzeug" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/95/c8/10aa5445c404eed389b56877e6714c1787190cc09dd70059ce3765979ec5/locust-2.39.1.tar.gz", hash = "sha256:6bdd19e27edf9a1c84391d6cf6e9a737dfb832be7dfbf39053191ae31b9cc498", size = 1409902, upload-time = "2025-08-29T17:41:01.544Z" } +sdist = { url = "https://files.pythonhosted.org/packages/01/22/82f40176473a98c9479bed667d3ad21bb859d2cb67f6880a6b0b6a725e45/locust-2.40.1.tar.gz", hash = "sha256:5bde76c1cf7e412071670f926f34844e119210c93f07a4cf9fc4cb93c60a578a", size = 1411606, upload-time = "2025-09-05T15:57:35.76Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/ec/b3/b2f4b2ca88b1e72eba7be2b2982533b887f8b709d222db78eb9602aa5121/locust-2.39.1-py3-none-any.whl", hash = "sha256:fd5148f2f1a4ed34aee968abc4393674e69d1b5e1b54db50a397f6eb09ce0b04", size = 1428155, upload-time = "2025-08-29T17:41:00.245Z" }, + { url = "https://files.pythonhosted.org/packages/3b/e6/9c6335ab16becf4f8ad3da6083ab78793c56ec1ca496d6f7c74660c21c3f/locust-2.40.1-py3-none-any.whl", hash = "sha256:ef0517f9bb5ed0afa7035014faaf944802917e07da8649461aaaf5e5f3ba8a65", size = 1430154, upload-time = "2025-09-05T15:57:33.233Z" }, ] [[package]] From 438c037b1f16ee8123ab71b2aa39529ce32967a5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 10 Sep 2025 13:17:43 -0700 Subject: [PATCH 09/30] chore(python-deps): bump openai from 1.102.0 to 1.106.1 (#3356) Bumps [openai](https://github.com/openai/openai-python) from 1.102.0 to 1.106.1.
Release notes

Sourced from openai's releases.

v1.106.1

1.106.1 (2025-09-04)

Full Changelog: v1.106.0...v1.106.1

Chores

  • internal: move mypy configurations to pyproject.toml file (ca413a2)

v1.106.0

1.106.0 (2025-09-04)

Full Changelog: v1.105.0...v1.106.0

Features

  • client: support callable api_key (#2588) (e1bad01)
  • improve future compat with pydantic v3 (6645d93)

v1.105.0

1.105.0 (2025-09-03)

Full Changelog: v1.104.2...v1.105.0

Features

  • api: Add gpt-realtime models (8502041)

v1.104.2

1.104.2 (2025-09-02)

Full Changelog: v1.104.1...v1.104.2

Bug Fixes

  • types: add aliases back for web search tool types (2521cd8)

v1.104.1

1.104.1 (2025-09-02)

Full Changelog: v1.104.0...v1.104.1

Chores

  • api: manual updates for ResponseInputAudio (0db5061)

v1.104.0

1.104.0 (2025-09-02)

Full Changelog: v1.103.0...v1.104.0

... (truncated)

Changelog

Sourced from openai's changelog.

1.106.1 (2025-09-04)

Full Changelog: v1.106.0...v1.106.1

Chores

  • internal: move mypy configurations to pyproject.toml file (ca413a2)

1.106.0 (2025-09-04)

Full Changelog: v1.105.0...v1.106.0

Features

  • client: support callable api_key (#2588) (e1bad01)
  • improve future compat with pydantic v3 (6645d93)

1.105.0 (2025-09-03)

Full Changelog: v1.104.2...v1.105.0

Features

  • api: Add gpt-realtime models (8502041)

1.104.2 (2025-09-02)

Full Changelog: v1.104.1...v1.104.2

Bug Fixes

  • types: add aliases back for web search tool types (2521cd8)

1.104.1 (2025-09-02)

Full Changelog: v1.104.0...v1.104.1

Chores

  • api: manual updates for ResponseInputAudio (0db5061)

1.104.0 (2025-09-02)

Full Changelog: v1.103.0...v1.104.0

Features

  • types: replace List[str] with SequenceNotStr in params (bc00bda)

... (truncated)

Commits
  • 2adf111 release: 1.106.1
  • c4f9d0b chore(internal): move mypy configurations to pyproject.toml file
  • 2de8d7c release: 1.106.0
  • 2cf4ed5 feat: improve future compat with pydantic v3
  • 25d16be feat(client): support callable api_key (#2588)
  • 8672413 release: 1.105.0
  • 2c60d78 feat(api): Add gpt-realtime models
  • a52463c release: 1.104.2
  • 5a6931d fix(types): add aliases back for web search tool types
  • fb152d9 release: 1.104.1
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=openai&package-manager=uv&previous-version=1.102.0&new-version=1.106.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- uv.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/uv.lock b/uv.lock index df3a23e58..3d7713f54 100644 --- a/uv.lock +++ b/uv.lock @@ -2620,7 +2620,7 @@ wheels = [ [[package]] name = "openai" -version = "1.102.0" +version = "1.107.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -2632,9 +2632,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/07/55/da5598ed5c6bdd9939633854049cddc5cbac0da938dfcfcb3c6b119c16c0/openai-1.102.0.tar.gz", hash = "sha256:2e0153bcd64a6523071e90211cbfca1f2bbc5ceedd0993ba932a5869f93b7fc9", size = 519027, upload-time = "2025-08-26T20:50:29.397Z" } +sdist = { url = "https://files.pythonhosted.org/packages/88/67/d6498de300f83ff57a79cb7aa96ef3bef8d6f070c3ded0f1b5b45442a6bc/openai-1.107.0.tar.gz", hash = "sha256:43e04927584e57d0e9e640ee0077c78baf8150098be96ebd5c512539b6c4e9a4", size = 566056, upload-time = "2025-09-08T19:25:47.604Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/bd/0d/c9e7016d82c53c5b5e23e2bad36daebb8921ed44f69c0a985c6529a35106/openai-1.102.0-py3-none-any.whl", hash = "sha256:d751a7e95e222b5325306362ad02a7aa96e1fab3ed05b5888ce1c7ca63451345", size = 812015, upload-time = "2025-08-26T20:50:27.219Z" }, + { url = "https://files.pythonhosted.org/packages/91/ed/e8a4fd20390f2858b95227c288df8fe0c835f7c77625f7583609161684ba/openai-1.107.0-py3-none-any.whl", hash = "sha256:3dcfa3cbb116bd6924b27913b8da28c4a787379ff60049588547a1013e6d6438", size = 950968, upload-time = "2025-09-08T19:25:45.552Z" }, ] [[package]] From d4e45cd5f1e099d9f6ac2d52ad6cd3f74cc4facf Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 10 Sep 2025 13:18:14 -0700 Subject: [PATCH 10/30] chore(ui-deps): bump tailwindcss from 4.1.6 to 4.1.13 in /llama_stack/ui (#3362) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [tailwindcss](https://github.com/tailwindlabs/tailwindcss/tree/HEAD/packages/tailwindcss) from 4.1.6 to 4.1.13.
Release notes

Sourced from tailwindcss's releases.

v4.1.13

Changed

  • Drop warning from browser build (#18731)
  • Drop exact duplicate declarations when emitting CSS (#18809)

Fixed

  • Don't transition visibility when using transition (#18795)
  • Discard matched variants with unknown named values (#18799)
  • Discard matched variants with non-string values (#18799)
  • Show suggestions for known matchVariant values (#18798)
  • Replace deprecated clip with clip-path in sr-only (#18769)
  • Hide internal fields from completions in matchUtilities (#18820)
  • Ignore .vercel folders by default (can be overridden by @source … rules) (#18855)
  • Consider variants starting with @- to be invalid (e.g. @-2xl:flex) (#18869)
  • Do not allow custom variants to start or end with a - or _ (#18867, #18872)
  • Upgrade: Migrate aria theme keys to @custom-variant (#18815)
  • Upgrade: Migrate data theme keys to @custom-variant (#18816)
  • Upgrade: Migrate supports theme keys to @custom-variant (#18817)

v4.1.12

Fixed

  • Don't consider the global important state in @apply (#18404)
  • Add missing suggestions for flex-<number> utilities (#18642)
  • Fix trailing ) from interfering with extraction in Clojure keywords (#18345)
  • Detect classes inside Elixir charlist, word list, and string sigils (#18432)
  • Track source locations through @plugin and @config (#18345)
  • Allow boolean values of process.env.DEBUG in @tailwindcss/node (#18485)
  • Ignore consecutive semicolons in the CSS parser (#18532)
  • Center the dropdown icon added to an input with a paired datalist by default (#18511)
  • Extract candidates in Slang templates (#18565)
  • Improve error messages when encountering invalid functional utility names (#18568)
  • Discard CSS AST objects with false or undefined properties (#18571)
  • Allow users to disable URL rebasing in @tailwindcss/postcss via transformAssetUrls: false (#18321)
  • Fix false-positive migrations in addEventListener and JavaScript variable names (#18718)
  • Fix Standalone CLI showing default Bun help when run via symlink on Windows (#18723)
  • Read from --border-color-* theme keys in divide-* utilities for backwards compatibility (#18704)
  • Don't scan .hdr and .exr files for classes by default (#18734)

v4.1.11

Fixed

  • Add heuristic to skip candidate migrations inside emit(…) (#18330)
  • Extract candidates with variants in Clojure/ClojureScript keywords (#18338)
  • Document --watch=always in the CLI's usage (#18337)
  • Add support for Vite 7 to @tailwindcss/vite (#18384)

v4.1.10

... (truncated)

Changelog

Sourced from tailwindcss's changelog.

[4.1.13] - 2025-09-03

Changed

  • Drop warning from browser build (#18731)
  • Drop exact duplicate declarations when emitting CSS (#18809)

Fixed

  • Don't transition visibility when using transition (#18795)
  • Discard matched variants with unknown named values (#18799)
  • Discard matched variants with non-string values (#18799)
  • Show suggestions for known matchVariant values (#18798)
  • Replace deprecated clip with clip-path in sr-only (#18769)
  • Hide internal fields from completions in matchUtilities (#18820)
  • Ignore .vercel folders by default (can be overridden by @source … rules) (#18855)
  • Consider variants starting with @- to be invalid (e.g. @-2xl:flex) (#18869)
  • Do not allow custom variants to start or end with a - or _ (#18867, #18872)
  • Upgrade: Migrate aria theme keys to @custom-variant (#18815)
  • Upgrade: Migrate data theme keys to @custom-variant (#18816)
  • Upgrade: Migrate supports theme keys to @custom-variant (#18817)

[4.1.12] - 2025-08-13

Fixed

  • Don't consider the global important state in @apply (#18404)
  • Add missing suggestions for flex-<number> utilities (#18642)
  • Fix trailing ) from interfering with extraction in Clojure keywords (#18345)
  • Detect classes inside Elixir charlist, word list, and string sigils (#18432)
  • Track source locations through @plugin and @config (#18345)
  • Allow boolean values of process.env.DEBUG in @tailwindcss/node (#18485)
  • Ignore consecutive semicolons in the CSS parser (#18532)
  • Center the dropdown icon added to an input with a paired datalist by default (#18511)
  • Extract candidates in Slang templates (#18565)
  • Improve error messages when encountering invalid functional utility names (#18568)
  • Discard CSS AST objects with false or undefined properties (#18571)
  • Allow users to disable URL rebasing in @tailwindcss/postcss via transformAssetUrls: false (#18321)
  • Fix false-positive migrations in addEventListener and JavaScript variable names (#18718)
  • Fix Standalone CLI showing default Bun help when run via symlink on Windows (#18723)
  • Read from --border-color-* theme keys in divide-* utilities for backwards compatibility (#18704)
  • Don't scan .hdr and .exr files for classes by default (#18734)

[4.1.11] - 2025-06-26

Fixed

  • Add heuristic to skip candidate migrations inside emit(…) (#18330)
  • Extract candidates with variants in Clojure/ClojureScript keywords (#18338)
  • Document --watch=always in the CLI's usage (#18337)

... (truncated)

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=tailwindcss&package-manager=npm_and_yarn&previous-version=4.1.6&new-version=4.1.13)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- llama_stack/ui/package-lock.json | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/llama_stack/ui/package-lock.json b/llama_stack/ui/package-lock.json index 1db1c61cd..e2c0815fd 100644 --- a/llama_stack/ui/package-lock.json +++ b/llama_stack/ui/package-lock.json @@ -3578,6 +3578,13 @@ "tailwindcss": "4.1.6" } }, + "node_modules/@tailwindcss/node/node_modules/tailwindcss": { + "version": "4.1.6", + "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.6.tgz", + "integrity": "sha512-j0cGLTreM6u4OWzBeLBpycK0WIh8w7kSwcUsQZoGLHZ7xDTdM69lN64AgoIEEwFi0tnhs4wSykUa5YWxAzgFYg==", + "dev": true, + "license": "MIT" + }, "node_modules/@tailwindcss/oxide": { "version": "4.1.6", "resolved": "https://registry.npmjs.org/@tailwindcss/oxide/-/oxide-4.1.6.tgz", @@ -3838,6 +3845,13 @@ "tailwindcss": "4.1.6" } }, + "node_modules/@tailwindcss/postcss/node_modules/tailwindcss": { + "version": "4.1.6", + "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.6.tgz", + "integrity": "sha512-j0cGLTreM6u4OWzBeLBpycK0WIh8w7kSwcUsQZoGLHZ7xDTdM69lN64AgoIEEwFi0tnhs4wSykUa5YWxAzgFYg==", + "dev": true, + "license": "MIT" + }, "node_modules/@testing-library/dom": { "version": "10.4.1", "resolved": "https://registry.npmjs.org/@testing-library/dom/-/dom-10.4.1.tgz", @@ -13843,9 +13857,9 @@ } }, "node_modules/tailwindcss": { - "version": "4.1.6", - "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.6.tgz", - "integrity": "sha512-j0cGLTreM6u4OWzBeLBpycK0WIh8w7kSwcUsQZoGLHZ7xDTdM69lN64AgoIEEwFi0tnhs4wSykUa5YWxAzgFYg==", + "version": "4.1.13", + "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.13.tgz", + "integrity": "sha512-i+zidfmTqtwquj4hMEwdjshYYgMbOrPzb9a0M3ZgNa0JMoZeFC6bxZvO8yr8ozS6ix2SDz0+mvryPeBs2TFE+w==", "dev": true, "license": "MIT" }, From d2f88a10fb0cf366708ec106696c812b8c85629c Mon Sep 17 00:00:00 2001 From: ehhuang Date: Wed, 10 Sep 2025 13:19:36 -0700 Subject: [PATCH 11/30] chore: telemetry test (#3405) # What does this PR do? - removed fixed-duration sleeps ## Test Plan --- .../telemetry/test_openai_telemetry.py | 17 ++++++++--------- tests/integration/telemetry/test_telemetry.py | 5 +---- .../telemetry/test_telemetry_metrics.py | 5 +---- 3 files changed, 10 insertions(+), 17 deletions(-) diff --git a/tests/integration/telemetry/test_openai_telemetry.py b/tests/integration/telemetry/test_openai_telemetry.py index cdd9b6702..b3ffb6b09 100644 --- a/tests/integration/telemetry/test_openai_telemetry.py +++ b/tests/integration/telemetry/test_openai_telemetry.py @@ -49,16 +49,13 @@ def setup_openai_telemetry_data(llama_stack_client, text_model_id): traces = llama_stack_client.telemetry.query_traces(limit=10) if len(traces) >= 5: # 5 OpenAI completion traces break - time.sleep(1) + time.sleep(0.1) if len(traces) < 5: pytest.fail( f"Failed to create sufficient OpenAI completion telemetry data after 30s. Got {len(traces)} traces." 
) - # Wait for 5 seconds to ensure traces has completed logging - time.sleep(5) - yield @@ -185,11 +182,13 @@ def test_openai_completion_creates_telemetry(llama_stack_client, text_model_id): assert len(response.choices) > 0, "Response should have at least one choice" # Wait for telemetry to be recorded - time.sleep(3) - - # Check that we have more traces now - final_traces = llama_stack_client.telemetry.query_traces(limit=20) - final_count = len(final_traces) + start_time = time.time() + while time.time() - start_time < 30: + final_traces = llama_stack_client.telemetry.query_traces(limit=20) + final_count = len(final_traces) + if final_count > initial_count: + break + time.sleep(0.1) # Should have at least as many traces as before (might have more due to other activity) assert final_count >= initial_count, "Should have at least as many traces after OpenAI call" diff --git a/tests/integration/telemetry/test_telemetry.py b/tests/integration/telemetry/test_telemetry.py index d363edbc0..e86da954e 100644 --- a/tests/integration/telemetry/test_telemetry.py +++ b/tests/integration/telemetry/test_telemetry.py @@ -42,14 +42,11 @@ def setup_telemetry_data(llama_stack_client, text_model_id): traces = llama_stack_client.telemetry.query_traces(limit=10) if len(traces) >= 4: break - time.sleep(1) + time.sleep(0.1) if len(traces) < 4: pytest.fail(f"Failed to create sufficient telemetry data after 30s. Got {len(traces)} traces.") - # Wait for 5 seconds to ensure traces has completed logging - time.sleep(5) - yield diff --git a/tests/integration/telemetry/test_telemetry_metrics.py b/tests/integration/telemetry/test_telemetry_metrics.py index 4ba2bd2d9..1d8312ae2 100644 --- a/tests/integration/telemetry/test_telemetry_metrics.py +++ b/tests/integration/telemetry/test_telemetry_metrics.py @@ -46,10 +46,7 @@ def setup_telemetry_metrics_data(openai_client, client_with_models, text_model_i break except Exception: pass - time.sleep(1) - - # Wait additional time to ensure all metrics are processed - time.sleep(5) + time.sleep(0.1) # Return the token lists for use in tests return {"prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens, "total_tokens": total_tokens} From c04f1c1e8c0b8c9df80ab51ce7379476cf218317 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Wed, 10 Sep 2025 13:19:44 -0700 Subject: [PATCH 12/30] chore: move benchmarking related code (#3406) # What does this PR do? 
- moving things and some formatting changes ## Test Plan --- .../k8s-benchmark/README.md | 4 +- .../k8s-benchmark/apply.sh | 0 .../k8s-benchmark/benchmark.py | 129 +++++++------ .../k8s-benchmark/openai-mock-server.py | 170 ++++++++++-------- .../k8s-benchmark/profile_running_server.sh | 0 .../k8s-benchmark/run-benchmark.sh | 0 .../k8s-benchmark/stack-configmap.yaml | 0 .../k8s-benchmark/stack-k8s.yaml.template | 0 .../k8s-benchmark/stack_run_config.yaml | 0 docs/source/contributing/index.md | 2 +- 10 files changed, 156 insertions(+), 149 deletions(-) rename {docs/source/distributions => benchmarking}/k8s-benchmark/README.md (98%) rename {docs/source/distributions => benchmarking}/k8s-benchmark/apply.sh (100%) rename {docs/source/distributions => benchmarking}/k8s-benchmark/benchmark.py (80%) rename {docs/source/distributions => benchmarking}/k8s-benchmark/openai-mock-server.py (60%) rename {docs/source/distributions => benchmarking}/k8s-benchmark/profile_running_server.sh (100%) rename {docs/source/distributions => benchmarking}/k8s-benchmark/run-benchmark.sh (100%) rename {docs/source/distributions => benchmarking}/k8s-benchmark/stack-configmap.yaml (100%) rename {docs/source/distributions => benchmarking}/k8s-benchmark/stack-k8s.yaml.template (100%) rename {docs/source/distributions => benchmarking}/k8s-benchmark/stack_run_config.yaml (100%) diff --git a/docs/source/distributions/k8s-benchmark/README.md b/benchmarking/k8s-benchmark/README.md similarity index 98% rename from docs/source/distributions/k8s-benchmark/README.md rename to benchmarking/k8s-benchmark/README.md index 42da4d466..3b0d0c4db 100644 --- a/docs/source/distributions/k8s-benchmark/README.md +++ b/benchmarking/k8s-benchmark/README.md @@ -34,13 +34,12 @@ This data enables data-driven architectural decisions and performance optimizati **1. Deploy base k8s infrastructure:** ```bash -cd ../k8s +cd ../../docs/source/distributions/k8s ./apply.sh ``` **2. 
Deploy benchmark components:** ```bash -cd ../k8s-benchmark ./apply.sh ``` @@ -56,7 +55,6 @@ kubectl get pods **Benchmark Llama Stack (default):** ```bash -cd docs/source/distributions/k8s-benchmark/ ./run-benchmark.sh ``` diff --git a/docs/source/distributions/k8s-benchmark/apply.sh b/benchmarking/k8s-benchmark/apply.sh similarity index 100% rename from docs/source/distributions/k8s-benchmark/apply.sh rename to benchmarking/k8s-benchmark/apply.sh diff --git a/docs/source/distributions/k8s-benchmark/benchmark.py b/benchmarking/k8s-benchmark/benchmark.py similarity index 80% rename from docs/source/distributions/k8s-benchmark/benchmark.py rename to benchmarking/k8s-benchmark/benchmark.py index 83ba9602a..d5e34aa23 100644 --- a/docs/source/distributions/k8s-benchmark/benchmark.py +++ b/benchmarking/k8s-benchmark/benchmark.py @@ -14,7 +14,7 @@ import os import random import statistics import time -from typing import Tuple + import aiohttp @@ -55,50 +55,50 @@ class BenchmarkStats: total_time = self.end_time - self.start_time success_rate = (self.success_count / self.total_requests) * 100 - - print(f"\n{'='*60}") - print(f"BENCHMARK RESULTS") - - print(f"\nResponse Time Statistics:") + + print(f"\n{'=' * 60}") + print("BENCHMARK RESULTS") + + print("\nResponse Time Statistics:") print(f" Mean: {statistics.mean(self.response_times):.3f}s") print(f" Median: {statistics.median(self.response_times):.3f}s") print(f" Min: {min(self.response_times):.3f}s") print(f" Max: {max(self.response_times):.3f}s") - + if len(self.response_times) > 1: print(f" Std Dev: {statistics.stdev(self.response_times):.3f}s") - + percentiles = [50, 90, 95, 99] sorted_times = sorted(self.response_times) - print(f"\nPercentiles:") + print("\nPercentiles:") for p in percentiles: idx = int(len(sorted_times) * p / 100) - 1 idx = max(0, min(idx, len(sorted_times) - 1)) print(f" P{p}: {sorted_times[idx]:.3f}s") - + if self.ttft_times: - print(f"\nTime to First Token (TTFT) Statistics:") + print("\nTime to First Token (TTFT) Statistics:") print(f" Mean: {statistics.mean(self.ttft_times):.3f}s") print(f" Median: {statistics.median(self.ttft_times):.3f}s") print(f" Min: {min(self.ttft_times):.3f}s") print(f" Max: {max(self.ttft_times):.3f}s") - + if len(self.ttft_times) > 1: print(f" Std Dev: {statistics.stdev(self.ttft_times):.3f}s") - + sorted_ttft = sorted(self.ttft_times) - print(f"\nTTFT Percentiles:") + print("\nTTFT Percentiles:") for p in percentiles: idx = int(len(sorted_ttft) * p / 100) - 1 idx = max(0, min(idx, len(sorted_ttft) - 1)) print(f" P{p}: {sorted_ttft[idx]:.3f}s") - + if self.chunks_received: - print(f"\nStreaming Statistics:") + print("\nStreaming Statistics:") print(f" Mean chunks per response: {statistics.mean(self.chunks_received):.1f}") print(f" Total chunks received: {sum(self.chunks_received)}") - - print(f"{'='*60}") + + print(f"{'=' * 60}") print(f"Total time: {total_time:.2f}s") print(f"Concurrent users: {self.concurrent_users}") print(f"Total requests: {self.total_requests}") @@ -106,16 +106,16 @@ class BenchmarkStats: print(f"Failed requests: {len(self.errors)}") print(f"Success rate: {success_rate:.1f}%") print(f"Requests per second: {self.success_count / total_time:.2f}") - + if self.errors: - print(f"\nErrors (showing first 5):") + print("\nErrors (showing first 5):") for error in self.errors[:5]: print(f" {error}") class LlamaStackBenchmark: def __init__(self, base_url: str, model_id: str): - self.base_url = base_url.rstrip('/') + self.base_url = base_url.rstrip("/") self.model_id = model_id 
self.headers = {"Content-Type": "application/json"} self.test_messages = [ @@ -126,74 +126,67 @@ class LlamaStackBenchmark: [ {"role": "user", "content": "What is machine learning?"}, {"role": "assistant", "content": "Machine learning is a subset of AI..."}, - {"role": "user", "content": "Can you give me a practical example?"} - ] + {"role": "user", "content": "Can you give me a practical example?"}, + ], ] - - async def make_async_streaming_request(self) -> Tuple[float, int, float | None, str | None]: + async def make_async_streaming_request(self) -> tuple[float, int, float | None, str | None]: """Make a single async streaming chat completion request.""" messages = random.choice(self.test_messages) - payload = { - "model": self.model_id, - "messages": messages, - "stream": True, - "max_tokens": 100 - } - + payload = {"model": self.model_id, "messages": messages, "stream": True, "max_tokens": 100} + start_time = time.time() chunks_received = 0 ttft = None error = None - + session = aiohttp.ClientSession() - + try: async with session.post( f"{self.base_url}/chat/completions", headers=self.headers, json=payload, - timeout=aiohttp.ClientTimeout(total=30) + timeout=aiohttp.ClientTimeout(total=30), ) as response: if response.status == 200: async for line in response.content: if line: - line_str = line.decode('utf-8').strip() - if line_str.startswith('data: '): + line_str = line.decode("utf-8").strip() + if line_str.startswith("data: "): chunks_received += 1 if ttft is None: ttft = time.time() - start_time - if line_str == 'data: [DONE]': + if line_str == "data: [DONE]": break - + if chunks_received == 0: error = "No streaming chunks received" else: text = await response.text() error = f"HTTP {response.status}: {text[:100]}" - + except Exception as e: error = f"Request error: {str(e)}" finally: await session.close() - + response_time = time.time() - start_time return response_time, chunks_received, ttft, error - async def run_benchmark(self, duration: int, concurrent_users: int) -> BenchmarkStats: """Run benchmark using async requests for specified duration.""" stats = BenchmarkStats() stats.concurrent_users = concurrent_users stats.start_time = time.time() - + print(f"Starting benchmark: {duration}s duration, {concurrent_users} concurrent users") print(f"Target URL: {self.base_url}/chat/completions") print(f"Model: {self.model_id}") - + connector = aiohttp.TCPConnector(limit=concurrent_users) - async with aiohttp.ClientSession(connector=connector) as session: - + async with aiohttp.ClientSession(connector=connector): + async def worker(worker_id: int): """Worker that sends requests sequentially until canceled.""" request_count = 0 @@ -202,12 +195,12 @@ class LlamaStackBenchmark: response_time, chunks, ttft, error = await self.make_async_streaming_request() await stats.add_result(response_time, chunks, ttft, error) request_count += 1 - + except asyncio.CancelledError: break except Exception as e: await stats.add_result(0, 0, None, f"Worker {worker_id} error: {str(e)}") - + # Progress reporting task async def progress_reporter(): last_report_time = time.time() @@ -216,48 +209,52 @@ class LlamaStackBenchmark: await asyncio.sleep(1) # Report every second if time.time() >= last_report_time + 10: # Report every 10 seconds elapsed = time.time() - stats.start_time - print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s, RPS: {stats.total_requests / elapsed:.1f}") + print( + f"Completed: {stats.total_requests} requests in {elapsed:.1f}s, RPS: {stats.total_requests / elapsed:.1f}" + ) 
last_report_time = time.time() except asyncio.CancelledError: break - + # Spawn concurrent workers tasks = [asyncio.create_task(worker(i)) for i in range(concurrent_users)] progress_task = asyncio.create_task(progress_reporter()) tasks.append(progress_task) - + # Wait for duration then cancel all tasks await asyncio.sleep(duration) - + for task in tasks: task.cancel() - + # Wait for all tasks to complete await asyncio.gather(*tasks, return_exceptions=True) - + stats.end_time = time.time() return stats def main(): parser = argparse.ArgumentParser(description="Llama Stack Benchmark Tool") - parser.add_argument("--base-url", default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"), - help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)") - parser.add_argument("--model", default=os.getenv("INFERENCE_MODEL", "test-model"), - help="Model ID to use for requests") - parser.add_argument("--duration", type=int, default=60, - help="Duration in seconds to run benchmark (default: 60)") - parser.add_argument("--concurrent", type=int, default=10, - help="Number of concurrent users (default: 10)") - + parser.add_argument( + "--base-url", + default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"), + help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)", + ) + parser.add_argument( + "--model", default=os.getenv("INFERENCE_MODEL", "test-model"), help="Model ID to use for requests" + ) + parser.add_argument("--duration", type=int, default=60, help="Duration in seconds to run benchmark (default: 60)") + parser.add_argument("--concurrent", type=int, default=10, help="Number of concurrent users (default: 10)") + args = parser.parse_args() - + benchmark = LlamaStackBenchmark(args.base_url, args.model) - + try: stats = asyncio.run(benchmark.run_benchmark(args.duration, args.concurrent)) stats.print_summary() - + except KeyboardInterrupt: print("\nBenchmark interrupted by user") except Exception as e: diff --git a/docs/source/distributions/k8s-benchmark/openai-mock-server.py b/benchmarking/k8s-benchmark/openai-mock-server.py similarity index 60% rename from docs/source/distributions/k8s-benchmark/openai-mock-server.py rename to benchmarking/k8s-benchmark/openai-mock-server.py index de0680842..9e898af8e 100755 --- a/docs/source/distributions/k8s-benchmark/openai-mock-server.py +++ b/benchmarking/k8s-benchmark/openai-mock-server.py @@ -11,180 +11,192 @@ OpenAI-compatible mock server that returns: - Valid OpenAI-formatted chat completion responses with dynamic content """ -from flask import Flask, request, jsonify, Response -import time -import random -import uuid -import json import argparse +import json import os +import random +import time +import uuid + +from flask import Flask, Response, jsonify, request app = Flask(__name__) + # Models from environment variables def get_models(): models_str = os.getenv("MOCK_MODELS", "meta-llama/Llama-3.2-3B-Instruct") model_ids = [m.strip() for m in models_str.split(",") if m.strip()] - + return { "object": "list", "data": [ - { - "id": model_id, - "object": "model", - "created": 1234567890, - "owned_by": "vllm" - } - for model_id in model_ids - ] + {"id": model_id, "object": "model", "created": 1234567890, "owned_by": "vllm"} for model_id in model_ids + ], } + def generate_random_text(length=50): """Generate random but coherent text for responses.""" words = [ - "Hello", "there", "I'm", "an", "AI", "assistant", "ready", "to", "help", "you", - "with", "your", "questions", "and", "tasks", "today", 
"Let", "me","know", "what", - "you'd", "like", "to", "discuss", "or", "explore", "together", "I", "can", "assist", - "with", "various", "topics", "including", "coding", "writing", "analysis", "and", "more" + "Hello", + "there", + "I'm", + "an", + "AI", + "assistant", + "ready", + "to", + "help", + "you", + "with", + "your", + "questions", + "and", + "tasks", + "today", + "Let", + "me", + "know", + "what", + "you'd", + "like", + "to", + "discuss", + "or", + "explore", + "together", + "I", + "can", + "assist", + "with", + "various", + "topics", + "including", + "coding", + "writing", + "analysis", + "and", + "more", ] return " ".join(random.choices(words, k=length)) -@app.route('/v1/models', methods=['GET']) + +@app.route("/v1/models", methods=["GET"]) def list_models(): models = get_models() print(f"[MOCK] Returning models: {[m['id'] for m in models['data']]}") return jsonify(models) -@app.route('/v1/chat/completions', methods=['POST']) + +@app.route("/v1/chat/completions", methods=["POST"]) def chat_completions(): """Return OpenAI-formatted chat completion responses.""" data = request.get_json() - default_model = get_models()['data'][0]['id'] - model = data.get('model', default_model) - messages = data.get('messages', []) - stream = data.get('stream', False) - + default_model = get_models()["data"][0]["id"] + model = data.get("model", default_model) + messages = data.get("messages", []) + stream = data.get("stream", False) + print(f"[MOCK] Chat completion request - model: {model}, stream: {stream}") - + if stream: return handle_streaming_completion(model, messages) else: return handle_non_streaming_completion(model, messages) + def handle_non_streaming_completion(model, messages): response_text = generate_random_text(random.randint(20, 80)) - + # Calculate realistic token counts - prompt_tokens = sum(len(str(msg.get('content', '')).split()) for msg in messages) + prompt_tokens = sum(len(str(msg.get("content", "")).split()) for msg in messages) completion_tokens = len(response_text.split()) - + response = { "id": f"chatcmpl-{uuid.uuid4().hex[:8]}", "object": "chat.completion", "created": int(time.time()), "model": model, - "choices": [ - { - "index": 0, - "message": { - "role": "assistant", - "content": response_text - }, - "finish_reason": "stop" - } - ], + "choices": [{"index": 0, "message": {"role": "assistant", "content": response_text}, "finish_reason": "stop"}], "usage": { "prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens - } + "total_tokens": prompt_tokens + completion_tokens, + }, } - + return jsonify(response) + def handle_streaming_completion(model, messages): def generate_stream(): # Generate response text full_response = generate_random_text(random.randint(30, 100)) words = full_response.split() - + # Send initial chunk initial_chunk = { "id": f"chatcmpl-{uuid.uuid4().hex[:8]}", "object": "chat.completion.chunk", "created": int(time.time()), "model": model, - "choices": [ - { - "index": 0, - "delta": {"role": "assistant", "content": ""} - } - ] + "choices": [{"index": 0, "delta": {"role": "assistant", "content": ""}}], } yield f"data: {json.dumps(initial_chunk)}\n\n" - + # Send word by word for i, word in enumerate(words): chunk = { "id": f"chatcmpl-{uuid.uuid4().hex[:8]}", - "object": "chat.completion.chunk", + "object": "chat.completion.chunk", "created": int(time.time()), "model": model, - "choices": [ - { - "index": 0, - "delta": {"content": f"{word} " if i < len(words) - 1 else word} - } - ] + 
"choices": [{"index": 0, "delta": {"content": f"{word} " if i < len(words) - 1 else word}}], } yield f"data: {json.dumps(chunk)}\n\n" # Configurable delay to simulate realistic streaming stream_delay = float(os.getenv("STREAM_DELAY_SECONDS", "0.005")) time.sleep(stream_delay) - + # Send final chunk final_chunk = { "id": f"chatcmpl-{uuid.uuid4().hex[:8]}", "object": "chat.completion.chunk", "created": int(time.time()), "model": model, - "choices": [ - { - "index": 0, - "delta": {"content": ""}, - "finish_reason": "stop" - } - ] + "choices": [{"index": 0, "delta": {"content": ""}, "finish_reason": "stop"}], } yield f"data: {json.dumps(final_chunk)}\n\n" yield "data: [DONE]\n\n" - + return Response( generate_stream(), - mimetype='text/event-stream', + mimetype="text/event-stream", headers={ - 'Cache-Control': 'no-cache', - 'Connection': 'keep-alive', - 'Access-Control-Allow-Origin': '*', - } + "Cache-Control": "no-cache", + "Connection": "keep-alive", + "Access-Control-Allow-Origin": "*", + }, ) -@app.route('/health', methods=['GET']) + +@app.route("/health", methods=["GET"]) def health(): return jsonify({"status": "healthy", "type": "openai-mock"}) -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='OpenAI-compatible mock server') - parser.add_argument('--port', type=int, default=8081, - help='Port to run the server on (default: 8081)') + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="OpenAI-compatible mock server") + parser.add_argument("--port", type=int, default=8081, help="Port to run the server on (default: 8081)") args = parser.parse_args() - + port = args.port - + models = get_models() print("Starting OpenAI-compatible mock server...") print(f"- /models endpoint with: {[m['id'] for m in models['data']]}") print("- OpenAI-formatted chat/completion responses with dynamic content") print("- Streaming support with valid SSE format") print(f"- Listening on: http://0.0.0.0:{port}") - app.run(host='0.0.0.0', port=port, debug=False) + app.run(host="0.0.0.0", port=port, debug=False) diff --git a/docs/source/distributions/k8s-benchmark/profile_running_server.sh b/benchmarking/k8s-benchmark/profile_running_server.sh similarity index 100% rename from docs/source/distributions/k8s-benchmark/profile_running_server.sh rename to benchmarking/k8s-benchmark/profile_running_server.sh diff --git a/docs/source/distributions/k8s-benchmark/run-benchmark.sh b/benchmarking/k8s-benchmark/run-benchmark.sh similarity index 100% rename from docs/source/distributions/k8s-benchmark/run-benchmark.sh rename to benchmarking/k8s-benchmark/run-benchmark.sh diff --git a/docs/source/distributions/k8s-benchmark/stack-configmap.yaml b/benchmarking/k8s-benchmark/stack-configmap.yaml similarity index 100% rename from docs/source/distributions/k8s-benchmark/stack-configmap.yaml rename to benchmarking/k8s-benchmark/stack-configmap.yaml diff --git a/docs/source/distributions/k8s-benchmark/stack-k8s.yaml.template b/benchmarking/k8s-benchmark/stack-k8s.yaml.template similarity index 100% rename from docs/source/distributions/k8s-benchmark/stack-k8s.yaml.template rename to benchmarking/k8s-benchmark/stack-k8s.yaml.template diff --git a/docs/source/distributions/k8s-benchmark/stack_run_config.yaml b/benchmarking/k8s-benchmark/stack_run_config.yaml similarity index 100% rename from docs/source/distributions/k8s-benchmark/stack_run_config.yaml rename to benchmarking/k8s-benchmark/stack_run_config.yaml diff --git a/docs/source/contributing/index.md b/docs/source/contributing/index.md 
index 1846f4d97..71c3bd5a6 100644 --- a/docs/source/contributing/index.md +++ b/docs/source/contributing/index.md @@ -35,5 +35,5 @@ testing/record-replay ### Benchmarking -```{include} ../../../docs/source/distributions/k8s-benchmark/README.md +```{include} ../../../benchmarking/k8s-benchmark/README.md ``` From 0c7f49490cdb6ff757659469d1401b515ac4402c Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Wed, 10 Sep 2025 14:34:18 -0700 Subject: [PATCH 13/30] fix(inference_store): on duplicate chat completion IDs, replace (#3408) # What does this PR do? Duplicate chat completion IDs can be generated during tests especially if they are replaying recorded responses across different tests. No need to warn or error under those circumstances. In the wild, this is not likely to happen at all (no evidence) so we aren't really hiding any problem. --- .../utils/inference/inference_store.py | 48 +++++++++++++++---- .../utils/sqlstore/authorized_sqlstore.py | 14 ++++++ 2 files changed, 53 insertions(+), 9 deletions(-) diff --git a/llama_stack/providers/utils/inference/inference_store.py b/llama_stack/providers/utils/inference/inference_store.py index 8c69b1683..17f4c6268 100644 --- a/llama_stack/providers/utils/inference/inference_store.py +++ b/llama_stack/providers/utils/inference/inference_store.py @@ -6,6 +6,8 @@ import asyncio from typing import Any +from sqlalchemy.exc import IntegrityError + from llama_stack.apis.inference import ( ListOpenAIChatCompletionResponse, OpenAIChatCompletion, @@ -129,16 +131,44 @@ class InferenceStore: raise ValueError("Inference store is not initialized") data = chat_completion.model_dump() + record_data = { + "id": data["id"], + "created": data["created"], + "model": data["model"], + "choices": data["choices"], + "input_messages": [message.model_dump() for message in input_messages], + } - await self.sql_store.insert( - table="chat_completions", - data={ - "id": data["id"], - "created": data["created"], - "model": data["model"], - "choices": data["choices"], - "input_messages": [message.model_dump() for message in input_messages], - }, + try: + await self.sql_store.insert( + table="chat_completions", + data=record_data, + ) + except IntegrityError as e: + # Duplicate chat completion IDs can be generated during tests especially if they are replaying + # recorded responses across different tests. No need to warn or error under those circumstances. + # In the wild, this is not likely to happen at all (no evidence) so we aren't really hiding any problem. 
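+            # In effect the insert becomes an upsert: on a duplicate chat completion ID
+            # we fall through to the update below so the newest record wins.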
+ + # Check if it's a unique constraint violation + error_message = str(e.orig) if e.orig else str(e) + if self._is_unique_constraint_error(error_message): + # Update the existing record instead + await self.sql_store.update(table="chat_completions", data=record_data, where={"id": data["id"]}) + else: + # Re-raise if it's not a unique constraint error + raise + + def _is_unique_constraint_error(self, error_message: str) -> bool: + """Check if the error is specifically a unique constraint violation.""" + error_lower = error_message.lower() + return any( + indicator in error_lower + for indicator in [ + "unique constraint failed", # SQLite + "duplicate key", # PostgreSQL + "unique violation", # PostgreSQL alternative + "duplicate entry", # MySQL + ] ) async def list_chat_completions( diff --git a/llama_stack/providers/utils/sqlstore/authorized_sqlstore.py b/llama_stack/providers/utils/sqlstore/authorized_sqlstore.py index 867ba2f55..acb688f96 100644 --- a/llama_stack/providers/utils/sqlstore/authorized_sqlstore.py +++ b/llama_stack/providers/utils/sqlstore/authorized_sqlstore.py @@ -172,6 +172,20 @@ class AuthorizedSqlStore: return results.data[0] if results.data else None + async def update(self, table: str, data: Mapping[str, Any], where: Mapping[str, Any]) -> None: + """Update rows with automatic access control attribute capture.""" + enhanced_data = dict(data) + + current_user = get_authenticated_user() + if current_user: + enhanced_data["owner_principal"] = current_user.principal + enhanced_data["access_attributes"] = current_user.attributes + else: + enhanced_data["owner_principal"] = None + enhanced_data["access_attributes"] = None + + await self.sql_store.update(table, enhanced_data, where) + async def delete(self, table: str, where: Mapping[str, Any]) -> None: """Delete rows with automatic access control filtering.""" await self.sql_store.delete(table, where) From 8e05c68d159a40d54768a9473d63b68a5bfbf369 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Thu, 11 Sep 2025 10:19:59 +0200 Subject: [PATCH 14/30] chore: remove openai dependency from providers (#3398) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? The openai package is already a dependency of the llama-stack project itself, so let the project dictate which openai version we need and avoid potential breakage with unsatisfiable dependency resolution.
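For reference, the only openai pin that remains is the project-level one, shown abridged below as it appears in this patch's pyproject.toml hunk. Providers now inherit whatever version the project resolves, so a provider can no longer pin a version the project cannot satisfy.

```
# pyproject.toml (abridged) — the single remaining openai pin after this PR
dependencies = [
    # ... other project dependencies ...
    "openai>=1.100.0",  # for expires_after support
]
```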
Signed-off-by: Sébastien Han --- llama_stack/providers/registry/batches.py | 2 +- llama_stack/providers/registry/inference.py | 20 ++++++++------------ llama_stack/providers/registry/scoring.py | 2 +- pyproject.toml | 4 +--- uv.lock | 8 ++------ 5 files changed, 13 insertions(+), 23 deletions(-) diff --git a/llama_stack/providers/registry/batches.py b/llama_stack/providers/registry/batches.py index de7886efb..a07942486 100644 --- a/llama_stack/providers/registry/batches.py +++ b/llama_stack/providers/registry/batches.py @@ -13,7 +13,7 @@ def available_providers() -> list[ProviderSpec]: InlineProviderSpec( api=Api.batches, provider_type="inline::reference", - pip_packages=["openai"], + pip_packages=[], module="llama_stack.providers.inline.batches.reference", config_class="llama_stack.providers.inline.batches.reference.config.ReferenceBatchesImplConfig", api_dependencies=[ diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py index 541fbb432..8912560cb 100644 --- a/llama_stack/providers/registry/inference.py +++ b/llama_stack/providers/registry/inference.py @@ -75,7 +75,7 @@ def available_providers() -> list[ProviderSpec]: api=Api.inference, adapter=AdapterSpec( adapter_type="vllm", - pip_packages=["openai"], + pip_packages=[], module="llama_stack.providers.remote.inference.vllm", config_class="llama_stack.providers.remote.inference.vllm.VLLMInferenceAdapterConfig", description="Remote vLLM inference provider for connecting to vLLM servers.", @@ -151,9 +151,7 @@ def available_providers() -> list[ProviderSpec]: api=Api.inference, adapter=AdapterSpec( adapter_type="databricks", - pip_packages=[ - "openai", - ], + pip_packages=[], module="llama_stack.providers.remote.inference.databricks", config_class="llama_stack.providers.remote.inference.databricks.DatabricksImplConfig", description="Databricks inference provider for running models on Databricks' unified analytics platform.", @@ -163,9 +161,7 @@ def available_providers() -> list[ProviderSpec]: api=Api.inference, adapter=AdapterSpec( adapter_type="nvidia", - pip_packages=[ - "openai", - ], + pip_packages=[], module="llama_stack.providers.remote.inference.nvidia", config_class="llama_stack.providers.remote.inference.nvidia.NVIDIAConfig", description="NVIDIA inference provider for accessing NVIDIA NIM models and AI services.", @@ -175,7 +171,7 @@ def available_providers() -> list[ProviderSpec]: api=Api.inference, adapter=AdapterSpec( adapter_type="runpod", - pip_packages=["openai"], + pip_packages=[], module="llama_stack.providers.remote.inference.runpod", config_class="llama_stack.providers.remote.inference.runpod.RunpodImplConfig", description="RunPod inference provider for running models on RunPod's cloud GPU platform.", @@ -207,7 +203,7 @@ def available_providers() -> list[ProviderSpec]: api=Api.inference, adapter=AdapterSpec( adapter_type="gemini", - pip_packages=["litellm", "openai"], + pip_packages=["litellm"], module="llama_stack.providers.remote.inference.gemini", config_class="llama_stack.providers.remote.inference.gemini.GeminiConfig", provider_data_validator="llama_stack.providers.remote.inference.gemini.config.GeminiProviderDataValidator", @@ -218,7 +214,7 @@ def available_providers() -> list[ProviderSpec]: api=Api.inference, adapter=AdapterSpec( adapter_type="vertexai", - pip_packages=["litellm", "google-cloud-aiplatform", "openai"], + pip_packages=["litellm", "google-cloud-aiplatform"], module="llama_stack.providers.remote.inference.vertexai", 
config_class="llama_stack.providers.remote.inference.vertexai.VertexAIConfig", provider_data_validator="llama_stack.providers.remote.inference.vertexai.config.VertexAIProviderDataValidator", @@ -248,7 +244,7 @@ Available Models: api=Api.inference, adapter=AdapterSpec( adapter_type="groq", - pip_packages=["litellm", "openai"], + pip_packages=["litellm"], module="llama_stack.providers.remote.inference.groq", config_class="llama_stack.providers.remote.inference.groq.GroqConfig", provider_data_validator="llama_stack.providers.remote.inference.groq.config.GroqProviderDataValidator", @@ -270,7 +266,7 @@ Available Models: api=Api.inference, adapter=AdapterSpec( adapter_type="sambanova", - pip_packages=["litellm", "openai"], + pip_packages=["litellm"], module="llama_stack.providers.remote.inference.sambanova", config_class="llama_stack.providers.remote.inference.sambanova.SambaNovaImplConfig", provider_data_validator="llama_stack.providers.remote.inference.sambanova.config.SambaNovaProviderDataValidator", diff --git a/llama_stack/providers/registry/scoring.py b/llama_stack/providers/registry/scoring.py index 79293d888..a4ec54ed2 100644 --- a/llama_stack/providers/registry/scoring.py +++ b/llama_stack/providers/registry/scoring.py @@ -38,7 +38,7 @@ def available_providers() -> list[ProviderSpec]: InlineProviderSpec( api=Api.scoring, provider_type="inline::braintrust", - pip_packages=["autoevals", "openai"], + pip_packages=["autoevals"], module="llama_stack.providers.inline.scoring.braintrust", config_class="llama_stack.providers.inline.scoring.braintrust.BraintrustScoringConfig", api_dependencies=[ diff --git a/pyproject.toml b/pyproject.toml index 0414aafb0..72c4f6f9e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ dependencies = [ "jinja2>=3.1.6", "jsonschema", "llama-stack-client>=0.2.21", - "openai>=1.99.6", + "openai>=1.100.0", # for expires_after support "prompt-toolkit", "python-dotenv", "python-jose[cryptography]", @@ -80,7 +80,6 @@ dev = [ unit = [ "sqlite-vec", "ollama", - "openai", "aiosqlite", "aiohttp", "psycopg2-binary>=2.9.0", @@ -105,7 +104,6 @@ unit = [ # separately. If you are using "uv" to execute your tests, you can use the "--group" flag to specify extra # dependencies. 
test = [ - "openai>=1.100.0", # for expires_after support "aiosqlite", "aiohttp", "torch>=2.6.0", diff --git a/uv.lock b/uv.lock index 3d7713f54..065eb3876 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.12" resolution-markers = [ "(python_full_version >= '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.13' and sys_platform != 'darwin' and sys_platform != 'linux')", @@ -1839,7 +1839,6 @@ test = [ { name = "datasets" }, { name = "mcp" }, { name = "milvus-lite" }, - { name = "openai" }, { name = "psycopg2-binary" }, { name = "pymilvus" }, { name = "pypdf" }, @@ -1865,7 +1864,6 @@ unit = [ { name = "milvus-lite" }, { name = "moto", extra = ["s3"] }, { name = "ollama" }, - { name = "openai" }, { name = "psycopg2-binary" }, { name = "pymilvus" }, { name = "pypdf" }, @@ -1889,7 +1887,7 @@ requires-dist = [ { name = "jsonschema" }, { name = "llama-stack-client", specifier = ">=0.2.21" }, { name = "llama-stack-client", marker = "extra == 'ui'", specifier = ">=0.2.21" }, - { name = "openai", specifier = ">=1.99.6" }, + { name = "openai", specifier = ">=1.100.0" }, { name = "opentelemetry-exporter-otlp-proto-http", specifier = ">=1.30.0" }, { name = "opentelemetry-sdk", specifier = ">=1.30.0" }, { name = "pandas", marker = "extra == 'ui'" }, @@ -1959,7 +1957,6 @@ test = [ { name = "datasets", specifier = ">=4.0.0" }, { name = "mcp" }, { name = "milvus-lite", specifier = ">=2.5.0" }, - { name = "openai", specifier = ">=1.100.0" }, { name = "psycopg2-binary", specifier = ">=2.9.0" }, { name = "pymilvus", specifier = ">=2.6.1" }, { name = "pypdf" }, @@ -1984,7 +1981,6 @@ unit = [ { name = "milvus-lite", specifier = ">=2.5.0" }, { name = "moto", extras = ["s3"], specifier = ">=5.1.10" }, { name = "ollama" }, - { name = "openai" }, { name = "psycopg2-binary", specifier = ">=2.9.0" }, { name = "pymilvus", specifier = ">=2.6.1" }, { name = "pypdf" }, From 2838d5a20f888c9f8fad666272dd9ca8d3bb4884 Mon Sep 17 00:00:00 2001 From: Sumanth Kamenani Date: Thu, 11 Sep 2025 05:41:53 -0400 Subject: [PATCH 15/30] fix: AWS Bedrock inference profile ID conversion for region-specific endpoints (#3386) Fixes #3370 AWS switched to requiring region-prefixed inference profile IDs instead of foundation model IDs for on-demand throughput. This was causing ValidationException errors. Added auto-detection based on boto3 client region to convert model IDs like meta.llama3-1-70b-instruct-v1:0 to us.meta.llama3-1-70b-instruct-v1:0 depending on the detected region. Also handles edge cases like ARNs, case insensitive regions, and None regions. Tested with this request. ```json { "model_id": "meta.llama3-1-8b-instruct-v1:0", "messages": [ { "role": "system", "content": "You are a helpful assistant." 
}, { "role": "user", "content": "tell me a riddle" } ], "sampling_params": { "strategy": { "type": "top_p", "temperature": 0.7, "top_p": 0.9 }, "max_tokens": 512 } } ``` --- .../remote/inference/bedrock/bedrock.py | 51 +++++++++++++++++- tests/unit/providers/test_bedrock.py | 53 +++++++++++++++++++ 2 files changed, 102 insertions(+), 2 deletions(-) create mode 100644 tests/unit/providers/test_bedrock.py diff --git a/llama_stack/providers/remote/inference/bedrock/bedrock.py b/llama_stack/providers/remote/inference/bedrock/bedrock.py index 63ea196f6..106caed9b 100644 --- a/llama_stack/providers/remote/inference/bedrock/bedrock.py +++ b/llama_stack/providers/remote/inference/bedrock/bedrock.py @@ -53,6 +53,43 @@ from llama_stack.providers.utils.inference.prompt_adapter import ( from .models import MODEL_ENTRIES +REGION_PREFIX_MAP = { + "us": "us.", + "eu": "eu.", + "ap": "ap.", +} + + +def _get_region_prefix(region: str | None) -> str: + # AWS requires region prefixes for inference profiles + if region is None: + return "us." # default to US when we don't know + + # Handle case insensitive region matching + region_lower = region.lower() + for prefix in REGION_PREFIX_MAP: + if region_lower.startswith(f"{prefix}-"): + return REGION_PREFIX_MAP[prefix] + + # Fallback to US for anything we don't recognize + return "us." + + +def _to_inference_profile_id(model_id: str, region: str = None) -> str: + # Return ARNs unchanged + if model_id.startswith("arn:"): + return model_id + + # Return inference profile IDs that already have regional prefixes + if any(model_id.startswith(p) for p in REGION_PREFIX_MAP.values()): + return model_id + + # Default to US East when no region is provided + if region is None: + region = "us-east-1" + + return _get_region_prefix(region) + model_id + class BedrockInferenceAdapter( ModelRegistryHelper, @@ -166,8 +203,13 @@ class BedrockInferenceAdapter( options["repetition_penalty"] = sampling_params.repetition_penalty prompt = await chat_completion_request_to_prompt(request, self.get_llama_model(request.model)) + + # Convert foundation model ID to inference profile ID + region_name = self.client.meta.region_name + inference_profile_id = _to_inference_profile_id(bedrock_model, region_name) + return { - "modelId": bedrock_model, + "modelId": inference_profile_id, "body": json.dumps( { "prompt": prompt, @@ -185,6 +227,11 @@ class BedrockInferenceAdapter( task_type: EmbeddingTaskType | None = None, ) -> EmbeddingsResponse: model = await self.model_store.get_model(model_id) + + # Convert foundation model ID to inference profile ID + region_name = self.client.meta.region_name + inference_profile_id = _to_inference_profile_id(model.provider_resource_id, region_name) + embeddings = [] for content in contents: assert not content_has_media(content), "Bedrock does not support media for embeddings" @@ -193,7 +240,7 @@ class BedrockInferenceAdapter( body = json.dumps(input_body) response = self.client.invoke_model( body=body, - modelId=model.provider_resource_id, + modelId=inference_profile_id, accept="application/json", contentType="application/json", ) diff --git a/tests/unit/providers/test_bedrock.py b/tests/unit/providers/test_bedrock.py new file mode 100644 index 000000000..1ff07bbbe --- /dev/null +++ b/tests/unit/providers/test_bedrock.py @@ -0,0 +1,53 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree.
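+
+# Unit tests for the region-prefix helpers added to the Bedrock adapter above:
+# prefix detection per AWS region, ARN passthrough, and the us-east-1 default,
+# e.g. _to_inference_profile_id("meta.llama3-1-70b-instruct-v1:0", "eu-west-1")
+# yields "eu.meta.llama3-1-70b-instruct-v1:0".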
+ +from llama_stack.providers.remote.inference.bedrock.bedrock import ( + _get_region_prefix, + _to_inference_profile_id, +) + + +def test_region_prefixes(): + assert _get_region_prefix("us-east-1") == "us." + assert _get_region_prefix("eu-west-1") == "eu." + assert _get_region_prefix("ap-south-1") == "ap." + assert _get_region_prefix("ca-central-1") == "us." + + # Test case insensitive + assert _get_region_prefix("US-EAST-1") == "us." + assert _get_region_prefix("EU-WEST-1") == "eu." + assert _get_region_prefix("Ap-South-1") == "ap." + + # Test None region + assert _get_region_prefix(None) == "us." + + +def test_model_id_conversion(): + # Basic conversion + assert ( + _to_inference_profile_id("meta.llama3-1-70b-instruct-v1:0", "us-east-1") == "us.meta.llama3-1-70b-instruct-v1:0" + ) + + # Already has prefix + assert ( + _to_inference_profile_id("us.meta.llama3-1-70b-instruct-v1:0", "us-east-1") + == "us.meta.llama3-1-70b-instruct-v1:0" + ) + + # ARN should be returned unchanged + arn = "arn:aws:bedrock:us-east-1:123456789012:inference-profile/us.meta.llama3-1-70b-instruct-v1:0" + assert _to_inference_profile_id(arn, "us-east-1") == arn + + # ARN should be returned unchanged even without region + assert _to_inference_profile_id(arn) == arn + + # Optional region parameter defaults to us-east-1 + assert _to_inference_profile_id("meta.llama3-1-70b-instruct-v1:0") == "us.meta.llama3-1-70b-instruct-v1:0" + + # Different regions work with optional parameter + assert ( + _to_inference_profile_id("meta.llama3-1-70b-instruct-v1:0", "eu-west-1") == "eu.meta.llama3-1-70b-instruct-v1:0" + ) From c2d281e01b360ba0a2db177b90df6e7ba4df8501 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Thu, 11 Sep 2025 07:48:19 -0400 Subject: [PATCH 16/30] chore(replay): improve replay robustness with un-validated construction (#3414) # What does this PR do? some providers do not produce spec compliant outputs. when this happens the replay infra will fail to construct the proper types and will return a dict to the client. the client likely does not expect a dict. this was discovered with tgi, which returns finish_reason="" when valid values are "stop", "length" or "content_filter" ## Test Plan ci --- llama_stack/testing/inference_recorder.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/llama_stack/testing/inference_recorder.py b/llama_stack/testing/inference_recorder.py index 298758c92..e78f493a6 100644 --- a/llama_stack/testing/inference_recorder.py +++ b/llama_stack/testing/inference_recorder.py @@ -105,8 +105,12 @@ def _deserialize_response(data: dict[str, Any]) -> Any: return cls.model_validate(data["__data__"]) except (ImportError, AttributeError, TypeError, ValueError) as e: - logger.warning(f"Failed to deserialize object of type {data['__type__']}: {e}") - return data["__data__"] + logger.warning(f"Failed to deserialize object of type {data['__type__']} with model_validate: {e}") + try: + return cls.model_construct(**data["__data__"]) + except Exception as e: + logger.warning(f"Failed to deserialize object of type {data['__type__']} with model_construct: {e}") + return data["__data__"] return data From f31bcc11bc9e4a88ce82dadafea8d4b0cb5f7230 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Thu, 11 Sep 2025 13:48:38 +0200 Subject: [PATCH 17/30] feat: add Azure OpenAI inference provider support (#3396) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? 
Llama Stack now supports a new OpenAI-compatible endpoint with Azure OpenAI. The starter distro has been updated to add the new remote inference provider. A few tests have been modified and improved. ## Test Plan Deploy a model in the Azure portal, then: ``` $ AZURE_API_KEY=... AZURE_API_BASE=... uv run llama stack build --image-type venv --providers inference=remote::azure --run ... $ LLAMA_STACK_CONFIG=http://localhost:8321 uv run --group test pytest -v -ra --text-model azure/gpt-4.1 tests/integration/inference/test_openai_completion.py ... Results: ``` ============================================= test session starts ============================================== platform darwin -- Python 3.12.8, pytest-8.4.1, pluggy-1.6.0 -- /Users/leseb/Documents/AI/llama-stack/.venv/bin/python3 cachedir: .pytest_cache metadata: {'Python': '3.12.8', 'Platform': 'macOS-15.6.1-arm64-arm-64bit', 'Packages': {'pytest': '8.4.1', 'pluggy': '1.6.0'}, 'Plugins': {'anyio': '4.9.0', 'html': '4.1.1', 'socket': '0.7.0', 'asyncio': '1.1.0', 'json-report': '1.5.0', 'timeout': '2.4.0', 'metadata': '3.1.1', 'cov': '6.2.1', 'nbval': '0.11.0', 'hydra-core': '1.3.2'}} rootdir: /Users/leseb/Documents/AI/llama-stack configfile: pyproject.toml plugins: anyio-4.9.0, html-4.1.1, socket-0.7.0, asyncio-1.1.0, json-report-1.5.0, timeout-2.4.0, metadata-3.1.1, cov-6.2.1, nbval-0.11.0, hydra-core-1.3.2 asyncio: mode=Mode.AUTO, asyncio_default_fixture_loop_scope=None, asyncio_default_test_loop_scope=function collected 27 items tests/integration/inference/test_openai_completion.py::test_openai_completion_non_streaming[txt=azure/gpt-5-mini-inference:completion:sanity] SKIPPED [ 3%] tests/integration/inference/test_openai_completion.py::test_openai_completion_non_streaming_suffix[txt=azure/gpt-5-mini-inference:completion:suffix] SKIPPED [ 7%] tests/integration/inference/test_openai_completion.py::test_openai_completion_streaming[txt=azure/gpt-5-mini-inference:completion:sanity] SKIPPED [ 11%] tests/integration/inference/test_openai_completion.py::test_openai_completion_prompt_logprobs[txt=azure/gpt-5-mini-1] SKIPPED [ 14%] tests/integration/inference/test_openai_completion.py::test_openai_completion_guided_choice[txt=azure/gpt-5-mini] SKIPPED [ 18%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:non_streaming_01] PASSED [ 22%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:streaming_01] PASSED [ 25%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming_with_n[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:streaming_01] PASSED [ 29%] tests/integration/inference/test_openai_completion.py::test_inference_store[openai_client-txt=azure/gpt-5-mini-True] PASSED [ 33%] tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[openai_client-txt=azure/gpt-5-mini-True] PASSED [ 37%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming_with_file[txt=azure/gpt-5-mini] SKIPPED
[ 40%] tests/integration/inference/test_openai_completion.py::test_openai_completion_prompt_logprobs[txt=azure/gpt-5-mini-0] SKIPPED [ 44%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:non_streaming_02] PASSED [ 48%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:streaming_02] PASSED [ 51%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming_with_n[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:streaming_02] PASSED [ 55%] tests/integration/inference/test_openai_completion.py::test_inference_store[openai_client-txt=azure/gpt-5-mini-False] PASSED [ 59%] tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[openai_client-txt=azure/gpt-5-mini-False] PASSED [ 62%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:non_streaming_01] PASSED [ 66%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:streaming_01] PASSED [ 70%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming_with_n[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:streaming_01] PASSED [ 74%] tests/integration/inference/test_openai_completion.py::test_inference_store[client_with_models-txt=azure/gpt-5-mini-True] PASSED [ 77%] tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[client_with_models-txt=azure/gpt-5-mini-True] PASSED [ 81%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:non_streaming_02] PASSED [ 85%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:streaming_02] PASSED [ 88%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming_with_n[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:streaming_02] PASSED [ 92%] tests/integration/inference/test_openai_completion.py::test_inference_store[client_with_models-txt=azure/gpt-5-mini-False] PASSED [ 96%] tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[client_with_models-txt=azure/gpt-5-mini-False] PASSED [100%] =========================================== short test summary info ============================================ SKIPPED [3] tests/integration/inference/test_openai_completion.py:63: Model azure/gpt-5-mini hosted by remote::azure doesn't support OpenAI completions. SKIPPED [3] tests/integration/inference/test_openai_completion.py:118: Model azure/gpt-5-mini hosted by remote::azure doesn't support vllm extra_body parameters. SKIPPED [1] tests/integration/inference/test_openai_completion.py:124: Model azure/gpt-5-mini hosted by remote::azure doesn't support chat completion calls with base64 encoded files. 
================================== 20 passed, 7 skipped, 2 warnings in 51.77s ================================== ``` Signed-off-by: Sébastien Han --- docs/source/providers/inference/index.md | 1 + .../providers/inference/remote_azure.md | 29 + llama_stack/distributions/ci-tests/build.yaml | 1 + llama_stack/distributions/ci-tests/run.yaml | 7 + .../distributions/starter-gpu/build.yaml | 1 + .../distributions/starter-gpu/run.yaml | 7 + llama_stack/distributions/starter/build.yaml | 1 + llama_stack/distributions/starter/run.yaml | 7 + llama_stack/distributions/starter/starter.py | 18 + llama_stack/providers/registry/inference.py | 15 + .../remote/inference/azure/__init__.py | 15 + .../providers/remote/inference/azure/azure.py | 64 + .../remote/inference/azure/config.py | 63 + .../remote/inference/azure/models.py | 28 + .../inference/test_openai_completion.py | 53 +- .../inference/test_text_inference.py | 3 +- .../recordings/responses/0fda25b9241c.json | 71 + .../recordings/responses/2b2ad549510d.json | 448 ++++ .../recordings/responses/57b67d1b1a36.json | 71 + .../recordings/responses/8752115f8d0c.json | 71 + .../recordings/responses/94d11daee205.json | 1178 +++++++++ .../recordings/responses/9f3d749cc1c8.json | 1150 +++++++++ .../recordings/responses/c791119e6359.json | 98 + .../recordings/responses/d3e27b7234e2.json | 2150 +++++++++++++++++ .../recordings/responses/fb785db7fafd.json | 310 +++ .../recordings/responses/ff3271401fb4.json | 556 +++++ 26 files changed, 6403 insertions(+), 13 deletions(-) create mode 100644 docs/source/providers/inference/remote_azure.md create mode 100644 llama_stack/providers/remote/inference/azure/__init__.py create mode 100644 llama_stack/providers/remote/inference/azure/azure.py create mode 100644 llama_stack/providers/remote/inference/azure/config.py create mode 100644 llama_stack/providers/remote/inference/azure/models.py create mode 100644 tests/integration/recordings/responses/0fda25b9241c.json create mode 100644 tests/integration/recordings/responses/2b2ad549510d.json create mode 100644 tests/integration/recordings/responses/57b67d1b1a36.json create mode 100644 tests/integration/recordings/responses/8752115f8d0c.json create mode 100644 tests/integration/recordings/responses/94d11daee205.json create mode 100644 tests/integration/recordings/responses/9f3d749cc1c8.json create mode 100644 tests/integration/recordings/responses/c791119e6359.json create mode 100644 tests/integration/recordings/responses/d3e27b7234e2.json create mode 100644 tests/integration/recordings/responses/fb785db7fafd.json create mode 100644 tests/integration/recordings/responses/ff3271401fb4.json diff --git a/docs/source/providers/inference/index.md b/docs/source/providers/inference/index.md index b6d215474..c5720daef 100644 --- a/docs/source/providers/inference/index.md +++ b/docs/source/providers/inference/index.md @@ -18,6 +18,7 @@ This section contains documentation for all available providers for the **infere inline_meta-reference inline_sentence-transformers remote_anthropic +remote_azure remote_bedrock remote_cerebras remote_databricks diff --git a/docs/source/providers/inference/remote_azure.md b/docs/source/providers/inference/remote_azure.md new file mode 100644 index 000000000..19f8f418b --- /dev/null +++ b/docs/source/providers/inference/remote_azure.md @@ -0,0 +1,29 @@ +# remote::azure + +## Description + + +Azure OpenAI inference provider for accessing GPT models and other Azure services. 
+Provider documentation +https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview + + +## Configuration + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `api_key` | `` | No | | Azure API key for Azure | +| `api_base` | `` | No | | Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com) | +| `api_version` | `str \| None` | No | | Azure API version for Azure (e.g., 2024-12-01-preview) | +| `api_type` | `str \| None` | No | azure | Azure API type for Azure (e.g., azure) | + +## Sample Configuration + +```yaml +api_key: ${env.AZURE_API_KEY:=} +api_base: ${env.AZURE_API_BASE:=} +api_version: ${env.AZURE_API_VERSION:=} +api_type: ${env.AZURE_API_TYPE:=} + +``` + diff --git a/llama_stack/distributions/ci-tests/build.yaml b/llama_stack/distributions/ci-tests/build.yaml index 8e6c0bf67..a4d920cd6 100644 --- a/llama_stack/distributions/ci-tests/build.yaml +++ b/llama_stack/distributions/ci-tests/build.yaml @@ -17,6 +17,7 @@ distribution_spec: - provider_type: remote::vertexai - provider_type: remote::groq - provider_type: remote::sambanova + - provider_type: remote::azure - provider_type: inline::sentence-transformers vector_io: - provider_type: inline::faiss diff --git a/llama_stack/distributions/ci-tests/run.yaml b/llama_stack/distributions/ci-tests/run.yaml index 26a677c7a..a478a3872 100644 --- a/llama_stack/distributions/ci-tests/run.yaml +++ b/llama_stack/distributions/ci-tests/run.yaml @@ -81,6 +81,13 @@ providers: config: url: https://api.sambanova.ai/v1 api_key: ${env.SAMBANOVA_API_KEY:=} + - provider_id: ${env.AZURE_API_KEY:+azure} + provider_type: remote::azure + config: + api_key: ${env.AZURE_API_KEY:=} + api_base: ${env.AZURE_API_BASE:=} + api_version: ${env.AZURE_API_VERSION:=} + api_type: ${env.AZURE_API_TYPE:=} - provider_id: sentence-transformers provider_type: inline::sentence-transformers vector_io: diff --git a/llama_stack/distributions/starter-gpu/build.yaml b/llama_stack/distributions/starter-gpu/build.yaml index ff7c58e6f..05a2bf180 100644 --- a/llama_stack/distributions/starter-gpu/build.yaml +++ b/llama_stack/distributions/starter-gpu/build.yaml @@ -18,6 +18,7 @@ distribution_spec: - provider_type: remote::vertexai - provider_type: remote::groq - provider_type: remote::sambanova + - provider_type: remote::azure - provider_type: inline::sentence-transformers vector_io: - provider_type: inline::faiss diff --git a/llama_stack/distributions/starter-gpu/run.yaml b/llama_stack/distributions/starter-gpu/run.yaml index 5d9dfcb27..786506706 100644 --- a/llama_stack/distributions/starter-gpu/run.yaml +++ b/llama_stack/distributions/starter-gpu/run.yaml @@ -81,6 +81,13 @@ providers: config: url: https://api.sambanova.ai/v1 api_key: ${env.SAMBANOVA_API_KEY:=} + - provider_id: ${env.AZURE_API_KEY:+azure} + provider_type: remote::azure + config: + api_key: ${env.AZURE_API_KEY:=} + api_base: ${env.AZURE_API_BASE:=} + api_version: ${env.AZURE_API_VERSION:=} + api_type: ${env.AZURE_API_TYPE:=} - provider_id: sentence-transformers provider_type: inline::sentence-transformers vector_io: diff --git a/llama_stack/distributions/starter/build.yaml b/llama_stack/distributions/starter/build.yaml index e84e528da..2f0cd24fd 100644 --- a/llama_stack/distributions/starter/build.yaml +++ b/llama_stack/distributions/starter/build.yaml @@ -18,6 +18,7 @@ distribution_spec: - provider_type: remote::vertexai - provider_type: remote::groq - provider_type: remote::sambanova + - provider_type: remote::azure - 
provider_type: inline::sentence-transformers vector_io: - provider_type: inline::faiss diff --git a/llama_stack/distributions/starter/run.yaml b/llama_stack/distributions/starter/run.yaml index a3962b8aa..2814b2ced 100644 --- a/llama_stack/distributions/starter/run.yaml +++ b/llama_stack/distributions/starter/run.yaml @@ -81,6 +81,13 @@ providers: config: url: https://api.sambanova.ai/v1 api_key: ${env.SAMBANOVA_API_KEY:=} + - provider_id: ${env.AZURE_API_KEY:+azure} + provider_type: remote::azure + config: + api_key: ${env.AZURE_API_KEY:=} + api_base: ${env.AZURE_API_BASE:=} + api_version: ${env.AZURE_API_VERSION:=} + api_type: ${env.AZURE_API_TYPE:=} - provider_id: sentence-transformers provider_type: inline::sentence-transformers vector_io: diff --git a/llama_stack/distributions/starter/starter.py b/llama_stack/distributions/starter/starter.py index 2fca52700..c2dfe95ad 100644 --- a/llama_stack/distributions/starter/starter.py +++ b/llama_stack/distributions/starter/starter.py @@ -59,6 +59,7 @@ ENABLED_INFERENCE_PROVIDERS = [ "cerebras", "nvidia", "bedrock", + "azure", ] INFERENCE_PROVIDER_IDS = { @@ -68,6 +69,7 @@ INFERENCE_PROVIDER_IDS = { "cerebras": "${env.CEREBRAS_API_KEY:+cerebras}", "nvidia": "${env.NVIDIA_API_KEY:+nvidia}", "vertexai": "${env.VERTEX_AI_PROJECT:+vertexai}", + "azure": "${env.AZURE_API_KEY:+azure}", } @@ -277,5 +279,21 @@ def get_distribution_template(name: str = "starter") -> DistributionTemplate: "http://localhost:11434", "Ollama URL", ), + "AZURE_API_KEY": ( + "", + "Azure API Key", + ), + "AZURE_API_BASE": ( + "", + "Azure API Base", + ), + "AZURE_API_VERSION": ( + "", + "Azure API Version", + ), + "AZURE_API_TYPE": ( + "azure", + "Azure API Type", + ), }, ) diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py index 8912560cb..64196152b 100644 --- a/llama_stack/providers/registry/inference.py +++ b/llama_stack/providers/registry/inference.py @@ -295,4 +295,19 @@ Available Models: description="IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform.", ), ), + remote_provider_spec( + api=Api.inference, + adapter=AdapterSpec( + adapter_type="azure", + pip_packages=["litellm"], + module="llama_stack.providers.remote.inference.azure", + config_class="llama_stack.providers.remote.inference.azure.AzureConfig", + provider_data_validator="llama_stack.providers.remote.inference.azure.config.AzureProviderDataValidator", + description=""" +Azure OpenAI inference provider for accessing GPT models and other Azure services. +Provider documentation +https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview +""", + ), + ), ] diff --git a/llama_stack/providers/remote/inference/azure/__init__.py b/llama_stack/providers/remote/inference/azure/__init__.py new file mode 100644 index 000000000..87bcaf309 --- /dev/null +++ b/llama_stack/providers/remote/inference/azure/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
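+
+# Provider entry point: the registry calls get_adapter_impl() to construct the
+# adapter and run its async initialize() before serving requests.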
+
+from .config import AzureConfig
+
+
+async def get_adapter_impl(config: AzureConfig, _deps):
+    from .azure import AzureInferenceAdapter
+
+    impl = AzureInferenceAdapter(config)
+    await impl.initialize()
+    return impl
diff --git a/llama_stack/providers/remote/inference/azure/azure.py b/llama_stack/providers/remote/inference/azure/azure.py
new file mode 100644
index 000000000..449bbbb1c
--- /dev/null
+++ b/llama_stack/providers/remote/inference/azure/azure.py
@@ -0,0 +1,64 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any
+from urllib.parse import urljoin
+
+from llama_stack.apis.inference import ChatCompletionRequest
+from llama_stack.providers.utils.inference.litellm_openai_mixin import (
+    LiteLLMOpenAIMixin,
+)
+from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
+
+from .config import AzureConfig
+from .models import MODEL_ENTRIES
+
+
+class AzureInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
+    def __init__(self, config: AzureConfig) -> None:
+        LiteLLMOpenAIMixin.__init__(
+            self,
+            MODEL_ENTRIES,
+            litellm_provider_name="azure",
+            api_key_from_config=config.api_key.get_secret_value(),
+            provider_data_api_key_field="azure_api_key",
+            openai_compat_api_base=str(config.api_base),
+        )
+        self.config = config
+
+    # Delegate get_api_key to LiteLLMOpenAIMixin: it resolves the key from
+    # per-request provider data (azure_api_key), falling back to the config value
+    get_api_key = LiteLLMOpenAIMixin.get_api_key
+
+    def get_base_url(self) -> str:
+        """
+        Get the Azure API base URL.
+
+        Returns the Azure API base URL from the configuration.
+        """
+        # urljoin with an absolute path replaces any path on api_base, yielding
+        # e.g. https://your-resource-name.openai.azure.com/openai/v1
+        return urljoin(str(self.config.api_base), "/openai/v1")
+
+    async def _get_params(self, request: ChatCompletionRequest) -> dict[str, Any]:
+        # Get base parameters from parent
+        params = await super()._get_params(request)
+
+        # Add Azure specific parameters, taking per-request provider data when
+        # present and otherwise falling back to the static config
+        provider_data = self.get_request_provider_data()
+        if provider_data:
+            if getattr(provider_data, "azure_api_key", None):
+                params["api_key"] = provider_data.azure_api_key
+            if getattr(provider_data, "azure_api_base", None):
+                params["api_base"] = provider_data.azure_api_base
+            if getattr(provider_data, "azure_api_version", None):
+                params["api_version"] = provider_data.azure_api_version
+            if getattr(provider_data, "azure_api_type", None):
+                params["api_type"] = provider_data.azure_api_type
+        else:
+            params["api_key"] = self.config.api_key.get_secret_value()
+            params["api_base"] = str(self.config.api_base)
+            params["api_version"] = self.config.api_version
+            params["api_type"] = self.config.api_type
+
+        return params
diff --git a/llama_stack/providers/remote/inference/azure/config.py b/llama_stack/providers/remote/inference/azure/config.py
new file mode 100644
index 000000000..fe9d61d53
--- /dev/null
+++ b/llama_stack/providers/remote/inference/azure/config.py
@@ -0,0 +1,63 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
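+#
+# Two configuration surfaces are defined below, mirroring how azure.py consumes
+# them: AzureConfig carries the static settings from run.yaml/environment, while
+# AzureProviderDataValidator validates per-request overrides supplied through
+# provider data headers (azure_api_key, azure_api_base, ...).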
+ +import os +from typing import Any + +from pydantic import BaseModel, Field, HttpUrl, SecretStr + +from llama_stack.schema_utils import json_schema_type + + +class AzureProviderDataValidator(BaseModel): + azure_api_key: SecretStr = Field( + description="Azure API key for Azure", + ) + azure_api_base: HttpUrl = Field( + description="Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com)", + ) + azure_api_version: str | None = Field( + default=None, + description="Azure API version for Azure (e.g., 2024-06-01)", + ) + azure_api_type: str | None = Field( + default="azure", + description="Azure API type for Azure (e.g., azure)", + ) + + +@json_schema_type +class AzureConfig(BaseModel): + api_key: SecretStr = Field( + description="Azure API key for Azure", + ) + api_base: HttpUrl = Field( + description="Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com)", + ) + api_version: str | None = Field( + default_factory=lambda: os.getenv("AZURE_API_VERSION"), + description="Azure API version for Azure (e.g., 2024-12-01-preview)", + ) + api_type: str | None = Field( + default_factory=lambda: os.getenv("AZURE_API_TYPE", "azure"), + description="Azure API type for Azure (e.g., azure)", + ) + + @classmethod + def sample_run_config( + cls, + api_key: str = "${env.AZURE_API_KEY:=}", + api_base: str = "${env.AZURE_API_BASE:=}", + api_version: str = "${env.AZURE_API_VERSION:=}", + api_type: str = "${env.AZURE_API_TYPE:=}", + **kwargs, + ) -> dict[str, Any]: + return { + "api_key": api_key, + "api_base": api_base, + "api_version": api_version, + "api_type": api_type, + } diff --git a/llama_stack/providers/remote/inference/azure/models.py b/llama_stack/providers/remote/inference/azure/models.py new file mode 100644 index 000000000..64c87969b --- /dev/null +++ b/llama_stack/providers/remote/inference/azure/models.py @@ -0,0 +1,28 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from llama_stack.providers.utils.inference.model_registry import ( + ProviderModelEntry, +) + +# https://learn.microsoft.com/en-us/azure/ai-foundry/openai/concepts/models?tabs=global-standard%2Cstandard-chat-completions +LLM_MODEL_IDS = [ + "gpt-5", + "gpt-5-mini", + "gpt-5-nano", + "gpt-5-chat", + "o1", + "o1-mini", + "o3-mini", + "o4-mini", + "gpt-4.1", + "gpt-4.1-mini", + "gpt-4.1-nano", +] + +SAFETY_MODELS_ENTRIES = list[ProviderModelEntry]() + +MODEL_ENTRIES = [ProviderModelEntry(provider_model_id=m) for m in LLM_MODEL_IDS] + SAFETY_MODELS_ENTRIES diff --git a/tests/integration/inference/test_openai_completion.py b/tests/integration/inference/test_openai_completion.py index f9c837ebd..22dec8876 100644 --- a/tests/integration/inference/test_openai_completion.py +++ b/tests/integration/inference/test_openai_completion.py @@ -6,12 +6,25 @@ import time +import unicodedata import pytest from ..test_cases.test_case import TestCase +def _normalize_text(text: str) -> str: + """ + Normalize Unicode text by removing diacritical marks for comparison. + + The test case streaming_01 expects the answer "Sol" for the question "What's the name of the Sun + in latin?", but the model is returning "sōl" (with a macron over the 'o'), which is the correct + Latin spelling. The test is failing because it's doing a simple case-insensitive string search + for "sol" but the actual response contains the diacritical mark. 
+ """ + return unicodedata.normalize("NFD", text).encode("ascii", "ignore").decode("ascii").lower() + + def provider_from_model(client_with_models, model_id): models = {m.identifier: m for m in client_with_models.models.list()} models.update({m.provider_resource_id: m for m in client_with_models.models.list()}) @@ -42,6 +55,10 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id) "remote::groq", "remote::gemini", # https://generativelanguage.googleapis.com/v1beta/openai/completions -> 404 "remote::anthropic", # at least claude-3-{5,7}-{haiku,sonnet}-* / claude-{sonnet,opus}-4-* are not supported + "remote::azure", # {'error': {'code': 'OperationNotSupported', 'message': 'The completion operation + # does not work with the specified model, gpt-5-mini. Please choose different model and try + # again. You can learn more about which models can be used with each operation here: + # https://go.microsoft.com/fwlink/?linkid=2197993.'}}"} ): pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.") @@ -157,7 +174,8 @@ def test_openai_completion_non_streaming_suffix(llama_stack_client, client_with_ assert len(response.choices) > 0 choice = response.choices[0] assert len(choice.text) > 5 - assert "france" in choice.text.lower() + normalized_text = _normalize_text(choice.text) + assert "france" in normalized_text @pytest.mark.parametrize( @@ -248,7 +266,9 @@ def test_openai_chat_completion_non_streaming(compat_client, client_with_models, ) message_content = response.choices[0].message.content.lower().strip() assert len(message_content) > 0 - assert expected.lower() in message_content + normalized_expected = _normalize_text(expected) + normalized_content = _normalize_text(message_content) + assert normalized_expected in normalized_content @pytest.mark.parametrize( @@ -272,10 +292,13 @@ def test_openai_chat_completion_streaming(compat_client, client_with_models, tex ) streamed_content = [] for chunk in response: - if chunk.choices[0].delta.content: + # On some providers like Azure, the choices are empty on the first chunk, so we need to check for that + if chunk.choices and len(chunk.choices) > 0 and chunk.choices[0].delta.content: streamed_content.append(chunk.choices[0].delta.content.lower().strip()) assert len(streamed_content) > 0 - assert expected.lower() in "".join(streamed_content) + normalized_expected = _normalize_text(expected) + normalized_content = _normalize_text("".join(streamed_content)) + assert normalized_expected in normalized_content @pytest.mark.parametrize( @@ -308,8 +331,12 @@ def test_openai_chat_completion_streaming_with_n(compat_client, client_with_mode streamed_content.get(choice.index, "") + choice.delta.content.lower().strip() ) assert len(streamed_content) == 2 + normalized_expected = _normalize_text(expected) for i, content in streamed_content.items(): - assert expected.lower() in content, f"Choice {i}: Expected {expected.lower()} in {content}" + normalized_content = _normalize_text(content) + assert normalized_expected in normalized_content, ( + f"Choice {i}: Expected {normalized_expected} in {normalized_content}" + ) @pytest.mark.parametrize( @@ -339,9 +366,9 @@ def test_inference_store(compat_client, client_with_models, text_model_id, strea content = "" response_id = None for chunk in response: - if response_id is None: + if response_id is None and chunk.id: response_id = chunk.id - if chunk.choices[0].delta.content: + if chunk.choices and len(chunk.choices) > 0 and 
chunk.choices[0].delta.content: content += chunk.choices[0].delta.content else: response_id = response.id @@ -410,11 +437,12 @@ def test_inference_store_tool_calls(compat_client, client_with_models, text_mode content = "" response_id = None for chunk in response: - if response_id is None: + if response_id is None and chunk.id: response_id = chunk.id - if delta := chunk.choices[0].delta: - if delta.content: - content += delta.content + if chunk.choices and len(chunk.choices) > 0: + if delta := chunk.choices[0].delta: + if delta.content: + content += delta.content else: response_id = response.id content = response.choices[0].message.content @@ -484,4 +512,5 @@ def test_openai_chat_completion_non_streaming_with_file(openai_client, client_wi stream=False, ) message_content = response.choices[0].message.content.lower().strip() - assert "hello world" in message_content + normalized_content = _normalize_text(message_content) + assert "hello world" in normalized_content diff --git a/tests/integration/inference/test_text_inference.py b/tests/integration/inference/test_text_inference.py index d7ffe5929..621084231 100644 --- a/tests/integration/inference/test_text_inference.py +++ b/tests/integration/inference/test_text_inference.py @@ -32,6 +32,7 @@ def skip_if_model_doesnt_support_completion(client_with_models, model_id): "remote::vertexai", "remote::groq", "remote::sambanova", + "remote::azure", ) or "openai-compat" in provider.provider_type ): @@ -44,7 +45,7 @@ def skip_if_model_doesnt_support_json_schema_structured_output(client_with_model provider_id = models[model_id].provider_id providers = {p.provider_id: p for p in client_with_models.providers.list()} provider = providers[provider_id] - if provider.provider_type in ("remote::sambanova",): + if provider.provider_type in ("remote::sambanova", "remote::azure"): pytest.skip( f"Model {model_id} hosted by {provider.provider_type} doesn't support json_schema structured output" ) diff --git a/tests/integration/recordings/responses/0fda25b9241c.json b/tests/integration/recordings/responses/0fda25b9241c.json new file mode 100644 index 000000000..b97ee1670 --- /dev/null +++ b/tests/integration/recordings/responses/0fda25b9241c.json @@ -0,0 +1,71 @@ +{ + "request": { + "method": "POST", + "url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions", + "headers": {}, + "body": { + "model": "gpt-5-mini", + "messages": [ + { + "role": "user", + "content": "Which planet do humans live on?" + } + ], + "stream": false + }, + "endpoint": "/v1/chat/completions", + "model": "gpt-5-mini" + }, + "response": { + "body": { + "__type__": "openai.types.chat.chat_completion.ChatCompletion", + "__data__": { + "id": "chatcmpl-CECIXqfvjuluKkZtG3q2QJoSQhBU0", + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "Humans live on Earth \u2014 the third planet from the Sun. 
It's the only known planet that naturally supports life, with a breathable atmosphere, liquid water, and temperatures suitable for living organisms.", + "refusal": null, + "role": "assistant", + "annotations": [], + "audio": null, + "function_call": null, + "tool_calls": null + }, + "content_filter_results": {} + } + ], + "created": 1757499901, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion", + "service_tier": null, + "system_fingerprint": null, + "usage": { + "completion_tokens": 112, + "prompt_tokens": 13, + "total_tokens": 125, + "completion_tokens_details": { + "accepted_prediction_tokens": 0, + "audio_tokens": 0, + "reasoning_tokens": 64, + "rejected_prediction_tokens": 0 + }, + "prompt_tokens_details": { + "audio_tokens": 0, + "cached_tokens": 0 + } + }, + "prompt_filter_results": [ + { + "prompt_index": 0, + "content_filter_results": {} + } + ] + } + }, + "is_streaming": false + } +} diff --git a/tests/integration/recordings/responses/2b2ad549510d.json b/tests/integration/recordings/responses/2b2ad549510d.json new file mode 100644 index 000000000..55a9d6426 --- /dev/null +++ b/tests/integration/recordings/responses/2b2ad549510d.json @@ -0,0 +1,448 @@ +{ + "request": { + "method": "POST", + "url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions", + "headers": {}, + "body": { + "model": "gpt-5-mini", + "messages": [ + { + "role": "user", + "content": "Hello, world!" + } + ], + "stream": true + }, + "endpoint": "/v1/chat/completions", + "model": "gpt-5-mini" + }, + "response": { + "body": [ + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "", + "choices": [], + "created": 0, + "model": "", + "object": "", + "service_tier": null, + "system_fingerprint": null, + "usage": null, + "prompt_filter_results": [ + { + "prompt_index": 0, + "content_filter_results": {} + } + ] + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": "", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": "Hello", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": ",", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": 
null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": " world", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": "!", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": " Hi", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": " \u2014", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": " how", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": " can", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": 
"chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": " I", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": " help", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": " you", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": " today", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": "?", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + } + ], + "is_streaming": true + } +} diff --git a/tests/integration/recordings/responses/57b67d1b1a36.json b/tests/integration/recordings/responses/57b67d1b1a36.json new file mode 100644 index 000000000..14de1d85e --- /dev/null +++ b/tests/integration/recordings/responses/57b67d1b1a36.json 
@@ -0,0 +1,71 @@ +{ + "request": { + "method": "POST", + "url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions", + "headers": {}, + "body": { + "model": "gpt-5-mini", + "messages": [ + { + "role": "user", + "content": "Which planet has rings around it with a name starting with letter S?" + } + ], + "stream": false + }, + "endpoint": "/v1/chat/completions", + "model": "gpt-5-mini" + }, + "response": { + "body": { + "__type__": "openai.types.chat.chat_completion.ChatCompletion", + "__data__": { + "id": "chatcmpl-CECIkT5cbqFazpungtewksVePcUNa", + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "Saturn. It's the planet famous for its prominent ring system made of ice and rock.", + "refusal": null, + "role": "assistant", + "annotations": [], + "audio": null, + "function_call": null, + "tool_calls": null + }, + "content_filter_results": {} + } + ], + "created": 1757499914, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion", + "service_tier": null, + "system_fingerprint": null, + "usage": { + "completion_tokens": 156, + "prompt_tokens": 20, + "total_tokens": 176, + "completion_tokens_details": { + "accepted_prediction_tokens": 0, + "audio_tokens": 0, + "reasoning_tokens": 128, + "rejected_prediction_tokens": 0 + }, + "prompt_tokens_details": { + "audio_tokens": 0, + "cached_tokens": 0 + } + }, + "prompt_filter_results": [ + { + "prompt_index": 0, + "content_filter_results": {} + } + ] + } + }, + "is_streaming": false + } +} diff --git a/tests/integration/recordings/responses/8752115f8d0c.json b/tests/integration/recordings/responses/8752115f8d0c.json new file mode 100644 index 000000000..0e88bbfa6 --- /dev/null +++ b/tests/integration/recordings/responses/8752115f8d0c.json @@ -0,0 +1,71 @@ +{ + "request": { + "method": "POST", + "url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions", + "headers": {}, + "body": { + "model": "gpt-5-mini", + "messages": [ + { + "role": "user", + "content": "Hello, world!" + } + ], + "stream": false + }, + "endpoint": "/v1/chat/completions", + "model": "gpt-5-mini" + }, + "response": { + "body": { + "__type__": "openai.types.chat.chat_completion.ChatCompletion", + "__data__": { + "id": "chatcmpl-CECIuyylsMNXspa83k8LrD8SQadNY", + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "Hello! 
\ud83d\udc4b How can I help you today \u2014 answer a question, write or edit something, debug code, brainstorm ideas, or anything else?", + "refusal": null, + "role": "assistant", + "annotations": [], + "audio": null, + "function_call": null, + "tool_calls": null + }, + "content_filter_results": {} + } + ], + "created": 1757499924, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion", + "service_tier": null, + "system_fingerprint": null, + "usage": { + "completion_tokens": 40, + "prompt_tokens": 10, + "total_tokens": 50, + "completion_tokens_details": { + "accepted_prediction_tokens": 0, + "audio_tokens": 0, + "reasoning_tokens": 0, + "rejected_prediction_tokens": 0 + }, + "prompt_tokens_details": { + "audio_tokens": 0, + "cached_tokens": 0 + } + }, + "prompt_filter_results": [ + { + "prompt_index": 0, + "content_filter_results": {} + } + ] + } + }, + "is_streaming": false + } +} diff --git a/tests/integration/recordings/responses/94d11daee205.json b/tests/integration/recordings/responses/94d11daee205.json new file mode 100644 index 000000000..b6a6c3d68 --- /dev/null +++ b/tests/integration/recordings/responses/94d11daee205.json @@ -0,0 +1,1178 @@ +{ + "request": { + "method": "POST", + "url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions", + "headers": {}, + "body": { + "model": "gpt-5-mini", + "messages": [ + { + "role": "user", + "content": "What is the name of the US captial?" + } + ], + "n": 2, + "stream": true + }, + "endpoint": "/v1/chat/completions", + "model": "gpt-5-mini" + }, + "response": { + "body": [ + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "", + "choices": [], + "created": 0, + "model": "", + "object": "", + "service_tier": null, + "system_fingerprint": null, + "usage": null, + "prompt_filter_results": [ + { + "prompt_index": 0, + "content_filter_results": {} + } + ] + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": "", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": "", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": "The", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", 
+ "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " capital", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " of", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " the", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " United", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " States", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": "The", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": 
{ + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " capital", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " of", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " the", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " United", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " is", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " States", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " Washington", + "function_call": null, + "refusal": null, + "role": null, + 
"tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " is", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": ",", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " Washington", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " D", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": ",", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": ".C", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + 
"object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " D", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": ".", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": ".C", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " (", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": ".", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": "the", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", 
+ "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " (", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " District", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": "official", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " of", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": "ly", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " Columbia", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " the", + "function_call": null, + "refusal": null, + "role": 
null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": ").", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " District", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " of", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " Columbia", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": ").", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": 
"gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": "stop", + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + } + ], + "is_streaming": true + } +} diff --git a/tests/integration/recordings/responses/9f3d749cc1c8.json b/tests/integration/recordings/responses/9f3d749cc1c8.json new file mode 100644 index 000000000..9a4539ab0 --- /dev/null +++ b/tests/integration/recordings/responses/9f3d749cc1c8.json @@ -0,0 +1,1150 @@ +{ + "request": { + "method": "POST", + "url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions", + "headers": {}, + "body": { + "model": "gpt-5-mini", + "messages": [ + { + "role": "user", + "content": "What's the name of the Sun in latin?" + } + ], + "stream": true + }, + "endpoint": "/v1/chat/completions", + "model": "gpt-5-mini" + }, + "response": { + "body": [ + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "", + "choices": [], + "created": 0, + "model": "", + "object": "", + "service_tier": null, + "system_fingerprint": null, + "usage": null, + "prompt_filter_results": [ + { + "prompt_index": 0, + "content_filter_results": {} + } + ] + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "The", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " Latin", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + 
"id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " name", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " is", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " \"", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "Sol", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "\"", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " (", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "gen", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + 
"finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "itive", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " \"", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "S", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "olis", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "\").", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " It's", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": 
"chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " used", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " as", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " the", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " proper", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " name", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " of", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": 
"openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " the", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " Sun", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": ";", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " poets", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " also", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " sometimes", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " 
used", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " Greek", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "-derived", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " ep", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "ithe", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "ts", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " like", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + 
"content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " \"", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "Pho", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "eb", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "us", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": ".\"", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, 
+ "usage": null + } + } + ], + "is_streaming": true + } +} diff --git a/tests/integration/recordings/responses/c791119e6359.json b/tests/integration/recordings/responses/c791119e6359.json new file mode 100644 index 000000000..6ac123e92 --- /dev/null +++ b/tests/integration/recordings/responses/c791119e6359.json @@ -0,0 +1,98 @@ +{ + "request": { + "method": "POST", + "url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions", + "headers": {}, + "body": { + "model": "gpt-5-mini", + "messages": [ + { + "role": "user", + "content": "What's the weather in Tokyo? Use the get_weather function to get the weather." + } + ], + "stream": false, + "tools": [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the weather in a given city", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": "The city to get the weather for" + } + } + } + } + } + ] + }, + "endpoint": "/v1/chat/completions", + "model": "gpt-5-mini" + }, + "response": { + "body": { + "__type__": "openai.types.chat.chat_completion.ChatCompletion", + "__data__": { + "id": "chatcmpl-CECIwq9Odd0mOJMmw7ytv8iEazH4H", + "choices": [ + { + "finish_reason": "tool_calls", + "index": 0, + "logprobs": null, + "message": { + "content": null, + "refusal": null, + "role": "assistant", + "annotations": [], + "audio": null, + "function_call": null, + "tool_calls": [ + { + "id": "call_yw18spRc1jjUlEyabbXBhB33", + "function": { + "arguments": "{\"city\":\"Tokyo\"}", + "name": "get_weather" + }, + "type": "function" + } + ] + }, + "content_filter_results": {} + } + ], + "created": 1757499926, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion", + "service_tier": null, + "system_fingerprint": null, + "usage": { + "completion_tokens": 88, + "prompt_tokens": 151, + "total_tokens": 239, + "completion_tokens_details": { + "accepted_prediction_tokens": 0, + "audio_tokens": 0, + "reasoning_tokens": 64, + "rejected_prediction_tokens": 0 + }, + "prompt_tokens_details": { + "audio_tokens": 0, + "cached_tokens": 0 + } + }, + "prompt_filter_results": [ + { + "prompt_index": 0, + "content_filter_results": {} + } + ] + } + }, + "is_streaming": false + } +} diff --git a/tests/integration/recordings/responses/d3e27b7234e2.json b/tests/integration/recordings/responses/d3e27b7234e2.json new file mode 100644 index 000000000..7f266c392 --- /dev/null +++ b/tests/integration/recordings/responses/d3e27b7234e2.json @@ -0,0 +1,2150 @@ +{ + "request": { + "method": "POST", + "url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions", + "headers": {}, + "body": { + "model": "gpt-5-mini", + "messages": [ + { + "role": "user", + "content": "What's the name of the Sun in latin?" 
+ } + ], + "n": 2, + "stream": true + }, + "endpoint": "/v1/chat/completions", + "model": "gpt-5-mini" + }, + "response": { + "body": [ + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "", + "choices": [], + "created": 0, + "model": "", + "object": "", + "service_tier": null, + "system_fingerprint": null, + "usage": null, + "prompt_filter_results": [ + { + "prompt_index": 0, + "content_filter_results": {} + } + ] + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "In", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " Latin", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " the", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " Sun", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " is", + "function_call": null, + "refusal": null, + "role": null, 
+ "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " called", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " \"", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "Sol", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "\"", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " (", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "sol", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", 
+ "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": ",", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " gen", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "itive", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " sol", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "is", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": 
"openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "The", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " Latin", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " name", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": ",", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " is", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " masculine", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " 
\"", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": ").", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "Sol", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "\"", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " (", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " The", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " name", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + 
], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " is", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "s", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " also", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "\u014d", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " used", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "l", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + 
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " for", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " the", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "),", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " gen", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " Roman", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "itive", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + 
"content": " sun", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " \"", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " god", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "s", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " (", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "\u014d", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "e", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + 
"content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "lis", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": ".g", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "\".", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": ".,", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " ", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " Sol", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + 
"usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " As", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " Inv", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " an", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "ict", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " epit", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "us", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + 
{ + "delta": { + "content": "het", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": ").", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " it", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "\u2019s", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " also", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " called", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " \"", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, 
+ "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "Pho", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "eb", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "us", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "\"", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " in", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " poetry", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": 
null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": ".", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": "stop", + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + } + ], + "is_streaming": true + } +} diff --git a/tests/integration/recordings/responses/fb785db7fafd.json b/tests/integration/recordings/responses/fb785db7fafd.json new file mode 100644 index 000000000..086d211e8 --- /dev/null +++ b/tests/integration/recordings/responses/fb785db7fafd.json @@ -0,0 +1,310 @@ +{ + "request": { + "method": "POST", + "url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions", + "headers": {}, + "body": { + "model": "gpt-5-mini", + "messages": [ + { + "role": "user", + "content": "What's the weather in Tokyo? Use the get_weather function to get the weather." 
+ } + ], + "stream": true, + "tools": [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the weather in a given city", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": "The city to get the weather for" + } + } + } + } + } + ] + }, + "endpoint": "/v1/chat/completions", + "model": "gpt-5-mini" + }, + "response": { + "body": [ + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "", + "choices": [], + "created": 0, + "model": "", + "object": "", + "service_tier": null, + "system_fingerprint": null, + "usage": null, + "prompt_filter_results": [ + { + "prompt_index": 0, + "content_filter_results": {} + } + ] + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo", + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "index": 0, + "id": "call_TMbEoYn9q0ZKtoxav5LpD9Ts", + "function": { + "arguments": "", + "name": "get_weather" + }, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499912, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo", + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": [ + { + "index": 0, + "id": null, + "function": { + "arguments": "{\"", + "name": null + }, + "type": null + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499912, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo", + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": [ + { + "index": 0, + "id": null, + "function": { + "arguments": "city", + "name": null + }, + "type": null + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499912, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo", + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": [ + { + "index": 0, + "id": null, + "function": { + "arguments": "\":\"", + "name": null + }, + "type": null + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499912, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": 
"openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo", + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": [ + { + "index": 0, + "id": null, + "function": { + "arguments": "Tokyo", + "name": null + }, + "type": null + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499912, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo", + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": [ + { + "index": 0, + "id": null, + "function": { + "arguments": "\"}", + "name": null + }, + "type": null + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499912, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo", + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": "tool_calls", + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499912, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + } + ], + "is_streaming": true + } +} diff --git a/tests/integration/recordings/responses/ff3271401fb4.json b/tests/integration/recordings/responses/ff3271401fb4.json new file mode 100644 index 000000000..bf7ec89f7 --- /dev/null +++ b/tests/integration/recordings/responses/ff3271401fb4.json @@ -0,0 +1,556 @@ +{ + "request": { + "method": "POST", + "url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions", + "headers": {}, + "body": { + "model": "gpt-5-mini", + "messages": [ + { + "role": "user", + "content": "What is the name of the US captial?" 
+ } + ], + "stream": true + }, + "endpoint": "/v1/chat/completions", + "model": "gpt-5-mini" + }, + "response": { + "body": [ + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "", + "choices": [], + "created": 0, + "model": "", + "object": "", + "service_tier": null, + "system_fingerprint": null, + "usage": null, + "prompt_filter_results": [ + { + "prompt_index": 0, + "content_filter_results": {} + } + ] + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": "", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": "The", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": " capital", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": " of", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": " the", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": " United", + "function_call": null, + "refusal": null, + "role": null, + 
"tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": " States", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": " is", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": " Washington", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": ",", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": " D", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": ".C", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": 
"gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": ".", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": " (", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": "District", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": " of", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": " Columbia", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": ").", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": 
"openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + } + ], + "is_streaming": true + } +} From d15368a3026450d1474f4a4db47b89fd3e6057ca Mon Sep 17 00:00:00 2001 From: Francisco Arceo Date: Thu, 11 Sep 2025 06:20:11 -0600 Subject: [PATCH 18/30] chore: Updating documentation, adding exception handling for Vector Stores in RAG Tool, more tests on migration, and migrate off of inference_api for context_retriever for RAG (#3367) # What does this PR do? - Updating documentation on migration from RAG Tool to Vector Stores and Files APIs - Adding exception handling for Vector Stores in RAG Tool - Add more tests on migration from RAG Tool to Vector Stores - Migrate off of inference_api for context_retriever for RAG ## Test Plan Integration and unit tests added Signed-off-by: Francisco Javier Arceo --- docs/source/building_applications/rag.md | 21 ++ .../tool_runtime/rag/context_retriever.py | 12 +- .../inline/tool_runtime/rag/memory.py | 121 ++++++---- .../integration/tool_runtime/test_rag_tool.py | 208 ++++++++++++++++++ .../utils/memory/test_vector_store.py | 38 ++++ 5 files changed, 355 insertions(+), 45 deletions(-) diff --git a/docs/source/building_applications/rag.md b/docs/source/building_applications/rag.md index 289c38991..802859e87 100644 --- a/docs/source/building_applications/rag.md +++ b/docs/source/building_applications/rag.md @@ -93,10 +93,31 @@ chunks_response = client.vector_io.query( ### Using the RAG Tool +> **⚠️ DEPRECATION NOTICE**: The RAG Tool is being deprecated in favor of directly using the OpenAI-compatible Search +> API. We recommend migrating to the OpenAI APIs for better compatibility and future support. + A better way to ingest documents is to use the RAG Tool. This tool allows you to ingest documents from URLs, files, etc. and automatically chunks them into smaller pieces. More examples for how to format a RAGDocument can be found in the [appendix](#more-ragdocument-examples). +#### OpenAI API Integration & Migration + +The RAG tool has been updated to use OpenAI-compatible APIs. This provides several benefits: + +- **Files API Integration**: Documents are now uploaded using OpenAI's file upload endpoints +- **Vector Stores API**: Vector storage operations use OpenAI's vector store format with configurable chunking strategies +- **Error Resilience:** When processing multiple documents, individual failures are logged but don't crash the operation. Failed documents are skipped while successful ones continue processing. + +**Migration Path:** +We recommend migrating to the OpenAI-compatible Search API for: +1. **Better OpenAI Ecosystem Integration**: Direct compatibility with OpenAI tools and workflows including the Responses API +2**Future-Proof**: Continued support and feature development +3**Full OpenAI Compatibility**: Vector Stores, Files, and Search APIs are fully compatible with OpenAI's Responses API + +The OpenAI APIs are used under the hood, so you can continue to use your existing RAG Tool code with minimal changes. 
+However, we recommend updating your code to use the new OpenAI-compatible APIs for better long-term support. If any +documents fail to process, they will be logged in the response but will not cause the entire operation to fail. + ```python from llama_stack_client import RAGDocument diff --git a/llama_stack/providers/inline/tool_runtime/rag/context_retriever.py b/llama_stack/providers/inline/tool_runtime/rag/context_retriever.py index be18430e4..9bc22f979 100644 --- a/llama_stack/providers/inline/tool_runtime/rag/context_retriever.py +++ b/llama_stack/providers/inline/tool_runtime/rag/context_retriever.py @@ -8,7 +8,7 @@ from jinja2 import Template from llama_stack.apis.common.content_types import InterleavedContent -from llama_stack.apis.inference import UserMessage +from llama_stack.apis.inference import OpenAIUserMessageParam from llama_stack.apis.tools.rag_tool import ( DefaultRAGQueryGeneratorConfig, LLMRAGQueryGeneratorConfig, @@ -61,16 +61,16 @@ async def llm_rag_query_generator( messages = [interleaved_content_as_str(content)] template = Template(config.template) - content = template.render({"messages": messages}) + rendered_content: str = template.render({"messages": messages}) model = config.model - message = UserMessage(content=content) - response = await inference_api.chat_completion( - model_id=model, + message = OpenAIUserMessageParam(content=rendered_content) + response = await inference_api.openai_chat_completion( + model=model, messages=[message], stream=False, ) - query = response.completion_message.content + query = response.choices[0].message.content return query diff --git a/llama_stack/providers/inline/tool_runtime/rag/memory.py b/llama_stack/providers/inline/tool_runtime/rag/memory.py index aa629cca8..bc68f198d 100644 --- a/llama_stack/providers/inline/tool_runtime/rag/memory.py +++ b/llama_stack/providers/inline/tool_runtime/rag/memory.py @@ -45,10 +45,7 @@ from llama_stack.apis.vector_io import ( from llama_stack.log import get_logger from llama_stack.providers.datatypes import ToolGroupsProtocolPrivate from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str -from llama_stack.providers.utils.memory.vector_store import ( - content_from_doc, - parse_data_url, -) +from llama_stack.providers.utils.memory.vector_store import parse_data_url from .config import RagToolRuntimeConfig from .context_retriever import generate_rag_query @@ -60,6 +57,47 @@ def make_random_string(length: int = 8): return "".join(secrets.choice(string.ascii_letters + string.digits) for _ in range(length)) +async def raw_data_from_doc(doc: RAGDocument) -> tuple[bytes, str]: + """Get raw binary data and mime type from a RAGDocument for file upload.""" + if isinstance(doc.content, URL): + if doc.content.uri.startswith("data:"): + parts = parse_data_url(doc.content.uri) + mime_type = parts["mimetype"] + data = parts["data"] + + if parts["is_base64"]: + file_data = base64.b64decode(data) + else: + file_data = data.encode("utf-8") + + return file_data, mime_type + else: + async with httpx.AsyncClient() as client: + r = await client.get(doc.content.uri) + r.raise_for_status() + mime_type = r.headers.get("content-type", "application/octet-stream") + return r.content, mime_type + else: + if isinstance(doc.content, str): + content_str = doc.content + else: + content_str = interleaved_content_as_str(doc.content) + + if content_str.startswith("data:"): + parts = parse_data_url(content_str) + mime_type = parts["mimetype"] + data = parts["data"] + + if parts["is_base64"]: + 
file_data = base64.b64decode(data) + else: + file_data = data.encode("utf-8") + + return file_data, mime_type + else: + return content_str.encode("utf-8"), "text/plain" + + class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRuntime): def __init__( self, @@ -95,46 +133,52 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti return for doc in documents: - if isinstance(doc.content, URL): - if doc.content.uri.startswith("data:"): - parts = parse_data_url(doc.content.uri) - file_data = base64.b64decode(parts["data"]) if parts["is_base64"] else parts["data"].encode() - mime_type = parts["mimetype"] - else: - async with httpx.AsyncClient() as client: - response = await client.get(doc.content.uri) - file_data = response.content - mime_type = doc.mime_type or response.headers.get("content-type", "application/octet-stream") - else: - content_str = await content_from_doc(doc) - file_data = content_str.encode("utf-8") - mime_type = doc.mime_type or "text/plain" + try: + try: + file_data, mime_type = await raw_data_from_doc(doc) + except Exception as e: + log.error(f"Failed to extract content from document {doc.document_id}: {e}") + continue - file_extension = mimetypes.guess_extension(mime_type) or ".txt" - filename = doc.metadata.get("filename", f"{doc.document_id}{file_extension}") + file_extension = mimetypes.guess_extension(mime_type) or ".txt" + filename = doc.metadata.get("filename", f"{doc.document_id}{file_extension}") - file_obj = io.BytesIO(file_data) - file_obj.name = filename + file_obj = io.BytesIO(file_data) + file_obj.name = filename - upload_file = UploadFile(file=file_obj, filename=filename) + upload_file = UploadFile(file=file_obj, filename=filename) - created_file = await self.files_api.openai_upload_file( - file=upload_file, purpose=OpenAIFilePurpose.ASSISTANTS - ) + try: + created_file = await self.files_api.openai_upload_file( + file=upload_file, purpose=OpenAIFilePurpose.ASSISTANTS + ) + except Exception as e: + log.error(f"Failed to upload file for document {doc.document_id}: {e}") + continue - chunking_strategy = VectorStoreChunkingStrategyStatic( - static=VectorStoreChunkingStrategyStaticConfig( - max_chunk_size_tokens=chunk_size_in_tokens, - chunk_overlap_tokens=chunk_size_in_tokens // 4, + chunking_strategy = VectorStoreChunkingStrategyStatic( + static=VectorStoreChunkingStrategyStaticConfig( + max_chunk_size_tokens=chunk_size_in_tokens, + chunk_overlap_tokens=chunk_size_in_tokens // 4, + ) ) - ) - await self.vector_io_api.openai_attach_file_to_vector_store( - vector_store_id=vector_db_id, - file_id=created_file.id, - attributes=doc.metadata, - chunking_strategy=chunking_strategy, - ) + try: + await self.vector_io_api.openai_attach_file_to_vector_store( + vector_store_id=vector_db_id, + file_id=created_file.id, + attributes=doc.metadata, + chunking_strategy=chunking_strategy, + ) + except Exception as e: + log.error( + f"Failed to attach file {created_file.id} to vector store {vector_db_id} for document {doc.document_id}: {e}" + ) + continue + + except Exception as e: + log.error(f"Unexpected error processing document {doc.document_id}: {e}") + continue async def query( self, @@ -274,7 +318,6 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti if query_config: query_config = TypeAdapter(RAGQueryConfig).validate_python(query_config) else: - # handle someone passing an empty dict query_config = RAGQueryConfig() query = kwargs["query"] @@ -285,6 +328,6 @@ class 
MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti ) return ToolInvocationResult( - content=result.content, + content=result.content or [], metadata=result.metadata, ) diff --git a/tests/integration/tool_runtime/test_rag_tool.py b/tests/integration/tool_runtime/test_rag_tool.py index b208500d8..b78c39af8 100644 --- a/tests/integration/tool_runtime/test_rag_tool.py +++ b/tests/integration/tool_runtime/test_rag_tool.py @@ -183,6 +183,110 @@ def test_vector_db_insert_from_url_and_query( assert any("llama2" in chunk.content.lower() for chunk in response2.chunks) +def test_rag_tool_openai_apis(client_with_empty_registry, embedding_model_id, embedding_dimension): + vector_db_id = "test_openai_vector_db" + + client_with_empty_registry.vector_dbs.register( + vector_db_id=vector_db_id, + embedding_model=embedding_model_id, + embedding_dimension=embedding_dimension, + ) + + available_vector_dbs = [vector_db.identifier for vector_db in client_with_empty_registry.vector_dbs.list()] + actual_vector_db_id = available_vector_dbs[0] + + # different document formats that should work with OpenAI APIs + documents = [ + Document( + document_id="text-doc", + content="This is a plain text document about machine learning algorithms.", + metadata={"type": "text", "category": "AI"}, + ), + Document( + document_id="url-doc", + content="https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/chat.rst", + mime_type="text/plain", + metadata={"type": "url", "source": "pytorch"}, + ), + Document( + document_id="data-url-doc", + content="data:text/plain;base64,VGhpcyBpcyBhIGRhdGEgVVJMIGRvY3VtZW50IGFib3V0IGRlZXAgbGVhcm5pbmcu", # "This is a data URL document about deep learning." + metadata={"type": "data_url", "encoding": "base64"}, + ), + ] + + client_with_empty_registry.tool_runtime.rag_tool.insert( + documents=documents, + vector_db_id=actual_vector_db_id, + chunk_size_in_tokens=256, + ) + + files_list = client_with_empty_registry.files.list() + assert len(files_list.data) >= len(documents), ( + f"Expected at least {len(documents)} files, got {len(files_list.data)}" + ) + + vector_store_files = client_with_empty_registry.vector_io.openai_list_files_in_vector_store( + vector_store_id=actual_vector_db_id + ) + assert len(vector_store_files.data) >= len(documents), f"Expected at least {len(documents)} files in vector store" + + response = client_with_empty_registry.tool_runtime.rag_tool.query( + vector_db_ids=[actual_vector_db_id], + content="Tell me about machine learning and deep learning", + ) + + assert_valid_text_response(response) + content_text = " ".join([chunk.text for chunk in response.content]).lower() + assert "machine learning" in content_text or "deep learning" in content_text + + +def test_rag_tool_exception_handling(client_with_empty_registry, embedding_model_id, embedding_dimension): + vector_db_id = "test_exception_handling" + + client_with_empty_registry.vector_dbs.register( + vector_db_id=vector_db_id, + embedding_model=embedding_model_id, + embedding_dimension=embedding_dimension, + ) + + available_vector_dbs = [vector_db.identifier for vector_db in client_with_empty_registry.vector_dbs.list()] + actual_vector_db_id = available_vector_dbs[0] + + documents = [ + Document( + document_id="valid-doc", + content="This is a valid document that should be processed successfully.", + metadata={"status": "valid"}, + ), + Document( + document_id="invalid-url-doc", + content="https://nonexistent-domain-12345.com/invalid.txt", + metadata={"status": "invalid_url"}, + 
), + Document( + document_id="another-valid-doc", + content="This is another valid document for testing resilience.", + metadata={"status": "valid"}, + ), + ] + + client_with_empty_registry.tool_runtime.rag_tool.insert( + documents=documents, + vector_db_id=actual_vector_db_id, + chunk_size_in_tokens=256, + ) + + response = client_with_empty_registry.tool_runtime.rag_tool.query( + vector_db_ids=[actual_vector_db_id], + content="valid document", + ) + + assert_valid_text_response(response) + content_text = " ".join([chunk.text for chunk in response.content]).lower() + assert "valid document" in content_text + + def test_rag_tool_insert_and_query(client_with_empty_registry, embedding_model_id, embedding_dimension): providers = [p for p in client_with_empty_registry.providers.list() if p.api == "vector_io"] assert len(providers) > 0 @@ -249,3 +353,107 @@ def test_rag_tool_insert_and_query(client_with_empty_registry, embedding_model_i "chunk_template": "This should raise a ValueError because it is missing the proper template variables", }, ) + + +def test_rag_tool_query_generation(client_with_empty_registry, embedding_model_id, embedding_dimension): + vector_db_id = "test_query_generation_db" + + client_with_empty_registry.vector_dbs.register( + vector_db_id=vector_db_id, + embedding_model=embedding_model_id, + embedding_dimension=embedding_dimension, + ) + + available_vector_dbs = [vector_db.identifier for vector_db in client_with_empty_registry.vector_dbs.list()] + actual_vector_db_id = available_vector_dbs[0] + + documents = [ + Document( + document_id="ai-doc", + content="Artificial intelligence and machine learning are transforming technology.", + metadata={"category": "AI"}, + ), + Document( + document_id="banana-doc", + content="Don't bring a banana to a knife fight.", + metadata={"category": "wisdom"}, + ), + ] + + client_with_empty_registry.tool_runtime.rag_tool.insert( + documents=documents, + vector_db_id=actual_vector_db_id, + chunk_size_in_tokens=256, + ) + + response = client_with_empty_registry.tool_runtime.rag_tool.query( + vector_db_ids=[actual_vector_db_id], + content="Tell me about AI", + ) + + assert_valid_text_response(response) + content_text = " ".join([chunk.text for chunk in response.content]).lower() + assert "artificial intelligence" in content_text or "machine learning" in content_text + + +def test_rag_tool_pdf_data_url_handling(client_with_empty_registry, embedding_model_id, embedding_dimension): + vector_db_id = "test_pdf_data_url_db" + + client_with_empty_registry.vector_dbs.register( + vector_db_id=vector_db_id, + embedding_model=embedding_model_id, + embedding_dimension=embedding_dimension, + ) + + available_vector_dbs = [vector_db.identifier for vector_db in client_with_empty_registry.vector_dbs.list()] + actual_vector_db_id = available_vector_dbs[0] + + sample_pdf = b"%PDF-1.3\n3 0 obj\n<>\nendobj\n4 0 obj\n<>\nstream\nx\x9c\x15\xcc1\x0e\x820\x18@\xe1\x9dS\xbcM]jk$\xd5\xd5(\x83!\x86\xa1\x17\xf8\xa3\xa5`LIh+\xd7W\xc6\xf7\r\xef\xc0\xbd\xd2\xaa\xb6,\xd5\xc5\xb1o\x0c\xa6VZ\xe3znn%\xf3o\xab\xb1\xe7\xa3:Y\xdc\x8bm\xeb\xf3&1\xc8\xd7\xd3\x97\xc82\xe6\x81\x87\xe42\xcb\x87Vb(\x12<\xdd<=}Jc\x0cL\x91\xee\xda$\xb5\xc3\xbd\xd7\xe9\x0f\x8d\x97 $\nendstream\nendobj\n1 0 obj\n<>\nendobj\n5 0 obj\n<>\nendobj\n2 0 obj\n<<\n/ProcSet [/PDF /Text /ImageB /ImageC /ImageI]\n/Font <<\n/F1 5 0 R\n>>\n/XObject <<\n>>\n>>\nendobj\n6 0 obj\n<<\n/Producer (PyFPDF 1.7.2 http://pyfpdf.googlecode.com/)\n/Title (This is a sample title.)\n/Author (Llama Stack Developers)\n/CreationDate 
(D:20250312165548)\n>>\nendobj\n7 0 obj\n<<\n/Type /Catalog\n/Pages 1 0 R\n/OpenAction [3 0 R /FitH null]\n/PageLayout /OneColumn\n>>\nendobj\nxref\n0 8\n0000000000 65535 f \n0000000272 00000 n \n0000000455 00000 n \n0000000009 00000 n \n0000000087 00000 n \n0000000359 00000 n \n0000000559 00000 n \n0000000734 00000 n \ntrailer\n<<\n/Size 8\n/Root 7 0 R\n/Info 6 0 R\n>>\nstartxref\n837\n%%EOF\n" + + import base64 + + pdf_base64 = base64.b64encode(sample_pdf).decode("utf-8") + pdf_data_url = f"data:application/pdf;base64,{pdf_base64}" + + documents = [ + Document( + document_id="test-pdf-data-url", + content=pdf_data_url, + metadata={"type": "pdf", "source": "data_url"}, + ), + ] + + client_with_empty_registry.tool_runtime.rag_tool.insert( + documents=documents, + vector_db_id=actual_vector_db_id, + chunk_size_in_tokens=256, + ) + + files_list = client_with_empty_registry.files.list() + assert len(files_list.data) >= 1, "PDF should have been uploaded to Files API" + + pdf_file = None + for file in files_list.data: + if file.filename and "test-pdf-data-url" in file.filename: + pdf_file = file + break + + assert pdf_file is not None, "PDF file should be found in Files API" + assert pdf_file.bytes == len(sample_pdf), f"File size should match original PDF ({len(sample_pdf)} bytes)" + + file_content = client_with_empty_registry.files.retrieve_content(pdf_file.id) + assert file_content.startswith(b"%PDF-"), "Retrieved file should be a valid PDF" + + vector_store_files = client_with_empty_registry.vector_io.openai_list_files_in_vector_store( + vector_store_id=actual_vector_db_id + ) + assert len(vector_store_files.data) >= 1, "PDF should be attached to vector store" + + response = client_with_empty_registry.tool_runtime.rag_tool.query( + vector_db_ids=[actual_vector_db_id], + content="sample title", + ) + + assert_valid_text_response(response) + content_text = " ".join([chunk.text for chunk in response.content]).lower() + assert "sample title" in content_text or "title" in content_text diff --git a/tests/unit/providers/utils/memory/test_vector_store.py b/tests/unit/providers/utils/memory/test_vector_store.py index 90b229262..590bdd1d2 100644 --- a/tests/unit/providers/utils/memory/test_vector_store.py +++ b/tests/unit/providers/utils/memory/test_vector_store.py @@ -178,3 +178,41 @@ def test_content_from_data_and_mime_type_both_encodings_fail(): # Should raise an exception instead of returning empty string with pytest.raises(UnicodeDecodeError): content_from_data_and_mime_type(data, mime_type) + + +async def test_memory_tool_error_handling(): + """Test that memory tool handles various failures gracefully without crashing.""" + from llama_stack.providers.inline.tool_runtime.rag.config import RagToolRuntimeConfig + from llama_stack.providers.inline.tool_runtime.rag.memory import MemoryToolRuntimeImpl + + config = RagToolRuntimeConfig() + memory_tool = MemoryToolRuntimeImpl( + config=config, + vector_io_api=AsyncMock(), + inference_api=AsyncMock(), + files_api=AsyncMock(), + ) + + docs = [ + RAGDocument(document_id="good_doc", content="Good content", metadata={}), + RAGDocument(document_id="bad_url_doc", content=URL(uri="https://bad.url"), metadata={}), + RAGDocument(document_id="another_good_doc", content="Another good content", metadata={}), + ] + + mock_file1 = MagicMock() + mock_file1.id = "file_good1" + mock_file2 = MagicMock() + mock_file2.id = "file_good2" + memory_tool.files_api.openai_upload_file.side_effect = [mock_file1, mock_file2] + + with patch("httpx.AsyncClient") as mock_client: + 
mock_instance = AsyncMock() + mock_instance.get.side_effect = Exception("Bad URL") + mock_client.return_value.__aenter__.return_value = mock_instance + + # won't raise exception despite one document failing + await memory_tool.insert(docs, "vector_store_123") + + # processed 2 documents successfully, skipped 1 + assert memory_tool.files_api.openai_upload_file.call_count == 2 + assert memory_tool.vector_io_api.openai_attach_file_to_vector_store.call_count == 2 From 8ef1189be7c6ea6e9fb2e3cf3f502123e0e4635a Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Thu, 11 Sep 2025 09:04:38 -0400 Subject: [PATCH 19/30] chore: update the vLLM inference impl to use OpenAIMixin for openai-compat functions (#3404) # What does this PR do? update vLLM inference provider to use OpenAIMixin for openai-compat functions inference recordings from Qwen3-0.6B and vLLM 0.8.3 - ``` docker run --gpus all -v ~/.cache/huggingface:/root/.cache/huggingface -p 8000:8000 --ipc=host \ vllm/vllm-openai:latest \ --model Qwen/Qwen3-0.6B --enable-auto-tool-choice --tool-call-parser hermes ``` ## Test Plan ``` ./scripts/integration-tests.sh --stack-config server:ci-tests --setup vllm --subdirs inference ``` --- .../providers/remote/inference/vllm/vllm.py | 197 +----------------- .../providers/utils/inference/openai_mixin.py | 28 ++- .../providers/inference/test_remote_vllm.py | 21 +- 3 files changed, 44 insertions(+), 202 deletions(-) diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py index 9e9a80ca5..77f5d82af 100644 --- a/llama_stack/providers/remote/inference/vllm/vllm.py +++ b/llama_stack/providers/remote/inference/vllm/vllm.py @@ -4,7 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
import json -from collections.abc import AsyncGenerator, AsyncIterator +from collections.abc import AsyncGenerator from typing import Any import httpx @@ -38,13 +38,6 @@ from llama_stack.apis.inference import ( LogProbConfig, Message, ModelStore, - OpenAIChatCompletion, - OpenAICompletion, - OpenAIEmbeddingData, - OpenAIEmbeddingsResponse, - OpenAIEmbeddingUsage, - OpenAIMessageParam, - OpenAIResponseFormatParam, ResponseFormat, SamplingParams, TextTruncation, @@ -71,11 +64,11 @@ from llama_stack.providers.utils.inference.openai_compat import ( convert_message_to_openai_dict, convert_tool_call, get_sampling_options, - prepare_openai_completion_params, process_chat_completion_stream_response, process_completion_response, process_completion_stream_response, ) +from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin from llama_stack.providers.utils.inference.prompt_adapter import ( completion_request_to_prompt, content_has_media, @@ -288,7 +281,7 @@ async def _process_vllm_chat_completion_stream_response( yield c -class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): +class VLLMInferenceAdapter(OpenAIMixin, Inference, ModelsProtocolPrivate): # automatically set by the resolver when instantiating the provider __provider_id__: str model_store: ModelStore | None = None @@ -296,7 +289,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): def __init__(self, config: VLLMInferenceAdapterConfig) -> None: self.register_helper = ModelRegistryHelper(build_hf_repo_model_entries()) self.config = config - self.client = None async def initialize(self) -> None: if not self.config.url: @@ -308,8 +300,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): return self.config.refresh_models async def list_models(self) -> list[Model] | None: - self._lazy_initialize_client() - assert self.client is not None # mypy models = [] async for m in self.client.models.list(): model_type = ModelType.llm # unclear how to determine embedding vs. llm models @@ -340,8 +330,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): HealthResponse: A dictionary containing the health status. 
""" try: - client = self._create_client() if self.client is None else self.client - _ = [m async for m in client.models.list()] # Ensure the client is initialized + _ = [m async for m in self.client.models.list()] # Ensure the client is initialized return HealthResponse(status=HealthStatus.OK) except Exception as e: return HealthResponse(status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}") @@ -351,19 +340,14 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): raise ValueError("Model store not set") return await self.model_store.get_model(model_id) - def _lazy_initialize_client(self): - if self.client is not None: - return + def get_api_key(self): + return self.config.api_token - log.info(f"Initializing vLLM client with base_url={self.config.url}") - self.client = self._create_client() + def get_base_url(self): + return self.config.url - def _create_client(self): - return AsyncOpenAI( - base_url=self.config.url, - api_key=self.config.api_token, - http_client=httpx.AsyncClient(verify=self.config.tls_verify), - ) + def get_extra_client_params(self): + return {"http_client": httpx.AsyncClient(verify=self.config.tls_verify)} async def completion( self, @@ -374,7 +358,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): stream: bool | None = False, logprobs: LogProbConfig | None = None, ) -> CompletionResponse | AsyncGenerator[CompletionResponseStreamChunk, None]: - self._lazy_initialize_client() if sampling_params is None: sampling_params = SamplingParams() model = await self._get_model(model_id) @@ -406,7 +389,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): logprobs: LogProbConfig | None = None, tool_config: ToolConfig | None = None, ) -> ChatCompletionResponse | AsyncGenerator[ChatCompletionResponseStreamChunk, None]: - self._lazy_initialize_client() if sampling_params is None: sampling_params = SamplingParams() model = await self._get_model(model_id) @@ -479,16 +461,12 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): yield chunk async def register_model(self, model: Model) -> Model: - # register_model is called during Llama Stack initialization, hence we cannot init self.client if not initialized yet. - # self.client should only be created after the initialization is complete to avoid asyncio cross-context errors. - # Changing this may lead to unpredictable behavior. - client = self._create_client() if self.client is None else self.client try: model = await self.register_helper.register_model(model) except ValueError: pass # Ignore statically unknown model, will check live listing try: - res = await client.models.list() + res = await self.client.models.list() except APIConnectionError as e: raise ValueError( f"Failed to connect to vLLM at {self.config.url}. Please check if vLLM is running and accessible at that URL." 
@@ -543,8 +521,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): output_dimension: int | None = None, task_type: EmbeddingTaskType | None = None, ) -> EmbeddingsResponse: - self._lazy_initialize_client() - assert self.client is not None model = await self._get_model(model_id) kwargs = {} @@ -560,154 +536,3 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): embeddings = [data.embedding for data in response.data] return EmbeddingsResponse(embeddings=embeddings) - - async def openai_embeddings( - self, - model: str, - input: str | list[str], - encoding_format: str | None = "float", - dimensions: int | None = None, - user: str | None = None, - ) -> OpenAIEmbeddingsResponse: - self._lazy_initialize_client() - assert self.client is not None - model_obj = await self._get_model(model) - assert model_obj.model_type == ModelType.embedding - - # Convert input to list if it's a string - input_list = [input] if isinstance(input, str) else input - - # Call vLLM embeddings endpoint with encoding_format - response = await self.client.embeddings.create( - model=model_obj.provider_resource_id, - input=input_list, - dimensions=dimensions, - encoding_format=encoding_format, - ) - - # Convert response to OpenAI format - data = [ - OpenAIEmbeddingData( - embedding=embedding_data.embedding, - index=i, - ) - for i, embedding_data in enumerate(response.data) - ] - - # Not returning actual token usage since vLLM doesn't provide it - usage = OpenAIEmbeddingUsage(prompt_tokens=-1, total_tokens=-1) - - return OpenAIEmbeddingsResponse( - data=data, - model=model_obj.provider_resource_id, - usage=usage, - ) - - async def openai_completion( - self, - model: str, - prompt: str | list[str] | list[int] | list[list[int]], - best_of: int | None = None, - echo: bool | None = None, - frequency_penalty: float | None = None, - logit_bias: dict[str, float] | None = None, - logprobs: bool | None = None, - max_tokens: int | None = None, - n: int | None = None, - presence_penalty: float | None = None, - seed: int | None = None, - stop: str | list[str] | None = None, - stream: bool | None = None, - stream_options: dict[str, Any] | None = None, - temperature: float | None = None, - top_p: float | None = None, - user: str | None = None, - guided_choice: list[str] | None = None, - prompt_logprobs: int | None = None, - suffix: str | None = None, - ) -> OpenAICompletion: - self._lazy_initialize_client() - model_obj = await self._get_model(model) - - extra_body: dict[str, Any] = {} - if prompt_logprobs is not None and prompt_logprobs >= 0: - extra_body["prompt_logprobs"] = prompt_logprobs - if guided_choice: - extra_body["guided_choice"] = guided_choice - - params = await prepare_openai_completion_params( - model=model_obj.provider_resource_id, - prompt=prompt, - best_of=best_of, - echo=echo, - frequency_penalty=frequency_penalty, - logit_bias=logit_bias, - logprobs=logprobs, - max_tokens=max_tokens, - n=n, - presence_penalty=presence_penalty, - seed=seed, - stop=stop, - stream=stream, - stream_options=stream_options, - temperature=temperature, - top_p=top_p, - user=user, - extra_body=extra_body, - ) - return await self.client.completions.create(**params) # type: ignore - - async def openai_chat_completion( - self, - model: str, - messages: list[OpenAIMessageParam], - frequency_penalty: float | None = None, - function_call: str | dict[str, Any] | None = None, - functions: list[dict[str, Any]] | None = None, - logit_bias: dict[str, float] | None = None, - logprobs: bool | None = None, - max_completion_tokens: 
int | None = None, - max_tokens: int | None = None, - n: int | None = None, - parallel_tool_calls: bool | None = None, - presence_penalty: float | None = None, - response_format: OpenAIResponseFormatParam | None = None, - seed: int | None = None, - stop: str | list[str] | None = None, - stream: bool | None = None, - stream_options: dict[str, Any] | None = None, - temperature: float | None = None, - tool_choice: str | dict[str, Any] | None = None, - tools: list[dict[str, Any]] | None = None, - top_logprobs: int | None = None, - top_p: float | None = None, - user: str | None = None, - ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]: - self._lazy_initialize_client() - model_obj = await self._get_model(model) - params = await prepare_openai_completion_params( - model=model_obj.provider_resource_id, - messages=messages, - frequency_penalty=frequency_penalty, - function_call=function_call, - functions=functions, - logit_bias=logit_bias, - logprobs=logprobs, - max_completion_tokens=max_completion_tokens, - max_tokens=max_tokens, - n=n, - parallel_tool_calls=parallel_tool_calls, - presence_penalty=presence_penalty, - response_format=response_format, - seed=seed, - stop=stop, - stream=stream, - stream_options=stream_options, - temperature=temperature, - tool_choice=tool_choice, - tools=tools, - top_logprobs=top_logprobs, - top_p=top_p, - user=user, - ) - return await self.client.chat.completions.create(**params) # type: ignore diff --git a/llama_stack/providers/utils/inference/openai_mixin.py b/llama_stack/providers/utils/inference/openai_mixin.py index f60deee6e..a3c0ffadc 100644 --- a/llama_stack/providers/utils/inference/openai_mixin.py +++ b/llama_stack/providers/utils/inference/openai_mixin.py @@ -67,6 +67,17 @@ class OpenAIMixin(ABC): """ pass + def get_extra_client_params(self) -> dict[str, Any]: + """ + Get any extra parameters to pass to the AsyncOpenAI client. + + Child classes can override this method to provide additional parameters + such as timeout settings, proxies, etc. + + :return: A dictionary of extra parameters + """ + return {} + @property def client(self) -> AsyncOpenAI: """ @@ -78,6 +89,7 @@ class OpenAIMixin(ABC): return AsyncOpenAI( api_key=self.get_api_key(), base_url=self.get_base_url(), + **self.get_extra_client_params(), ) async def _get_provider_model_id(self, model: str) -> str: @@ -124,10 +136,15 @@ class OpenAIMixin(ABC): """ Direct OpenAI completion API call. """ - if guided_choice is not None: - logger.warning("guided_choice is not supported by the OpenAI API. Ignoring.") - if prompt_logprobs is not None: - logger.warning("prompt_logprobs is not supported by the OpenAI API. 
Ignoring.") + # Handle parameters that are not supported by OpenAI API, but may be by the provider + # prompt_logprobs is supported by vLLM + # guided_choice is supported by vLLM + # TODO: test coverage + extra_body: dict[str, Any] = {} + if prompt_logprobs is not None and prompt_logprobs >= 0: + extra_body["prompt_logprobs"] = prompt_logprobs + if guided_choice: + extra_body["guided_choice"] = guided_choice # TODO: fix openai_completion to return type compatible with OpenAI's API response return await self.client.completions.create( # type: ignore[no-any-return] @@ -150,7 +167,8 @@ class OpenAIMixin(ABC): top_p=top_p, user=user, suffix=suffix, - ) + ), + extra_body=extra_body, ) async def openai_chat_completion( diff --git a/tests/unit/providers/inference/test_remote_vllm.py b/tests/unit/providers/inference/test_remote_vllm.py index ce0e930b1..a48af2a1d 100644 --- a/tests/unit/providers/inference/test_remote_vllm.py +++ b/tests/unit/providers/inference/test_remote_vllm.py @@ -11,7 +11,7 @@ import threading import time from http.server import BaseHTTPRequestHandler, HTTPServer from typing import Any -from unittest.mock import AsyncMock, MagicMock, patch +from unittest.mock import AsyncMock, MagicMock, PropertyMock, patch import pytest from openai.types.chat.chat_completion_chunk import ( @@ -150,10 +150,12 @@ async def test_tool_call_response(vllm_inference_adapter): """Verify that tool call arguments from a CompletionMessage are correctly converted into the expected JSON format.""" - # Patch the call to vllm so we can inspect the arguments sent were correct - with patch.object( - vllm_inference_adapter.client.chat.completions, "create", new_callable=AsyncMock - ) as mock_nonstream_completion: + # Patch the client property to avoid instantiating a real AsyncOpenAI client + with patch.object(VLLMInferenceAdapter, "client", new_callable=PropertyMock) as mock_create_client: + mock_client = MagicMock() + mock_client.chat.completions.create = AsyncMock() + mock_create_client.return_value = mock_client + messages = [ SystemMessage(content="You are a helpful assistant"), UserMessage(content="How many?"), @@ -179,7 +181,7 @@ async def test_tool_call_response(vllm_inference_adapter): tool_config=ToolConfig(tool_choice=ToolChoice.auto), ) - assert mock_nonstream_completion.call_args.kwargs["messages"][2]["tool_calls"] == [ + assert mock_client.chat.completions.create.call_args.kwargs["messages"][2]["tool_calls"] == [ { "id": "foo", "type": "function", @@ -641,9 +643,7 @@ async def test_health_status_success(vllm_inference_adapter): This test verifies that the health method returns a HealthResponse with status OK, only when the connection to the vLLM server is successful. """ - # Set vllm_inference_adapter.client to None to ensure _create_client is called - vllm_inference_adapter.client = None - with patch.object(vllm_inference_adapter, "_create_client") as mock_create_client: + with patch.object(VLLMInferenceAdapter, "client", new_callable=PropertyMock) as mock_create_client: # Create mock client and models mock_client = MagicMock() mock_models = MagicMock() @@ -674,8 +674,7 @@ async def test_health_status_failure(vllm_inference_adapter): This test verifies that the health method returns a HealthResponse with status ERROR and an appropriate error message when the connection to the vLLM server fails. 
""" - vllm_inference_adapter.client = None - with patch.object(vllm_inference_adapter, "_create_client") as mock_create_client: + with patch.object(VLLMInferenceAdapter, "client", new_callable=PropertyMock) as mock_create_client: # Create mock client and models mock_client = MagicMock() mock_models = MagicMock() From 72387b4bd229bba60b43f95679da62630fc0f3c7 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Thu, 11 Sep 2025 11:45:16 -0400 Subject: [PATCH 20/30] chore(unit tests): remove network use, update async test (#3418) # What does this PR do? update the async detection test for vllm - remove a network access from unit tests - remove direct logging use the idea behind the test is to mock inference w/ a sleep, initiate concurrent inference calls, verify the total execution time is close to the sleep time. in a non-async env the total time would be closer to sleep * num concurrent calls. ## Test Plan ci --- .../providers/inference/test_remote_vllm.py | 160 +++++++----------- 1 file changed, 60 insertions(+), 100 deletions(-) diff --git a/tests/unit/providers/inference/test_remote_vllm.py b/tests/unit/providers/inference/test_remote_vllm.py index a48af2a1d..61b16b5d1 100644 --- a/tests/unit/providers/inference/test_remote_vllm.py +++ b/tests/unit/providers/inference/test_remote_vllm.py @@ -6,11 +6,7 @@ import asyncio import json -import logging # allow-direct-logging -import threading import time -from http.server import BaseHTTPRequestHandler, HTTPServer -from typing import Any from unittest.mock import AsyncMock, MagicMock, PropertyMock, patch import pytest @@ -18,7 +14,7 @@ from openai.types.chat.chat_completion_chunk import ( ChatCompletionChunk as OpenAIChatCompletionChunk, ) from openai.types.chat.chat_completion_chunk import ( - Choice as OpenAIChoice, + Choice as OpenAIChoiceChunk, ) from openai.types.chat.chat_completion_chunk import ( ChoiceDelta as OpenAIChoiceDelta, @@ -35,6 +31,9 @@ from llama_stack.apis.inference import ( ChatCompletionRequest, ChatCompletionResponseEventType, CompletionMessage, + OpenAIAssistantMessageParam, + OpenAIChatCompletion, + OpenAIChoice, SystemMessage, ToolChoice, ToolConfig, @@ -61,41 +60,6 @@ from llama_stack.providers.remote.inference.vllm.vllm import ( # -v -s --tb=short --disable-warnings -class MockInferenceAdapterWithSleep: - def __init__(self, sleep_time: int, response: dict[str, Any]): - self.httpd = None - - class DelayedRequestHandler(BaseHTTPRequestHandler): - # ruff: noqa: N802 - def do_POST(self): - time.sleep(sleep_time) - response_body = json.dumps(response).encode("utf-8") - self.send_response(code=200) - self.send_header("Content-Type", "application/json") - self.send_header("Content-Length", len(response_body)) - self.end_headers() - self.wfile.write(response_body) - - self.request_handler = DelayedRequestHandler - - def __enter__(self): - httpd = HTTPServer(("", 0), self.request_handler) - self.httpd = httpd - host, port = httpd.server_address - httpd_thread = threading.Thread(target=httpd.serve_forever) - httpd_thread.daemon = True # stop server if this thread terminates - httpd_thread.start() - - config = VLLMInferenceAdapterConfig(url=f"http://{host}:{port}") - inference_adapter = VLLMInferenceAdapter(config) - return inference_adapter - - def __exit__(self, _exc_type, _exc_value, _traceback): - if self.httpd: - self.httpd.shutdown() - self.httpd.server_close() - - @pytest.fixture(scope="module") def mock_openai_models_list(): with patch("openai.resources.models.AsyncModels.list", new_callable=AsyncMock) as mock_list: 
@@ -201,7 +165,7 @@ async def test_tool_call_delta_empty_tool_call_buf(): async def mock_stream(): delta = OpenAIChoiceDelta(content="", tool_calls=None) - choices = [OpenAIChoice(delta=delta, finish_reason="stop", index=0)] + choices = [OpenAIChoiceChunk(delta=delta, finish_reason="stop", index=0)] mock_chunk = OpenAIChatCompletionChunk( id="chunk-1", created=1, @@ -227,7 +191,7 @@ async def test_tool_call_delta_streaming_arguments_dict(): model="foo", object="chat.completion.chunk", choices=[ - OpenAIChoice( + OpenAIChoiceChunk( delta=OpenAIChoiceDelta( content="", tool_calls=[ @@ -252,7 +216,7 @@ async def test_tool_call_delta_streaming_arguments_dict(): model="foo", object="chat.completion.chunk", choices=[ - OpenAIChoice( + OpenAIChoiceChunk( delta=OpenAIChoiceDelta( content="", tool_calls=[ @@ -277,7 +241,9 @@ async def test_tool_call_delta_streaming_arguments_dict(): model="foo", object="chat.completion.chunk", choices=[ - OpenAIChoice(delta=OpenAIChoiceDelta(content="", tool_calls=None), finish_reason="tool_calls", index=0) + OpenAIChoiceChunk( + delta=OpenAIChoiceDelta(content="", tool_calls=None), finish_reason="tool_calls", index=0 + ) ], ) for chunk in [mock_chunk_1, mock_chunk_2, mock_chunk_3]: @@ -301,7 +267,7 @@ async def test_multiple_tool_calls(): model="foo", object="chat.completion.chunk", choices=[ - OpenAIChoice( + OpenAIChoiceChunk( delta=OpenAIChoiceDelta( content="", tool_calls=[ @@ -326,7 +292,7 @@ async def test_multiple_tool_calls(): model="foo", object="chat.completion.chunk", choices=[ - OpenAIChoice( + OpenAIChoiceChunk( delta=OpenAIChoiceDelta( content="", tool_calls=[ @@ -351,7 +317,9 @@ async def test_multiple_tool_calls(): model="foo", object="chat.completion.chunk", choices=[ - OpenAIChoice(delta=OpenAIChoiceDelta(content="", tool_calls=None), finish_reason="tool_calls", index=0) + OpenAIChoiceChunk( + delta=OpenAIChoiceDelta(content="", tool_calls=None), finish_reason="tool_calls", index=0 + ) ], ) for chunk in [mock_chunk_1, mock_chunk_2, mock_chunk_3]: @@ -395,59 +363,6 @@ async def test_process_vllm_chat_completion_stream_response_no_choices(): assert chunks[0].event.event_type.value == "start" -@pytest.mark.allow_network -def test_chat_completion_doesnt_block_event_loop(caplog): - loop = asyncio.new_event_loop() - loop.set_debug(True) - caplog.set_level(logging.WARNING) - - # Log when event loop is blocked for more than 200ms - loop.slow_callback_duration = 0.5 - # Sleep for 500ms in our delayed http response - sleep_time = 0.5 - - mock_model = Model(identifier="mock-model", provider_resource_id="mock-model", provider_id="vllm-inference") - mock_response = { - "id": "chatcmpl-abc123", - "object": "chat.completion", - "created": 1, - "modle": "mock-model", - "choices": [ - { - "message": {"content": ""}, - "logprobs": None, - "finish_reason": "stop", - "index": 0, - } - ], - } - - async def do_chat_completion(): - await inference_adapter.chat_completion( - "mock-model", - [], - stream=False, - tools=None, - tool_config=ToolConfig(tool_choice=ToolChoice.auto), - ) - - with MockInferenceAdapterWithSleep(sleep_time, mock_response) as inference_adapter: - inference_adapter.model_store = AsyncMock() - inference_adapter.model_store.get_model.return_value = mock_model - loop.run_until_complete(inference_adapter.initialize()) - - # Clear the logs so far and run the actual chat completion we care about - caplog.clear() - loop.run_until_complete(do_chat_completion()) - - # Ensure we don't have any asyncio warnings in the captured log - # records from our chat 
completion call. A message gets logged - # here any time we exceed the slow_callback_duration configured - # above. - asyncio_warnings = [record.message for record in caplog.records if record.name == "asyncio"] - assert not asyncio_warnings - - async def test_get_params_empty_tools(vllm_inference_adapter): request = ChatCompletionRequest( tools=[], @@ -696,3 +611,48 @@ async def test_health_status_failure(vllm_inference_adapter): assert "Health check failed: Connection failed" in health_response["message"] mock_models.list.assert_called_once() + + +async def test_openai_chat_completion_is_async(vllm_inference_adapter): + """ + Verify that openai_chat_completion is async and doesn't block the event loop. + + To do this we mock the underlying inference with a sleep, start multiple + inference calls in parallel, and ensure the total time taken is less + than the sum of the individual sleep times. + """ + sleep_time = 0.5 + + async def mock_create(*args, **kwargs): + await asyncio.sleep(sleep_time) + return OpenAIChatCompletion( + id="chatcmpl-abc123", + created=1, + model="mock-model", + choices=[ + OpenAIChoice( + message=OpenAIAssistantMessageParam( + content="nothing interesting", + ), + finish_reason="stop", + index=0, + ) + ], + ) + + async def do_inference(): + await vllm_inference_adapter.openai_chat_completion( + "mock-model", messages=["one fish", "two fish"], stream=False + ) + + with patch.object(VLLMInferenceAdapter, "client", new_callable=PropertyMock) as mock_create_client: + mock_client = MagicMock() + mock_client.chat.completions.create = AsyncMock(side_effect=mock_create) + mock_create_client.return_value = mock_client + + start_time = time.time() + await asyncio.gather(do_inference(), do_inference(), do_inference(), do_inference()) + total_time = time.time() - start_time + + assert mock_create_client.call_count == 4 # no cheating + assert total_time < (sleep_time * 2), f"Total time taken: {total_time}s exceeded expected max" From c7ef1f13df981622216833578c70d98f702d9cc6 Mon Sep 17 00:00:00 2001 From: slekkala1 Date: Thu, 11 Sep 2025 11:10:41 -0700 Subject: [PATCH 21/30] feat: Add langchain llamastack Integration example notebook (#3314) # What does this PR do? The notebook was reverted(https://github.com/llamastack/llama-stack/pull/3259) as it had some local paths, I missed correcting. Trying with corrections now ## Test Plan Ran the Jupyter notebook --- .../langchain/Llama_Stack_LangChain.ipynb | 701 ++++++++++++++++++ 1 file changed, 701 insertions(+) create mode 100644 docs/notebooks/langchain/Llama_Stack_LangChain.ipynb diff --git a/docs/notebooks/langchain/Llama_Stack_LangChain.ipynb b/docs/notebooks/langchain/Llama_Stack_LangChain.ipynb new file mode 100644 index 000000000..d44ac6994 --- /dev/null +++ b/docs/notebooks/langchain/Llama_Stack_LangChain.ipynb @@ -0,0 +1,701 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1ztegmwm4sp", + "metadata": {}, + "source": [ + "## LlamaStack + LangChain Integration Tutorial\n", + "\n", + "This notebook demonstrates how to integrate **LlamaStack** with **LangChain** to build a complete RAG (Retrieval-Augmented Generation) system.\n", + "\n", + "### Overview\n", + "\n", + "- **LlamaStack**: Provides the infrastructure for running LLMs and Open AI Compatible Vector Stores\n", + "- **LangChain**: Provides the framework for chaining operations and prompt templates\n", + "- **Integration**: Uses LlamaStack's OpenAI-compatible API with LangChain\n", + "\n", + "### What You'll See\n", + "\n", + "1. 
Setting up LlamaStack server with Fireworks AI provider\n", + "2. Creating and Querying Vector Stores\n", + "3. Building RAG chains with LangChain + LLAMAStack\n", + "4. Querying the chain for relevant information\n", + "\n", + "### Prerequisites\n", + "\n", + "- Fireworks API key\n", + "\n", + "---\n", + "\n", + "### 1. Installation and Setup" + ] + }, + { + "cell_type": "markdown", + "id": "2ktr5ls2cas", + "metadata": {}, + "source": [ + "#### Install Required Dependencies\n", + "\n", + "First, we install all the necessary packages for LangChain and FastAPI integration." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "5b6a6a17-b931-4bea-8273-0d6e5563637a", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: uv in /Users/swapna942/miniconda3/lib/python3.12/site-packages (0.7.20)\n", + "\u001b[2mUsing Python 3.12.11 environment at: /Users/swapna942/miniconda3\u001b[0m\n", + "\u001b[2mAudited \u001b[1m7 packages\u001b[0m \u001b[2min 42ms\u001b[0m\u001b[0m\n" + ] + } + ], + "source": [ + "!pip install uv\n", + "!uv pip install fastapi uvicorn \"langchain>=0.2\" langchain-openai \\\n", + " langchain-community langchain-text-splitters \\\n", + " faiss-cpu" + ] + }, + { + "cell_type": "markdown", + "id": "wmt9jvqzh7n", + "metadata": {}, + "source": [ + "### 2. LlamaStack Server Setup\n", + "\n", + "#### Build and Start LlamaStack Server\n", + "\n", + "This section sets up the LlamaStack server with:\n", + "- **Fireworks AI** as the inference provider\n", + "- **Sentence Transformers** for embeddings\n", + "\n", + "The server runs on `localhost:8321` and provides OpenAI-compatible endpoints." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "dd2dacf3-ec8b-4cc7-8ff4-b5b6ea4a6e9e", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import os\n", + "import subprocess\n", + "import time\n", + "\n", + "# Remove UV_SYSTEM_PYTHON to ensure uv creates a proper virtual environment\n", + "# instead of trying to use system Python globally, which could cause permission issues\n", + "# and package conflicts with the system's Python installation\n", + "if \"UV_SYSTEM_PYTHON\" in os.environ:\n", + " del os.environ[\"UV_SYSTEM_PYTHON\"]\n", + "\n", + "def run_llama_stack_server_background():\n", + " \"\"\"Build and run LlamaStack server in one step using --run flag\"\"\"\n", + " log_file = open(\"llama_stack_server.log\", \"w\")\n", + " process = subprocess.Popen(\n", + " \"uv run --with llama-stack llama stack build --distro starter --image-type venv --run\",\n", + " shell=True,\n", + " stdout=log_file,\n", + " stderr=log_file,\n", + " text=True,\n", + " )\n", + "\n", + " print(f\"Building and starting Llama Stack server with PID: {process.pid}\")\n", + " return process\n", + "\n", + "\n", + "def wait_for_server_to_start():\n", + " import requests\n", + " from requests.exceptions import ConnectionError\n", + "\n", + " url = \"http://0.0.0.0:8321/v1/health\"\n", + " max_retries = 30\n", + " retry_interval = 1\n", + "\n", + " print(\"Waiting for server to start\", end=\"\")\n", + " for _ in range(max_retries):\n", + " try:\n", + " response = requests.get(url)\n", + " if response.status_code == 200:\n", + " print(\"\\nServer is ready!\")\n", + " return True\n", + " except ConnectionError:\n", + " print(\".\", end=\"\", flush=True)\n", + " time.sleep(retry_interval)\n", + "\n", + " print(\"\\nServer failed to start after\", max_retries * retry_interval, 
\"seconds\")\n", + " return False\n", + "\n", + "\n", + "def kill_llama_stack_server():\n", + " # Kill any existing llama stack server processes using pkill command\n", + " os.system(\"pkill -f llama_stack.core.server.server\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "28bd8dbd-4576-4e76-813f-21ab94db44a2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Building and starting Llama Stack server with PID: 19747\n", + "Waiting for server to start....\n", + "Server is ready!\n" + ] + } + ], + "source": [ + "server_process = run_llama_stack_server_background()\n", + "assert wait_for_server_to_start()" + ] + }, + { + "cell_type": "markdown", + "id": "gr9cdcg4r7n", + "metadata": {}, + "source": [ + "#### Install LlamaStack Client\n", + "\n", + "Install the client library to interact with the LlamaStack server." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "487d2dbc-d071-400e-b4f0-dcee58f8dc95", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2mUsing Python 3.12.11 environment at: /Users/swapna942/miniconda3\u001b[0m\n", + "\u001b[2mAudited \u001b[1m1 package\u001b[0m \u001b[2min 27ms\u001b[0m\u001b[0m\n" + ] + } + ], + "source": [ + "!uv pip install llama_stack_client" + ] + }, + { + "cell_type": "markdown", + "id": "0j5hag7l9x89", + "metadata": {}, + "source": [ + "### 3. Initialize LlamaStack Client\n", + "\n", + "Create a client connection to the LlamaStack server with API keys for different providers:\n", + "\n", + "- **Fireworks API Key**: For Fireworks models\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ab4eff97-4565-4c73-b1b3-0020a4c7e2a5", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_stack_client import LlamaStackClient\n", + "\n", + "client = LlamaStackClient(\n", + " base_url=\"http://0.0.0.0:8321\",\n", + " provider_data={\"fireworks_api_key\": \"***\"},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "vwhexjy1e8o", + "metadata": {}, + "source": [ + "#### Explore Available Models and Safety Features\n", + "\n", + "Check what models and safety shields are available through your LlamaStack instance." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "880443ef-ac3c-48b1-a80a-7dab5b25ac61", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: GET http://0.0.0.0:8321/v1/models \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: GET http://0.0.0.0:8321/v1/shields \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Available Fireworks models:\n", + "- fireworks/accounts/fireworks/models/llama-v3p1-8b-instruct\n", + "- fireworks/accounts/fireworks/models/llama-v3p1-70b-instruct\n", + "- fireworks/accounts/fireworks/models/llama-v3p1-405b-instruct\n", + "- fireworks/accounts/fireworks/models/llama-v3p2-3b-instruct\n", + "- fireworks/accounts/fireworks/models/llama-v3p2-11b-vision-instruct\n", + "- fireworks/accounts/fireworks/models/llama-v3p2-90b-vision-instruct\n", + "- fireworks/accounts/fireworks/models/llama-v3p3-70b-instruct\n", + "- fireworks/accounts/fireworks/models/llama4-scout-instruct-basic\n", + "- fireworks/accounts/fireworks/models/llama4-maverick-instruct-basic\n", + "- fireworks/nomic-ai/nomic-embed-text-v1.5\n", + "- fireworks/accounts/fireworks/models/llama-guard-3-8b\n", + "- fireworks/accounts/fireworks/models/llama-guard-3-11b-vision\n", + "----\n", + "Available shields (safety models):\n", + "code-scanner\n", + "llama-guard\n", + "nemo-guardrail\n", + "----\n" + ] + } + ], + "source": [ + "print(\"Available Fireworks models:\")\n", + "for m in client.models.list():\n", + " if m.identifier.startswith(\"fireworks/\"):\n", + " print(f\"- {m.identifier}\")\n", + "\n", + "print(\"----\")\n", + "print(\"Available shields (safety models):\")\n", + "for s in client.shields.list():\n", + " print(s.identifier)\n", + "print(\"----\")" + ] + }, + { + "cell_type": "markdown", + "id": "gojp7at31ht", + "metadata": {}, + "source": [ + "### 4. 
Vector Store Setup\n", + "\n", + "#### Create a Vector Store with File Upload\n", + "\n", + "Create a vector store using the OpenAI-compatible vector stores API:\n", + "\n", + "- **Vector Store**: OpenAI-compatible vector store for document storage\n", + "- **File Upload**: Automatic chunking and embedding of uploaded files \n", + "- **Embedding Model**: Sentence Transformers model for text embeddings\n", + "- **Dimensions**: 384-dimensional embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "be2c2899-ea53-4e5f-b6b8-ed425f5d6572", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/files \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/files \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/files \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File(id='file-54652c95c56c4c34918a97d7ff8a4320', bytes=41, created_at=1757442621, expires_at=1788978621, filename='shipping_policy.txt', object='file', purpose='assistants')\n", + "File(id='file-fb1227c1d1854da1bd774d21e5b7e41c', bytes=48, created_at=1757442621, expires_at=1788978621, filename='returns_policy.txt', object='file', purpose='assistants')\n", + "File(id='file-673f874852fe42798675a13d06a256e2', bytes=45, created_at=1757442621, expires_at=1788978621, filename='support.txt', object='file', purpose='assistants')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/vector_stores \"HTTP/1.1 200 OK\"\n" + ] + } + ], + "source": [ + "from io import BytesIO\n", + "\n", + "docs = [\n", + " (\"Acme ships globally in 3-5 business days.\", {\"title\": \"Shipping Policy\"}),\n", + " (\"Returns are accepted within 30 days of purchase.\", {\"title\": \"Returns Policy\"}),\n", + " (\"Support is available 24/7 via chat and email.\", {\"title\": \"Support\"}),\n", + "]\n", + "\n", + "file_ids = []\n", + "for content, metadata in docs:\n", + " with BytesIO(content.encode()) as file_buffer:\n", + " file_buffer.name = f\"{metadata['title'].replace(' ', '_').lower()}.txt\"\n", + " create_file_response = client.files.create(file=file_buffer, purpose=\"assistants\")\n", + " print(create_file_response)\n", + " file_ids.append(create_file_response.id)\n", + "\n", + "# Create vector store with files\n", + "vector_store = client.vector_stores.create(\n", + " name=\"acme_docs\",\n", + " file_ids=file_ids,\n", + " embedding_model=\"sentence-transformers/all-MiniLM-L6-v2\",\n", + " embedding_dimension=384,\n", + " provider_id=\"faiss\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "9061tmi1zpq", + "metadata": {}, + "source": [ + "#### Test Vector Store Search\n", + "\n", + "Query the vector store. This performs semantic search to find relevant documents based on the query." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ba9d1901-bd5e-4216-b3e6-19dc74551cc6", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/vector_stores/vs_708c060b-45da-423e-8354-68529b4fd1a6/search \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Acme ships globally in 3-5 business days.\n", + "Returns are accepted within 30 days of purchase.\n" + ] + } + ], + "source": [ + "search_response = client.vector_stores.search(\n", + " vector_store_id=vector_store.id,\n", + " query=\"How long does shipping take?\",\n", + " max_num_results=2\n", + ")\n", + "for result in search_response.data:\n", + " content = result.content[0].text\n", + " print(content)" + ] + }, + { + "cell_type": "markdown", + "id": "usne6mbspms", + "metadata": {}, + "source": [ + "### 5. LangChain Integration\n", + "\n", + "#### Configure LangChain with LlamaStack\n", + "\n", + "Set up LangChain to use LlamaStack's OpenAI-compatible API:\n", + "\n", + "- **Base URL**: Points to LlamaStack's OpenAI endpoint\n", + "- **Headers**: Include Fireworks API key for model access\n", + "- **Model**: Use Meta Llama v3p1 8b instruct model for inference" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "c378bd10-09c2-417c-bdfc-1e0a2dd19084", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "# Point LangChain to Llamastack Server\n", + "llm = ChatOpenAI(\n", + " base_url=\"http://0.0.0.0:8321/v1/openai/v1\",\n", + " api_key=\"dummy\",\n", + " model=\"fireworks/accounts/fireworks/models/llama-v3p1-8b-instruct\",\n", + " default_headers={\"X-LlamaStack-Provider-Data\": '{\"fireworks_api_key\": \"***\"}'},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "5a4ddpcuk3l", + "metadata": {}, + "source": [ + "#### Test LLM Connection\n", + "\n", + "Verify that LangChain can successfully communicate with the LlamaStack server." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f88ffb5a-657b-4916-9375-c6ddc156c25e", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "data": { + "text/plain": [ + "AIMessage(content=\"A llama's gentle eyes shine bright,\\nIn the Andes, it roams through morning light.\", additional_kwargs={'refusal': None}, response_metadata={'token_usage': None, 'model_name': 'fireworks/accounts/fireworks/models/llama-v3p1-8b-instruct', 'system_fingerprint': None, 'id': 'chatcmpl-602b5967-82a3-476b-9cd2-7d3b29b76ee8', 'service_tier': None, 'finish_reason': 'stop', 'logprobs': None}, id='run--0933c465-ff4d-4a7b-b7fb-fd97dd8244f3-0')" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Test llm with simple message\n", + "messages = [\n", + " {\"role\": \"system\", \"content\": \"You are a friendly assistant.\"},\n", + " {\"role\": \"user\", \"content\": \"Write a two-sentence poem about llama.\"},\n", + "]\n", + "llm.invoke(messages)" + ] + }, + { + "cell_type": "markdown", + "id": "0xh0jg6a0l4a", + "metadata": {}, + "source": [ + "### 6. Building the RAG Chain\n", + "\n", + "#### Create a Complete RAG Pipeline\n", + "\n", + "Build a LangChain pipeline that combines:\n", + "\n", + "1. 
**Vector Search**: Query LlamaStack's Open AI compatible Vector Store\n", + "2. **Context Assembly**: Format retrieved documents\n", + "3. **Prompt Template**: Structure the input for the LLM\n", + "4. **LLM Generation**: Generate answers using context\n", + "5. **Output Parsing**: Extract the final response\n", + "\n", + "**Chain Flow**: `Query → Vector Search → Context + Question → LLM → Response`" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "9684427d-dcc7-4544-9af5-8b110d014c42", + "metadata": {}, + "outputs": [], + "source": [ + "# LangChain for prompt template and chaining + LLAMA Stack Client Vector DB and LLM chat completion\n", + "from langchain_core.output_parsers import StrOutputParser\n", + "from langchain_core.prompts import ChatPromptTemplate\n", + "from langchain_core.runnables import RunnableLambda, RunnablePassthrough\n", + "\n", + "\n", + "def join_docs(docs):\n", + " return \"\\n\\n\".join([f\"[{d.filename}] {d.content[0].text}\" for d in docs.data])\n", + "\n", + "PROMPT = ChatPromptTemplate.from_messages(\n", + " [\n", + " (\"system\", \"You are a helpful assistant. Use the following context to answer.\"),\n", + " (\"user\", \"Question: {question}\\n\\nContext:\\n{context}\"),\n", + " ]\n", + ")\n", + "\n", + "vector_step = RunnableLambda(\n", + " lambda x: client.vector_stores.search(\n", + " vector_store_id=vector_store.id,\n", + " query=x,\n", + " max_num_results=2\n", + " )\n", + " )\n", + "\n", + "chain = (\n", + " {\"context\": vector_step | RunnableLambda(join_docs), \"question\": RunnablePassthrough()}\n", + " | PROMPT\n", + " | llm\n", + " | StrOutputParser()\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "0onu6rhphlra", + "metadata": {}, + "source": [ + "### 7. Testing the RAG System\n", + "\n", + "#### Example 1: Shipping Query" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "03322188-9509-446a-a4a8-ce3bb83ec87c", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/vector_stores/vs_708c060b-45da-423e-8354-68529b4fd1a6/search \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "❓ How long does shipping take?\n", + "💡 Acme ships globally in 3-5 business days. This means that shipping typically takes between 3 to 5 working days from the date of dispatch or order fulfillment.\n" + ] + } + ], + "source": [ + "query = \"How long does shipping take?\"\n", + "response = chain.invoke(query)\n", + "print(\"❓\", query)\n", + "print(\"💡\", response)" + ] + }, + { + "cell_type": "markdown", + "id": "b7krhqj88ku", + "metadata": {}, + "source": [ + "#### Example 2: Returns Policy Query" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "61995550-bb0b-46a8-a5d0-023207475d60", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/vector_stores/vs_708c060b-45da-423e-8354-68529b4fd1a6/search \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "❓ Can I return a product after 40 days?\n", + "💡 Based on the provided context, you cannot return a product after 40 days. 
The return window is limited to 30 days from the date of purchase.\n" + ] + } + ], + "source": [ + "query = \"Can I return a product after 40 days?\"\n", + "response = chain.invoke(query)\n", + "print(\"❓\", query)\n", + "print(\"💡\", response)" + ] + }, + { + "cell_type": "markdown", + "id": "h4w24fadvjs", + "metadata": {}, + "source": [ + "---\n", + "We have successfully built a RAG system that combines:\n", + "\n", + "- **LlamaStack** for infrastructure (LLM serving + Vector Store)\n", + "- **LangChain** for orchestration (prompts + chains)\n", + "- **Fireworks** for high-quality language models\n", + "\n", + "### Key Benefits\n", + "\n", + "1. **Unified Infrastructure**: Single server for LLMs and Vector Store\n", + "2. **OpenAI Compatibility**: Easy integration with existing LangChain code\n", + "3. **Multi-Provider Support**: Switch between different LLM providers\n", + "4. **Production Ready**: Built-in safety shields and monitoring\n", + "\n", + "### Next Steps\n", + "\n", + "- Add more sophisticated document processing\n", + "- Implement conversation memory\n", + "- Add safety filtering and monitoring\n", + "- Scale to larger document collections\n", + "- Integrate with web frameworks like FastAPI or Streamlit\n", + "\n", + "---\n", + "\n", + "##### 🔧 Cleanup\n", + "\n", + "Don't forget to stop the LlamaStack server when you're done:\n", + "\n", + "```python\n", + "kill_llama_stack_server()\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "15647c46-22ce-4698-af3f-8161329d8e3a", + "metadata": {}, + "outputs": [], + "source": [ + "kill_llama_stack_server()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 69a52213a190bddcf118bb13206353ff4b30d33d Mon Sep 17 00:00:00 2001 From: Charlie Doern Date: Thu, 11 Sep 2025 16:30:09 -0400 Subject: [PATCH 22/30] fix: oasdiff enhancements and stability (#3419) # What does this PR do? only run conformance tests when the spec is changed. Also, cache oasdiff such that it is not installed every time the test is run Signed-off-by: Charlie Doern --- .github/workflows/conformance.yml | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/.github/workflows/conformance.yml b/.github/workflows/conformance.yml index c0a7795a3..c7962c93d 100644 --- a/.github/workflows/conformance.yml +++ b/.github/workflows/conformance.yml @@ -13,11 +13,8 @@ on: branches: [ main ] types: [opened, synchronize, reopened] paths: - - 'llama_stack/**' - - '!llama_stack/ui/**' - - 'tests/**' - - 'uv.lock' - - 'pyproject.toml' + - 'docs/_static/llama-stack-spec.yaml' + - 'docs/_static/llama-stack-spec.html' - '.github/workflows/conformance.yml' # This workflow itself concurrency: @@ -43,10 +40,27 @@ jobs: ref: ${{ github.event.pull_request.base.ref }} path: 'base' + # Cache oasdiff to avoid checksum failures and speed up builds + - name: Cache oasdiff + id: cache-oasdiff + uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 + with: + path: ~/oasdiff + key: oasdiff-${{ runner.os }} + # Install oasdiff: https://github.com/oasdiff/oasdiff, a tool for detecting breaking changes in OpenAPI specs. 
- name: Install oasdiff + if: steps.cache-oasdiff.outputs.cache-hit != 'true' run: | curl -fsSL https://raw.githubusercontent.com/oasdiff/oasdiff/main/install.sh | sh + cp /usr/local/bin/oasdiff ~/oasdiff + + # Setup cached oasdiff + - name: Setup cached oasdiff + if: steps.cache-oasdiff.outputs.cache-hit == 'true' + run: | + sudo cp ~/oasdiff /usr/local/bin/oasdiff + sudo chmod +x /usr/local/bin/oasdiff # Run oasdiff to detect breaking changes in the API specification # This step will fail if incompatible changes are detected, preventing breaking changes from being merged From d31e641d6902dd1f43a3cc034af31a58ac135425 Mon Sep 17 00:00:00 2001 From: Akram Ben Aissi Date: Fri, 12 Sep 2025 10:10:59 +0100 Subject: [PATCH 23/30] fix: Improve pre-commit workflow error handling and feedback (#3400) # What does this PR do? fix: Improve pre-commit workflow error handling and feedback - Add explicit step to check pre-commit results and provide clear error messages - Improve verification steps with better error messages and file listings - Use GitHub Actions annotations (::error:: and ::warning::) for better visibility - Maintain continue-on-error for pre-commit step but add proper failure handling This addresses the issue where pre-commit failures were silent but still caused workflow failures later, making it difficult to understand what needed to be fixed. ## Test Plan Signed-off-by: Akram Ben Aissi --- .github/workflows/pre-commit.yml | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 000208043..b5845be53 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -47,11 +47,21 @@ jobs: run: npm ci working-directory: llama_stack/ui - - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 + - name: Run pre-commit + id: precommit + uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 + continue-on-error: true env: SKIP: no-commit-to-branch RUFF_OUTPUT_FORMAT: github + - name: Check pre-commit results + if: steps.precommit.outcome == 'failure' + run: | + echo "::error::Pre-commit hooks failed. Please run 'pre-commit run --all-files' locally and commit the fixes." + echo "::warning::Some pre-commit hooks failed. Check the output above for details." + exit 1 + - name: Debug run: | echo "github.ref: ${{ github.ref }}" @@ -79,17 +89,23 @@ jobs: echo "No changes to commit" fi - - name: Verify if there are any diff files after pre-commit + - name: Verify no uncommitted changes if: github.actor != 'dependabot[bot]' run: | - git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1) + if ! git diff --exit-code; then + echo "::error::There are uncommitted changes after pre-commit. Please run 'pre-commit run --all-files' locally and commit the fixes." + echo "::warning::Files with changes:" + git diff --name-status + exit 1 + fi - name: Verify if there are any new files after pre-commit if: github.actor != 'dependabot[bot]' run: | unstaged_files=$(git ls-files --others --exclude-standard) if [ -n "$unstaged_files" ]; then - echo "There are uncommitted new files, run pre-commit locally and commit again" + echo "::error::There are new untracked files after pre-commit. Please run 'pre-commit run --all-files' locally and commit the fixes." 
+ echo "::warning::New files:" echo "$unstaged_files" exit 1 fi From f67081d2d6088a0d3175baffad94977ddf8f6483 Mon Sep 17 00:00:00 2001 From: Doug Edgar Date: Fri, 12 Sep 2025 02:18:19 -0700 Subject: [PATCH 24/30] feat: migrate to FIPS-validated cryptographic algorithms (#3423) # What does this PR do? Migrates MD5 and SHA-1 hash algorithms to SHA-256. In particular, replaces: - MD5 in chunk ID generation. - MD5 in file verification. - SHA-1 in model identifier digests. And updates all related test expectations. Original discussion: https://github.com/llamastack/llama-stack/discussions/3413 Closes #3424. ## Test Plan Unit tests from scripts/unit-tests.sh were updated to match the new hash output, and ran to verify the tests pass. Signed-off-by: Doug Edgar --- llama_stack/cli/verify_download.py | 17 +++++++---------- .../providers/utils/vector_io/vector_utils.py | 6 ++---- llama_stack/testing/inference_recorder.py | 2 +- .../providers/vector_io/test_vector_utils.py | 12 ++++++------ 4 files changed, 16 insertions(+), 21 deletions(-) diff --git a/llama_stack/cli/verify_download.py b/llama_stack/cli/verify_download.py index b7f4cfdb5..e738abb4f 100644 --- a/llama_stack/cli/verify_download.py +++ b/llama_stack/cli/verify_download.py @@ -48,15 +48,12 @@ def setup_verify_download_parser(parser: argparse.ArgumentParser) -> None: parser.set_defaults(func=partial(run_verify_cmd, parser=parser)) -def calculate_md5(filepath: Path, chunk_size: int = 8192) -> str: - # NOTE: MD5 is used here only for download integrity verification, - # not for security purposes - # TODO: switch to SHA256 - md5_hash = hashlib.md5(usedforsecurity=False) +def calculate_sha256(filepath: Path, chunk_size: int = 8192) -> str: + sha256_hash = hashlib.sha256() with open(filepath, "rb") as f: for chunk in iter(lambda: f.read(chunk_size), b""): - md5_hash.update(chunk) - return md5_hash.hexdigest() + sha256_hash.update(chunk) + return sha256_hash.hexdigest() def load_checksums(checklist_path: Path) -> dict[str, str]: @@ -64,10 +61,10 @@ def load_checksums(checklist_path: Path) -> dict[str, str]: with open(checklist_path) as f: for line in f: if line.strip(): - md5sum, filepath = line.strip().split(" ", 1) + sha256sum, filepath = line.strip().split(" ", 1) # Remove leading './' if present filepath = filepath.lstrip("./") - checksums[filepath] = md5sum + checksums[filepath] = sha256sum return checksums @@ -88,7 +85,7 @@ def verify_files(model_dir: Path, checksums: dict[str, str], console: Console) - matches = False if exists: - actual_hash = calculate_md5(full_path) + actual_hash = calculate_sha256(full_path) matches = actual_hash == expected_hash results.append( diff --git a/llama_stack/providers/utils/vector_io/vector_utils.py b/llama_stack/providers/utils/vector_io/vector_utils.py index e55ac75ae..324f35405 100644 --- a/llama_stack/providers/utils/vector_io/vector_utils.py +++ b/llama_stack/providers/utils/vector_io/vector_utils.py @@ -12,14 +12,12 @@ import uuid def generate_chunk_id(document_id: str, chunk_text: str, chunk_window: str | None = None) -> str: """ Generate a unique chunk ID using a hash of the document ID and chunk text. - - Note: MD5 is used only to calculate an identifier, not for security purposes. - Adding usedforsecurity=False for compatibility with FIPS environments. + Then use the first 32 characters of the hash to create a UUID. 
""" hash_input = f"{document_id}:{chunk_text}".encode() if chunk_window: hash_input += f":{chunk_window}".encode() - return str(uuid.UUID(hashlib.md5(hash_input, usedforsecurity=False).hexdigest())) + return str(uuid.UUID(hashlib.sha256(hash_input).hexdigest()[:32])) def proper_case(s: str) -> str: diff --git a/llama_stack/testing/inference_recorder.py b/llama_stack/testing/inference_recorder.py index e78f493a6..6f017c51d 100644 --- a/llama_stack/testing/inference_recorder.py +++ b/llama_stack/testing/inference_recorder.py @@ -211,7 +211,7 @@ def _model_identifiers_digest(endpoint: str, response: dict[str, Any]) -> str: return sorted(set(idents)) identifiers = _extract_model_identifiers() - return hashlib.sha1(("|".join(identifiers)).encode("utf-8")).hexdigest()[:8] + return hashlib.sha256(("|".join(identifiers)).encode("utf-8")).hexdigest()[:8] def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]]) -> dict[str, Any] | None: diff --git a/tests/unit/providers/vector_io/test_vector_utils.py b/tests/unit/providers/vector_io/test_vector_utils.py index a5d803a82..10ebe5bfb 100644 --- a/tests/unit/providers/vector_io/test_vector_utils.py +++ b/tests/unit/providers/vector_io/test_vector_utils.py @@ -26,9 +26,9 @@ def test_generate_chunk_id(): chunk_ids = sorted([chunk.chunk_id for chunk in chunks]) assert chunk_ids == [ - "177a1368-f6a8-0c50-6e92-18677f2c3de3", - "bc744db3-1b25-0a9c-cdff-b6ba3df73c36", - "f68df25d-d9aa-ab4d-5684-64a233add20d", + "31d1f9a3-c8d2-66e7-3c37-af2acd329778", + "d07dade7-29c0-cda7-df29-0249a1dcbc3e", + "d14f75a1-5855-7f72-2c78-d9fc4275a346", ] @@ -36,14 +36,14 @@ def test_generate_chunk_id_with_window(): chunk = Chunk(content="test", metadata={"document_id": "doc-1"}) chunk_id1 = generate_chunk_id("doc-1", chunk, chunk_window="0-1") chunk_id2 = generate_chunk_id("doc-1", chunk, chunk_window="1-2") - assert chunk_id1 == "149018fe-d0eb-0f8d-5f7f-726bdd2aeedb" - assert chunk_id2 == "4562c1ee-9971-1f3b-51a6-7d05e5211154" + assert chunk_id1 == "8630321a-d9cb-2bb6-cd28-ebf68dafd866" + assert chunk_id2 == "13a1c09a-cbda-b61a-2d1a-7baa90888685" def test_chunk_id(): # Test with existing chunk ID chunk_with_id = Chunk(content="test", metadata={"document_id": "existing-id"}) - assert chunk_with_id.chunk_id == "84ededcc-b80b-a83e-1a20-ca6515a11350" + assert chunk_with_id.chunk_id == "11704f92-42b6-61df-bf85-6473e7708fbd" # Test with document ID in metadata chunk_with_doc_id = Chunk(content="test", metadata={"document_id": "doc-1"}) From 3de9ad0a87d7bfad50ab23c859cebcaf06b6911b Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Fri, 12 Sep 2025 17:59:56 -0400 Subject: [PATCH 25/30] chore(recorder, tests): add test for openai /v1/models (#3426) # What does this PR do? 
- [x] adds a test for the recorder's handling of /v1/models - [x] adds a fix for /v1/models handling ## Test Plan ci --- llama_stack/testing/inference_recorder.py | 60 ++++++++++--------- .../distribution/test_inference_recordings.py | 51 ++++++++++++++-- 2 files changed, 79 insertions(+), 32 deletions(-) diff --git a/llama_stack/testing/inference_recorder.py b/llama_stack/testing/inference_recorder.py index 6f017c51d..745160976 100644 --- a/llama_stack/testing/inference_recorder.py +++ b/llama_stack/testing/inference_recorder.py @@ -7,6 +7,7 @@ from __future__ import annotations # for forward references import hashlib +import inspect import json import os from collections.abc import Generator @@ -198,16 +199,11 @@ def _model_identifiers_digest(endpoint: str, response: dict[str, Any]) -> str: Supported endpoints: - '/api/tags' (Ollama): response body has 'models': [ { name/model/digest/id/... }, ... ] - - '/v1/models' (OpenAI): response body has 'data': [ { id: ... }, ... ] + - '/v1/models' (OpenAI): response body is: [ { id: ... }, ... ] Returns a list of unique identifiers or None if structure doesn't match. """ - body = response["body"] - if endpoint == "/api/tags": - items = body.get("models") - idents = [m.model for m in items] - else: - items = body.get("data") - idents = [m.id for m in items] + items = response["body"] + idents = [m.model if endpoint == "/api/tags" else m.id for m in items] return sorted(set(idents)) identifiers = _extract_model_identifiers() @@ -219,28 +215,22 @@ def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]]) seen: dict[str, dict[str, Any]] = {} for rec in records: body = rec["response"]["body"] - if endpoint == "/api/tags": - items = body.models - elif endpoint == "/v1/models": - items = body.data - else: - items = [] - - for m in items: - if endpoint == "/v1/models": + if endpoint == "/v1/models": + for m in body: key = m.id - else: + seen[key] = m + elif endpoint == "/api/tags": + for m in body.models: key = m.model - seen[key] = m + seen[key] = m ordered = [seen[k] for k in sorted(seen.keys())] canonical = records[0] canonical_req = canonical.get("request", {}) if isinstance(canonical_req, dict): canonical_req["endpoint"] = endpoint - if endpoint == "/v1/models": - body = {"data": ordered, "object": "list"} - else: + body = ordered + if endpoint == "/api/tags": from ollama import ListResponse body = ListResponse(models=ordered) @@ -252,7 +242,10 @@ async def _patched_inference_method(original_method, self, client_type, endpoint if _current_mode == InferenceMode.LIVE or _current_storage is None: # Normal operation - return await original_method(self, *args, **kwargs) + if inspect.iscoroutinefunction(original_method): + return await original_method(self, *args, **kwargs) + else: + return original_method(self, *args, **kwargs) # Get base URL based on client type if client_type == "openai": @@ -300,7 +293,14 @@ async def _patched_inference_method(original_method, self, client_type, endpoint ) elif _current_mode == InferenceMode.RECORD: - response = await original_method(self, *args, **kwargs) + if inspect.iscoroutinefunction(original_method): + response = await original_method(self, *args, **kwargs) + else: + response = original_method(self, *args, **kwargs) + + # we want to store the result of the iterator, not the iterator itself + if endpoint == "/v1/models": + response = [m async for m in response] request_data = { "method": method, @@ -380,10 +380,14 @@ def patch_inference_clients(): _original_methods["embeddings_create"], 
self, "openai", "/v1/embeddings", *args, **kwargs ) - async def patched_models_list(self, *args, **kwargs): - return await _patched_inference_method( - _original_methods["models_list"], self, "openai", "/v1/models", *args, **kwargs - ) + def patched_models_list(self, *args, **kwargs): + async def _iter(): + for item in await _patched_inference_method( + _original_methods["models_list"], self, "openai", "/v1/models", *args, **kwargs + ): + yield item + + return _iter() # Apply OpenAI patches AsyncChatCompletions.create = patched_chat_completions_create diff --git a/tests/unit/distribution/test_inference_recordings.py b/tests/unit/distribution/test_inference_recordings.py index c69cf319b..94fd2536e 100644 --- a/tests/unit/distribution/test_inference_recordings.py +++ b/tests/unit/distribution/test_inference_recordings.py @@ -6,10 +6,11 @@ import tempfile from pathlib import Path -from unittest.mock import patch +from unittest.mock import AsyncMock, Mock, patch import pytest from openai import AsyncOpenAI +from openai.types.model import Model as OpenAIModel # Import the real Pydantic response types instead of using Mocks from llama_stack.apis.inference import ( @@ -158,7 +159,9 @@ class TestInferenceRecording: return real_openai_chat_response temp_storage_dir = temp_storage_dir / "test_recording_mode" - with patch("openai.resources.chat.completions.AsyncCompletions.create", side_effect=mock_create): + with patch( + "openai.resources.chat.completions.AsyncCompletions.create", new_callable=AsyncMock, side_effect=mock_create + ): with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") @@ -184,7 +187,9 @@ class TestInferenceRecording: temp_storage_dir = temp_storage_dir / "test_replay_mode" # First, record a response - with patch("openai.resources.chat.completions.AsyncCompletions.create", side_effect=mock_create): + with patch( + "openai.resources.chat.completions.AsyncCompletions.create", new_callable=AsyncMock, side_effect=mock_create + ): with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") @@ -213,6 +218,42 @@ class TestInferenceRecording: # Verify the original method was NOT called mock_create_patch.assert_not_called() + async def test_replay_mode_models(self, temp_storage_dir): + """Test that replay mode returns stored responses without making real model listing calls.""" + + async def _async_iterator(models): + for model in models: + yield model + + models = [ + OpenAIModel(id="foo", created=1, object="model", owned_by="test"), + OpenAIModel(id="bar", created=2, object="model", owned_by="test"), + ] + + expected_ids = {m.id for m in models} + + temp_storage_dir = temp_storage_dir / "test_replay_mode_models" + + # baseline - mock works without recording + client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + client.models._get_api_list = Mock(return_value=_async_iterator(models)) + assert {m.id async for m in client.models.list()} == expected_ids + client.models._get_api_list.assert_called_once() + + # record the call + with inference_recording(mode=InferenceMode.RECORD, storage_dir=temp_storage_dir): + client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + client.models._get_api_list = Mock(return_value=_async_iterator(models)) + assert {m.id async for m in client.models.list()} == expected_ids + 
client.models._get_api_list.assert_called_once() + + # replay the call + with inference_recording(mode=InferenceMode.REPLAY, storage_dir=temp_storage_dir): + client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + client.models._get_api_list = Mock(return_value=_async_iterator(models)) + assert {m.id async for m in client.models.list()} == expected_ids + client.models._get_api_list.assert_not_called() + async def test_replay_missing_recording(self, temp_storage_dir): """Test that replay mode fails when no recording is found.""" temp_storage_dir = temp_storage_dir / "test_replay_missing_recording" @@ -233,7 +274,9 @@ class TestInferenceRecording: temp_storage_dir = temp_storage_dir / "test_embeddings_recording" # Record - with patch("openai.resources.embeddings.AsyncEmbeddings.create", side_effect=mock_create): + with patch( + "openai.resources.embeddings.AsyncEmbeddings.create", new_callable=AsyncMock, side_effect=mock_create + ): with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") From 8cf2128b40195634c4024e3c797eceaaa4da19bc Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Sat, 13 Sep 2025 12:28:04 -0400 Subject: [PATCH 26/30] chore(tests): always show slowest tests (#3431) # What does this PR do? help developers identify slow tests by always passing --duration to pytest ## Test Plan n/a --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 72c4f6f9e..ce95b758f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -354,6 +354,7 @@ warn_required_dynamic_aliases = true classmethod-decorators = ["classmethod", "pydantic.field_validator"] [tool.pytest.ini_options] +addopts = ["--durations=10"] asyncio_mode = "auto" markers = [ "allow_network: Allow network access for specific unit tests", From 6787755c0c8af6b59322352f985cffb224aadd3b Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Sat, 13 Sep 2025 14:11:38 -0400 Subject: [PATCH 27/30] chore(recorder): add support for NOT_GIVEN (#3430) # What does this PR do? the recorder mocks the openai-python interface. the openai-python interface allows NOT_GIVEN as an input option. this change properly handles NOT_GIVEN. 
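For context, `NOT_GIVEN` is the sentinel object the openai-python client passes for optional parameters the caller never set, so anything that hashes or serializes request kwargs (as the recorder does) has to filter it out first. A minimal sketch of that filtering idea; the `normalize_request` helper here is hypothetical, not the recorder's actual API:

```python
from openai import NOT_GIVEN


def normalize_request(endpoint: str, **kwargs) -> dict:
    """Drop parameters the caller left unset before recording a request."""
    # NOT_GIVEN is a singleton sentinel, so identity comparison is the right
    # test; checking `v is not None` would wrongly keep NOT_GIVEN values.
    given = {k: v for k, v in kwargs.items() if v is not NOT_GIVEN}
    return {"endpoint": endpoint, "kwargs": given}


# An unset `user` disappears from the recorded request:
# normalize_request("/v1/chat/completions", temperature=0.7, user=NOT_GIVEN)
# -> {"endpoint": "/v1/chat/completions", "kwargs": {"temperature": 0.7}}
```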
## Test Plan ci (coverage for chat, completions, embeddings) --- llama_stack/testing/inference_recorder.py | 5 ++ .../distribution/test_inference_recordings.py | 65 ++++++++++++++++++- 2 files changed, 67 insertions(+), 3 deletions(-) diff --git a/llama_stack/testing/inference_recorder.py b/llama_stack/testing/inference_recorder.py index 745160976..f899d73d3 100644 --- a/llama_stack/testing/inference_recorder.py +++ b/llama_stack/testing/inference_recorder.py @@ -16,6 +16,8 @@ from enum import StrEnum from pathlib import Path from typing import Any, Literal, cast +from openai import NOT_GIVEN + from llama_stack.log import get_logger logger = get_logger(__name__, category="testing") @@ -250,6 +252,9 @@ async def _patched_inference_method(original_method, self, client_type, endpoint # Get base URL based on client type if client_type == "openai": base_url = str(self._client.base_url) + + # the OpenAI client methods may pass NOT_GIVEN for unset parameters; filter these out + kwargs = {k: v for k, v in kwargs.items() if v is not NOT_GIVEN} elif client_type == "ollama": # Get base URL from the client (Ollama client uses host attribute) base_url = getattr(self, "host", "http://localhost:11434") diff --git a/tests/unit/distribution/test_inference_recordings.py b/tests/unit/distribution/test_inference_recordings.py index 94fd2536e..4909bbe1e 100644 --- a/tests/unit/distribution/test_inference_recordings.py +++ b/tests/unit/distribution/test_inference_recordings.py @@ -9,7 +9,7 @@ from pathlib import Path from unittest.mock import AsyncMock, Mock, patch import pytest -from openai import AsyncOpenAI +from openai import NOT_GIVEN, AsyncOpenAI from openai.types.model import Model as OpenAIModel # Import the real Pydantic response types instead of using Mocks @@ -17,6 +17,7 @@ from llama_stack.apis.inference import ( OpenAIAssistantMessageParam, OpenAIChatCompletion, OpenAIChoice, + OpenAICompletion, OpenAIEmbeddingData, OpenAIEmbeddingsResponse, OpenAIEmbeddingUsage, @@ -170,6 +171,7 @@ class TestInferenceRecording: messages=[{"role": "user", "content": "Hello, how are you?"}], temperature=0.7, max_tokens=50, + user=NOT_GIVEN, ) # Verify the response was returned correctly @@ -198,6 +200,7 @@ class TestInferenceRecording: messages=[{"role": "user", "content": "Hello, how are you?"}], temperature=0.7, max_tokens=50, + user=NOT_GIVEN, ) # Now test replay mode - should not call the original method @@ -281,7 +284,11 @@ class TestInferenceRecording: client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") response = await client.embeddings.create( - model="nomic-embed-text", input=["Hello world", "Test embedding"] + model=real_embeddings_response.model, + input=["Hello world", "Test embedding"], + encoding_format=NOT_GIVEN, + dimensions=NOT_GIVEN, + user=NOT_GIVEN, ) assert len(response.data) == 2 @@ -292,7 +299,8 @@ class TestInferenceRecording: client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") response = await client.embeddings.create( - model="nomic-embed-text", input=["Hello world", "Test embedding"] + model=real_embeddings_response.model, + input=["Hello world", "Test embedding"], ) # Verify we got the recorded response @@ -302,6 +310,57 @@ class TestInferenceRecording: # Verify original method was not called mock_create_patch.assert_not_called() + async def test_completions_recording(self, temp_storage_dir): + real_completions_response = OpenAICompletion( + id="test_completion", + object="text_completion", + created=1234567890, + model="llama3.2:3b", + 
choices=[ + { + "text": "Hello! I'm doing well, thank you for asking.", + "index": 0, + "logprobs": None, + "finish_reason": "stop", + } + ], + ) + + async def mock_create(*args, **kwargs): + return real_completions_response + + temp_storage_dir = temp_storage_dir / "test_completions_recording" + + # Record + with patch( + "openai.resources.completions.AsyncCompletions.create", new_callable=AsyncMock, side_effect=mock_create + ): + with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): + client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + + response = await client.completions.create( + model=real_completions_response.model, + prompt="Hello, how are you?", + temperature=0.7, + max_tokens=50, + user=NOT_GIVEN, + ) + + assert response.choices[0].text == real_completions_response.choices[0].text + + # Replay + with patch("openai.resources.completions.AsyncCompletions.create") as mock_create_patch: + with inference_recording(mode=InferenceMode.REPLAY, storage_dir=str(temp_storage_dir)): + client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + response = await client.completions.create( + model=real_completions_response.model, + prompt="Hello, how are you?", + temperature=0.7, + max_tokens=50, + ) + assert response.choices[0].text == real_completions_response.choices[0].text + mock_create_patch.assert_not_called() + async def test_live_mode(self, real_openai_chat_response): """Test that live mode passes through to original methods.""" From 36fd97e306d14cbb5eba7c18ce93dcb05bdf9206 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 15 Sep 2025 09:46:05 +0200 Subject: [PATCH 28/30] chore(ui-deps): bump next from 15.3.3 to 15.5.3 in /llama_stack/ui (#3438) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [next](https://github.com/vercel/next.js) from 15.3.3 to 15.5.3.
Release notes

Sourced from next's releases.

v15.5.3

[!NOTE]
This release is backporting bug fixes. It does not include all pending features/changes on canary.

Core Changes

  • fix: validation return types of pages API routes (#83069)
  • fix: relative paths in dev in validator.ts (#83073)
  • fix: remove satisfies keyword from type validation to preserve old TS compatibility (#83071)

Credits

Huge thanks to @​bgub for helping!

v15.5.2

[!NOTE]
This release is backporting bug fixes. It does not include all pending features/changes on canary.

Core Changes

  • fix: disable unknownatrules lint rule entirely (#83059)
  • revert: add ?dpl to fonts in /_next/static/media (#83062)

Credits

Huge thanks to @bgub and @ztanner for helping!

v15.5.1

[!NOTE]
This release is backporting bug fixes. It does not include all pending features/changes on canary.

Core Changes

  • fix: aliased navigations should apply scroll handling (#82900)
  • Turbopack: fix invalid NFT entry with file behind symlink (#82887)
  • fix: typesafe linking to route handlers and pages API routes (#82858)
  • fix: change "noUnknownAtRules" to "warn" for Biome (#82974)
  • fix: add path normalization to getRelativePath for Windows (#82918)
  • feat: add typesafety with config.typedRoutes to redirect() and permanentRedirect() (#82860)
  • fix: avoid importing types that will be unused (#82856)
  • fix: update the config.api.responseLimit type (#82852)
  • fix: update validation return types (#82854)

Credits

Huge thanks to @bgub, @mischnic, and @ztanner for helping!

v15.5.1-canary.39

Core Changes

  • [metadata] change the metadata routes params to promises: #83560

... (truncated)

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=next&package-manager=npm_and_yarn&previous-version=15.3.3&new-version=15.5.3)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. ---
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- llama_stack/ui/package-lock.json | 370 +++++++++++++++++-------------- llama_stack/ui/package.json | 2 +- 2 files changed, 199 insertions(+), 173 deletions(-) diff --git a/llama_stack/ui/package-lock.json b/llama_stack/ui/package-lock.json index e2c0815fd..ff73fa2e8 100644 --- a/llama_stack/ui/package-lock.json +++ b/llama_stack/ui/package-lock.json @@ -20,7 +20,7 @@ "framer-motion": "^12.23.12", "llama-stack-client": "^0.2.21", "lucide-react": "^0.542.0", - "next": "15.3.3", + "next": "15.5.3", "next-auth": "^4.24.11", "next-themes": "^0.4.6", "react": "^19.0.0", @@ -664,9 +664,9 @@ } }, "node_modules/@emnapi/runtime": { - "version": "1.4.3", - "resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.4.3.tgz", - "integrity": "sha512-pBPWdu6MLKROBX05wSNKcNb++m5Er+KQ9QkB+WVM+pW2Kx9hoSrVTnu3BdkI5eBLZoKu/J6mW/B6i6bJB2ytXQ==", + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.5.0.tgz", + "integrity": "sha512-97/BJ3iXHww3djw6hYIfErCZFee7qCtrneuLa20UXFCOTCfBM2cvQHjWJ2EG0s0MtdNwInarqCTz35i4wWXHsQ==", "license": "MIT", "optional": true, "dependencies": { @@ -927,9 +927,9 @@ } }, "node_modules/@img/sharp-darwin-arm64": { - "version": "0.34.1", - "resolved": "https://registry.npmjs.org/@img/sharp-darwin-arm64/-/sharp-darwin-arm64-0.34.1.tgz", - "integrity": "sha512-pn44xgBtgpEbZsu+lWf2KNb6OAf70X68k+yk69Ic2Xz11zHR/w24/U49XT7AeRwJ0Px+mhALhU5LPci1Aymk7A==", + "version": "0.34.3", + "resolved": "https://registry.npmjs.org/@img/sharp-darwin-arm64/-/sharp-darwin-arm64-0.34.3.tgz", + "integrity": "sha512-ryFMfvxxpQRsgZJqBd4wsttYQbCxsJksrv9Lw/v798JcQ8+w84mBWuXwl+TT0WJ/WrYOLaYpwQXi3sA9nTIaIg==", "cpu": [ "arm64" ], @@ -945,13 +945,13 @@ "url": "https://opencollective.com/libvips" }, "optionalDependencies": { - "@img/sharp-libvips-darwin-arm64": "1.1.0" + "@img/sharp-libvips-darwin-arm64": "1.2.0" } }, "node_modules/@img/sharp-darwin-x64": { - "version": "0.34.1", - "resolved": "https://registry.npmjs.org/@img/sharp-darwin-x64/-/sharp-darwin-x64-0.34.1.tgz", - "integrity": "sha512-VfuYgG2r8BpYiOUN+BfYeFo69nP/MIwAtSJ7/Zpxc5QF3KS22z8Pvg3FkrSFJBPNQ7mmcUcYQFBmEQp7eu1F8Q==", + "version": "0.34.3", + "resolved": "https://registry.npmjs.org/@img/sharp-darwin-x64/-/sharp-darwin-x64-0.34.3.tgz", + "integrity": "sha512-yHpJYynROAj12TA6qil58hmPmAwxKKC7reUqtGLzsOHfP7/rniNGTL8tjWX6L3CTV4+5P4ypcS7Pp+7OB+8ihA==", "cpu": [ "x64" ], @@ -967,13 +967,13 @@ "url": "https://opencollective.com/libvips" }, "optionalDependencies": { - "@img/sharp-libvips-darwin-x64": "1.1.0" + "@img/sharp-libvips-darwin-x64": "1.2.0" } }, "node_modules/@img/sharp-libvips-darwin-arm64": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-arm64/-/sharp-libvips-darwin-arm64-1.1.0.tgz", - "integrity": "sha512-HZ/JUmPwrJSoM4DIQPv/BfNh9yrOA8tlBbqbLz4JZ5uew2+o22Ik+tHQJcih7QJuSa0zo5coHTfD5J8inqj9DA==", + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-arm64/-/sharp-libvips-darwin-arm64-1.2.0.tgz", + "integrity": "sha512-sBZmpwmxqwlqG9ueWFXtockhsxefaV6O84BMOrhtg/YqbTaRdqDE7hxraVE3y6gVM4eExmfzW4a8el9ArLeEiQ==", "cpu": [ "arm64" ], @@ -987,9 +987,9 @@ } }, "node_modules/@img/sharp-libvips-darwin-x64": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-x64/-/sharp-libvips-darwin-x64-1.1.0.tgz", - "integrity": 
"sha512-Xzc2ToEmHN+hfvsl9wja0RlnXEgpKNmftriQp6XzY/RaSfwD9th+MSh0WQKzUreLKKINb3afirxW7A0fz2YWuQ==", + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-x64/-/sharp-libvips-darwin-x64-1.2.0.tgz", + "integrity": "sha512-M64XVuL94OgiNHa5/m2YvEQI5q2cl9d/wk0qFTDVXcYzi43lxuiFTftMR1tOnFQovVXNZJ5TURSDK2pNe9Yzqg==", "cpu": [ "x64" ], @@ -1003,9 +1003,9 @@ } }, "node_modules/@img/sharp-libvips-linux-arm": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm/-/sharp-libvips-linux-arm-1.1.0.tgz", - "integrity": "sha512-s8BAd0lwUIvYCJyRdFqvsj+BJIpDBSxs6ivrOPm/R7piTs5UIwY5OjXrP2bqXC9/moGsyRa37eYWYCOGVXxVrA==", + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm/-/sharp-libvips-linux-arm-1.2.0.tgz", + "integrity": "sha512-mWd2uWvDtL/nvIzThLq3fr2nnGfyr/XMXlq8ZJ9WMR6PXijHlC3ksp0IpuhK6bougvQrchUAfzRLnbsen0Cqvw==", "cpu": [ "arm" ], @@ -1019,9 +1019,9 @@ } }, "node_modules/@img/sharp-libvips-linux-arm64": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm64/-/sharp-libvips-linux-arm64-1.1.0.tgz", - "integrity": "sha512-IVfGJa7gjChDET1dK9SekxFFdflarnUB8PwW8aGwEoF3oAsSDuNUTYS+SKDOyOJxQyDC1aPFMuRYLoDInyV9Ew==", + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm64/-/sharp-libvips-linux-arm64-1.2.0.tgz", + "integrity": "sha512-RXwd0CgG+uPRX5YYrkzKyalt2OJYRiJQ8ED/fi1tq9WQW2jsQIn0tqrlR5l5dr/rjqq6AHAxURhj2DVjyQWSOA==", "cpu": [ "arm64" ], @@ -1035,9 +1035,9 @@ } }, "node_modules/@img/sharp-libvips-linux-ppc64": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-ppc64/-/sharp-libvips-linux-ppc64-1.1.0.tgz", - "integrity": "sha512-tiXxFZFbhnkWE2LA8oQj7KYR+bWBkiV2nilRldT7bqoEZ4HiDOcePr9wVDAZPi/Id5fT1oY9iGnDq20cwUz8lQ==", + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-ppc64/-/sharp-libvips-linux-ppc64-1.2.0.tgz", + "integrity": "sha512-Xod/7KaDDHkYu2phxxfeEPXfVXFKx70EAFZ0qyUdOjCcxbjqyJOEUpDe6RIyaunGxT34Anf9ue/wuWOqBW2WcQ==", "cpu": [ "ppc64" ], @@ -1051,9 +1051,9 @@ } }, "node_modules/@img/sharp-libvips-linux-s390x": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-s390x/-/sharp-libvips-linux-s390x-1.1.0.tgz", - "integrity": "sha512-xukSwvhguw7COyzvmjydRb3x/09+21HykyapcZchiCUkTThEQEOMtBj9UhkaBRLuBrgLFzQ2wbxdeCCJW/jgJA==", + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-s390x/-/sharp-libvips-linux-s390x-1.2.0.tgz", + "integrity": "sha512-eMKfzDxLGT8mnmPJTNMcjfO33fLiTDsrMlUVcp6b96ETbnJmd4uvZxVJSKPQfS+odwfVaGifhsB07J1LynFehw==", "cpu": [ "s390x" ], @@ -1067,9 +1067,9 @@ } }, "node_modules/@img/sharp-libvips-linux-x64": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-x64/-/sharp-libvips-linux-x64-1.1.0.tgz", - "integrity": "sha512-yRj2+reB8iMg9W5sULM3S74jVS7zqSzHG3Ol/twnAAkAhnGQnpjj6e4ayUz7V+FpKypwgs82xbRdYtchTTUB+Q==", + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-x64/-/sharp-libvips-linux-x64-1.2.0.tgz", + "integrity": "sha512-ZW3FPWIc7K1sH9E3nxIGB3y3dZkpJlMnkk7z5tu1nSkBoCgw2nSRTFHI5pB/3CQaJM0pdzMF3paf9ckKMSE9Tg==", "cpu": [ "x64" ], @@ -1083,9 +1083,9 @@ } }, "node_modules/@img/sharp-libvips-linuxmusl-arm64": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-arm64/-/sharp-libvips-linuxmusl-arm64-1.1.0.tgz", - "integrity": 
"sha512-jYZdG+whg0MDK+q2COKbYidaqW/WTz0cc1E+tMAusiDygrM4ypmSCjOJPmFTvHHJ8j/6cAGyeDWZOsK06tP33w==", + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-arm64/-/sharp-libvips-linuxmusl-arm64-1.2.0.tgz", + "integrity": "sha512-UG+LqQJbf5VJ8NWJ5Z3tdIe/HXjuIdo4JeVNADXBFuG7z9zjoegpzzGIyV5zQKi4zaJjnAd2+g2nna8TZvuW9Q==", "cpu": [ "arm64" ], @@ -1099,9 +1099,9 @@ } }, "node_modules/@img/sharp-libvips-linuxmusl-x64": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-x64/-/sharp-libvips-linuxmusl-x64-1.1.0.tgz", - "integrity": "sha512-wK7SBdwrAiycjXdkPnGCPLjYb9lD4l6Ze2gSdAGVZrEL05AOUJESWU2lhlC+Ffn5/G+VKuSm6zzbQSzFX/P65A==", + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-x64/-/sharp-libvips-linuxmusl-x64-1.2.0.tgz", + "integrity": "sha512-SRYOLR7CXPgNze8akZwjoGBoN1ThNZoqpOgfnOxmWsklTGVfJiGJoC/Lod7aNMGA1jSsKWM1+HRX43OP6p9+6Q==", "cpu": [ "x64" ], @@ -1115,9 +1115,9 @@ } }, "node_modules/@img/sharp-linux-arm": { - "version": "0.34.1", - "resolved": "https://registry.npmjs.org/@img/sharp-linux-arm/-/sharp-linux-arm-0.34.1.tgz", - "integrity": "sha512-anKiszvACti2sGy9CirTlNyk7BjjZPiML1jt2ZkTdcvpLU1YH6CXwRAZCA2UmRXnhiIftXQ7+Oh62Ji25W72jA==", + "version": "0.34.3", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-arm/-/sharp-linux-arm-0.34.3.tgz", + "integrity": "sha512-oBK9l+h6KBN0i3dC8rYntLiVfW8D8wH+NPNT3O/WBHeW0OQWCjfWksLUaPidsrDKpJgXp3G3/hkmhptAW0I3+A==", "cpu": [ "arm" ], @@ -1133,13 +1133,13 @@ "url": "https://opencollective.com/libvips" }, "optionalDependencies": { - "@img/sharp-libvips-linux-arm": "1.1.0" + "@img/sharp-libvips-linux-arm": "1.2.0" } }, "node_modules/@img/sharp-linux-arm64": { - "version": "0.34.1", - "resolved": "https://registry.npmjs.org/@img/sharp-linux-arm64/-/sharp-linux-arm64-0.34.1.tgz", - "integrity": "sha512-kX2c+vbvaXC6vly1RDf/IWNXxrlxLNpBVWkdpRq5Ka7OOKj6nr66etKy2IENf6FtOgklkg9ZdGpEu9kwdlcwOQ==", + "version": "0.34.3", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-arm64/-/sharp-linux-arm64-0.34.3.tgz", + "integrity": "sha512-QdrKe3EvQrqwkDrtuTIjI0bu6YEJHTgEeqdzI3uWJOH6G1O8Nl1iEeVYRGdj1h5I21CqxSvQp1Yv7xeU3ZewbA==", "cpu": [ "arm64" ], @@ -1155,13 +1155,35 @@ "url": "https://opencollective.com/libvips" }, "optionalDependencies": { - "@img/sharp-libvips-linux-arm64": "1.1.0" + "@img/sharp-libvips-linux-arm64": "1.2.0" + } + }, + "node_modules/@img/sharp-linux-ppc64": { + "version": "0.34.3", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-ppc64/-/sharp-linux-ppc64-0.34.3.tgz", + "integrity": "sha512-GLtbLQMCNC5nxuImPR2+RgrviwKwVql28FWZIW1zWruy6zLgA5/x2ZXk3mxj58X/tszVF69KK0Is83V8YgWhLA==", + "cpu": [ + "ppc64" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-linux-ppc64": "1.2.0" } }, "node_modules/@img/sharp-linux-s390x": { - "version": "0.34.1", - "resolved": "https://registry.npmjs.org/@img/sharp-linux-s390x/-/sharp-linux-s390x-0.34.1.tgz", - "integrity": "sha512-7s0KX2tI9mZI2buRipKIw2X1ufdTeaRgwmRabt5bi9chYfhur+/C1OXg3TKg/eag1W+6CCWLVmSauV1owmRPxA==", + "version": "0.34.3", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-s390x/-/sharp-linux-s390x-0.34.3.tgz", + "integrity": "sha512-3gahT+A6c4cdc2edhsLHmIOXMb17ltffJlxR0aC2VPZfwKoTGZec6u5GrFgdR7ciJSsHT27BD3TIuGcuRT0KmQ==", "cpu": [ "s390x" ], @@ 
-1177,13 +1199,13 @@ "url": "https://opencollective.com/libvips" }, "optionalDependencies": { - "@img/sharp-libvips-linux-s390x": "1.1.0" + "@img/sharp-libvips-linux-s390x": "1.2.0" } }, "node_modules/@img/sharp-linux-x64": { - "version": "0.34.1", - "resolved": "https://registry.npmjs.org/@img/sharp-linux-x64/-/sharp-linux-x64-0.34.1.tgz", - "integrity": "sha512-wExv7SH9nmoBW3Wr2gvQopX1k8q2g5V5Iag8Zk6AVENsjwd+3adjwxtp3Dcu2QhOXr8W9NusBU6XcQUohBZ5MA==", + "version": "0.34.3", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-x64/-/sharp-linux-x64-0.34.3.tgz", + "integrity": "sha512-8kYso8d806ypnSq3/Ly0QEw90V5ZoHh10yH0HnrzOCr6DKAPI6QVHvwleqMkVQ0m+fc7EH8ah0BB0QPuWY6zJQ==", "cpu": [ "x64" ], @@ -1199,13 +1221,13 @@ "url": "https://opencollective.com/libvips" }, "optionalDependencies": { - "@img/sharp-libvips-linux-x64": "1.1.0" + "@img/sharp-libvips-linux-x64": "1.2.0" } }, "node_modules/@img/sharp-linuxmusl-arm64": { - "version": "0.34.1", - "resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-arm64/-/sharp-linuxmusl-arm64-0.34.1.tgz", - "integrity": "sha512-DfvyxzHxw4WGdPiTF0SOHnm11Xv4aQexvqhRDAoD00MzHekAj9a/jADXeXYCDFH/DzYruwHbXU7uz+H+nWmSOQ==", + "version": "0.34.3", + "resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-arm64/-/sharp-linuxmusl-arm64-0.34.3.tgz", + "integrity": "sha512-vAjbHDlr4izEiXM1OTggpCcPg9tn4YriK5vAjowJsHwdBIdx0fYRsURkxLG2RLm9gyBq66gwtWI8Gx0/ov+JKQ==", "cpu": [ "arm64" ], @@ -1221,13 +1243,13 @@ "url": "https://opencollective.com/libvips" }, "optionalDependencies": { - "@img/sharp-libvips-linuxmusl-arm64": "1.1.0" + "@img/sharp-libvips-linuxmusl-arm64": "1.2.0" } }, "node_modules/@img/sharp-linuxmusl-x64": { - "version": "0.34.1", - "resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-x64/-/sharp-linuxmusl-x64-0.34.1.tgz", - "integrity": "sha512-pax/kTR407vNb9qaSIiWVnQplPcGU8LRIJpDT5o8PdAx5aAA7AS3X9PS8Isw1/WfqgQorPotjrZL3Pqh6C5EBg==", + "version": "0.34.3", + "resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-x64/-/sharp-linuxmusl-x64-0.34.3.tgz", + "integrity": "sha512-gCWUn9547K5bwvOn9l5XGAEjVTTRji4aPTqLzGXHvIr6bIDZKNTA34seMPgM0WmSf+RYBH411VavCejp3PkOeQ==", "cpu": [ "x64" ], @@ -1243,20 +1265,20 @@ "url": "https://opencollective.com/libvips" }, "optionalDependencies": { - "@img/sharp-libvips-linuxmusl-x64": "1.1.0" + "@img/sharp-libvips-linuxmusl-x64": "1.2.0" } }, "node_modules/@img/sharp-wasm32": { - "version": "0.34.1", - "resolved": "https://registry.npmjs.org/@img/sharp-wasm32/-/sharp-wasm32-0.34.1.tgz", - "integrity": "sha512-YDybQnYrLQfEpzGOQe7OKcyLUCML4YOXl428gOOzBgN6Gw0rv8dpsJ7PqTHxBnXnwXr8S1mYFSLSa727tpz0xg==", + "version": "0.34.3", + "resolved": "https://registry.npmjs.org/@img/sharp-wasm32/-/sharp-wasm32-0.34.3.tgz", + "integrity": "sha512-+CyRcpagHMGteySaWos8IbnXcHgfDn7pO2fiC2slJxvNq9gDipYBN42/RagzctVRKgxATmfqOSulgZv5e1RdMg==", "cpu": [ "wasm32" ], "license": "Apache-2.0 AND LGPL-3.0-or-later AND MIT", "optional": true, "dependencies": { - "@emnapi/runtime": "^1.4.0" + "@emnapi/runtime": "^1.4.4" }, "engines": { "node": "^18.17.0 || ^20.3.0 || >=21.0.0" @@ -1265,10 +1287,29 @@ "url": "https://opencollective.com/libvips" } }, + "node_modules/@img/sharp-win32-arm64": { + "version": "0.34.3", + "resolved": "https://registry.npmjs.org/@img/sharp-win32-arm64/-/sharp-win32-arm64-0.34.3.tgz", + "integrity": "sha512-MjnHPnbqMXNC2UgeLJtX4XqoVHHlZNd+nPt1kRPmj63wURegwBhZlApELdtxM2OIZDRv/DFtLcNhVbd1z8GYXQ==", + "cpu": [ + "arm64" + ], + "license": "Apache-2.0 AND LGPL-3.0-or-later", + "optional": true, + "os": [ + 
"win32" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + } + }, "node_modules/@img/sharp-win32-ia32": { - "version": "0.34.1", - "resolved": "https://registry.npmjs.org/@img/sharp-win32-ia32/-/sharp-win32-ia32-0.34.1.tgz", - "integrity": "sha512-WKf/NAZITnonBf3U1LfdjoMgNO5JYRSlhovhRhMxXVdvWYveM4kM3L8m35onYIdh75cOMCo1BexgVQcCDzyoWw==", + "version": "0.34.3", + "resolved": "https://registry.npmjs.org/@img/sharp-win32-ia32/-/sharp-win32-ia32-0.34.3.tgz", + "integrity": "sha512-xuCdhH44WxuXgOM714hn4amodJMZl3OEvf0GVTm0BEyMeA2to+8HEdRPShH0SLYptJY1uBw+SCFP9WVQi1Q/cw==", "cpu": [ "ia32" ], @@ -1285,9 +1326,9 @@ } }, "node_modules/@img/sharp-win32-x64": { - "version": "0.34.1", - "resolved": "https://registry.npmjs.org/@img/sharp-win32-x64/-/sharp-win32-x64-0.34.1.tgz", - "integrity": "sha512-hw1iIAHpNE8q3uMIRCgGOeDoz9KtFNarFLQclLxr/LK1VBkj8nby18RjFvr6aP7USRYAjTZW6yisnBWMX571Tw==", + "version": "0.34.3", + "resolved": "https://registry.npmjs.org/@img/sharp-win32-x64/-/sharp-win32-x64-0.34.3.tgz", + "integrity": "sha512-OWwz05d++TxzLEv4VnsTz5CmZ6mI6S05sfQGEMrNrQcOEERbX46332IvE7pO/EUiw7jUrrS40z/M7kPyjfl04g==", "cpu": [ "x64" ], @@ -1849,9 +1890,10 @@ } }, "node_modules/@next/env": { - "version": "15.3.3", - "resolved": "https://registry.npmjs.org/@next/env/-/env-15.3.3.tgz", - "integrity": "sha512-OdiMrzCl2Xi0VTjiQQUK0Xh7bJHnOuET2s+3V+Y40WJBAXrJeGA3f+I8MZJ/YQ3mVGi5XGR1L66oFlgqXhQ4Vw==" + "version": "15.5.3", + "resolved": "https://registry.npmjs.org/@next/env/-/env-15.5.3.tgz", + "integrity": "sha512-RSEDTRqyihYXygx/OJXwvVupfr9m04+0vH8vyy0HfZ7keRto6VX9BbEk0J2PUk0VGy6YhklJUSrgForov5F9pw==", + "license": "MIT" }, "node_modules/@next/eslint-plugin-next": { "version": "15.5.2", @@ -1864,12 +1906,13 @@ } }, "node_modules/@next/swc-darwin-arm64": { - "version": "15.3.3", - "resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-15.3.3.tgz", - "integrity": "sha512-WRJERLuH+O3oYB4yZNVahSVFmtxRNjNF1I1c34tYMoJb0Pve+7/RaLAJJizyYiFhjYNGHRAE1Ri2Fd23zgDqhg==", + "version": "15.5.3", + "resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-15.5.3.tgz", + "integrity": "sha512-nzbHQo69+au9wJkGKTU9lP7PXv0d1J5ljFpvb+LnEomLtSbJkbZyEs6sbF3plQmiOB2l9OBtN2tNSvCH1nQ9Jg==", "cpu": [ "arm64" ], + "license": "MIT", "optional": true, "os": [ "darwin" @@ -1879,12 +1922,13 @@ } }, "node_modules/@next/swc-darwin-x64": { - "version": "15.3.3", - "resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-15.3.3.tgz", - "integrity": "sha512-XHdzH/yBc55lu78k/XwtuFR/ZXUTcflpRXcsu0nKmF45U96jt1tsOZhVrn5YH+paw66zOANpOnFQ9i6/j+UYvw==", + "version": "15.5.3", + "resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-15.5.3.tgz", + "integrity": "sha512-w83w4SkOOhekJOcA5HBvHyGzgV1W/XvOfpkrxIse4uPWhYTTRwtGEM4v/jiXwNSJvfRvah0H8/uTLBKRXlef8g==", "cpu": [ "x64" ], + "license": "MIT", "optional": true, "os": [ "darwin" @@ -1894,12 +1938,13 @@ } }, "node_modules/@next/swc-linux-arm64-gnu": { - "version": "15.3.3", - "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-15.3.3.tgz", - "integrity": "sha512-VZ3sYL2LXB8znNGcjhocikEkag/8xiLgnvQts41tq6i+wql63SMS1Q6N8RVXHw5pEUjiof+II3HkDd7GFcgkzw==", + "version": "15.5.3", + "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-15.5.3.tgz", + "integrity": "sha512-+m7pfIs0/yvgVu26ieaKrifV8C8yiLe7jVp9SpcIzg7XmyyNE7toC1fy5IOQozmr6kWl/JONC51osih2RyoXRw==", "cpu": [ 
"arm64" ], + "license": "MIT", "optional": true, "os": [ "linux" @@ -1909,12 +1954,13 @@ } }, "node_modules/@next/swc-linux-arm64-musl": { - "version": "15.3.3", - "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-15.3.3.tgz", - "integrity": "sha512-h6Y1fLU4RWAp1HPNJWDYBQ+e3G7sLckyBXhmH9ajn8l/RSMnhbuPBV/fXmy3muMcVwoJdHL+UtzRzs0nXOf9SA==", + "version": "15.5.3", + "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-15.5.3.tgz", + "integrity": "sha512-u3PEIzuguSenoZviZJahNLgCexGFhso5mxWCrrIMdvpZn6lkME5vc/ADZG8UUk5K1uWRy4hqSFECrON6UKQBbQ==", "cpu": [ "arm64" ], + "license": "MIT", "optional": true, "os": [ "linux" @@ -1924,12 +1970,13 @@ } }, "node_modules/@next/swc-linux-x64-gnu": { - "version": "15.3.3", - "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-15.3.3.tgz", - "integrity": "sha512-jJ8HRiF3N8Zw6hGlytCj5BiHyG/K+fnTKVDEKvUCyiQ/0r5tgwO7OgaRiOjjRoIx2vwLR+Rz8hQoPrnmFbJdfw==", + "version": "15.5.3", + "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-15.5.3.tgz", + "integrity": "sha512-lDtOOScYDZxI2BENN9m0pfVPJDSuUkAD1YXSvlJF0DKwZt0WlA7T7o3wrcEr4Q+iHYGzEaVuZcsIbCps4K27sA==", "cpu": [ "x64" ], + "license": "MIT", "optional": true, "os": [ "linux" @@ -1939,12 +1986,13 @@ } }, "node_modules/@next/swc-linux-x64-musl": { - "version": "15.3.3", - "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-15.3.3.tgz", - "integrity": "sha512-HrUcTr4N+RgiiGn3jjeT6Oo208UT/7BuTr7K0mdKRBtTbT4v9zJqCDKO97DUqqoBK1qyzP1RwvrWTvU6EPh/Cw==", + "version": "15.5.3", + "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-15.5.3.tgz", + "integrity": "sha512-9vWVUnsx9PrY2NwdVRJ4dUURAQ8Su0sLRPqcCCxtX5zIQUBES12eRVHq6b70bbfaVaxIDGJN2afHui0eDm+cLg==", "cpu": [ "x64" ], + "license": "MIT", "optional": true, "os": [ "linux" @@ -1954,12 +2002,13 @@ } }, "node_modules/@next/swc-win32-arm64-msvc": { - "version": "15.3.3", - "resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-15.3.3.tgz", - "integrity": "sha512-SxorONgi6K7ZUysMtRF3mIeHC5aA3IQLmKFQzU0OuhuUYwpOBc1ypaLJLP5Bf3M9k53KUUUj4vTPwzGvl/NwlQ==", + "version": "15.5.3", + "resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-15.5.3.tgz", + "integrity": "sha512-1CU20FZzY9LFQigRi6jM45oJMU3KziA5/sSG+dXeVaTm661snQP6xu3ykGxxwU5sLG3sh14teO/IOEPVsQMRfA==", "cpu": [ "arm64" ], + "license": "MIT", "optional": true, "os": [ "win32" @@ -1969,12 +2018,13 @@ } }, "node_modules/@next/swc-win32-x64-msvc": { - "version": "15.3.3", - "resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-15.3.3.tgz", - "integrity": "sha512-4QZG6F8enl9/S2+yIiOiju0iCTFd93d8VC1q9LZS4p/Xuk81W2QDjCFeoogmrWWkAD59z8ZxepBQap2dKS5ruw==", + "version": "15.5.3", + "resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-15.5.3.tgz", + "integrity": "sha512-JMoLAq3n3y5tKXPQwCK5c+6tmwkuFDa2XAxz8Wm4+IVthdBZdZGh+lmiLUHg9f9IDwIQpUjp+ysd6OkYTyZRZw==", "cpu": [ "x64" ], + "license": "MIT", "optional": true, "os": [ "win32" @@ -3547,12 +3597,6 @@ "@sinonjs/commons": "^3.0.0" } }, - "node_modules/@swc/counter": { - "version": "0.1.3", - "resolved": "https://registry.npmjs.org/@swc/counter/-/counter-0.1.3.tgz", - "integrity": "sha512-e2BR4lsJkkRlKZ/qCHPw9ZaSxc0MVUd7gtbtaB7aMvHeJVYe8sOB8DBZkP2DtISHGSku9sCK6T6cnY0CtXrOCQ==", - "license": "Apache-2.0" - }, "node_modules/@swc/helpers": { 
"version": "0.5.15", "resolved": "https://registry.npmjs.org/@swc/helpers/-/helpers-0.5.15.tgz", @@ -5475,17 +5519,6 @@ "dev": true, "license": "MIT" }, - "node_modules/busboy": { - "version": "1.6.0", - "resolved": "https://registry.npmjs.org/busboy/-/busboy-1.6.0.tgz", - "integrity": "sha512-8SFQbg/0hQ9xy3UNTB0YEnsNBbWfhf7RtnzpL7TkBiTBRfrQ9Fxcnz7VJsleJpyp6rVLvXiuORqjlHi5q+PYuA==", - "dependencies": { - "streamsearch": "^1.1.0" - }, - "engines": { - "node": ">=10.16.0" - } - }, "node_modules/bytes": { "version": "3.1.2", "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz", @@ -8295,9 +8328,9 @@ } }, "node_modules/is-arrayish": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.2.tgz", - "integrity": "sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ==", + "version": "0.3.4", + "resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.4.tgz", + "integrity": "sha512-m6UrgzFVUYawGBh1dUsWR5M2Clqic9RVXC/9f8ceNlv2IcO9j9J/z8UoCLPqtsPBFNzEpfR3xftohbfqDx8EQA==", "license": "MIT", "optional": true }, @@ -11542,14 +11575,13 @@ } }, "node_modules/next": { - "version": "15.3.3", - "resolved": "https://registry.npmjs.org/next/-/next-15.3.3.tgz", - "integrity": "sha512-JqNj29hHNmCLtNvd090SyRbXJiivQ+58XjCcrC50Crb5g5u2zi7Y2YivbsEfzk6AtVI80akdOQbaMZwWB1Hthw==", + "version": "15.5.3", + "resolved": "https://registry.npmjs.org/next/-/next-15.5.3.tgz", + "integrity": "sha512-r/liNAx16SQj4D+XH/oI1dlpv9tdKJ6cONYPwwcCC46f2NjpaRWY+EKCzULfgQYV6YKXjHBchff2IZBSlZmJNw==", + "license": "MIT", "dependencies": { - "@next/env": "15.3.3", - "@swc/counter": "0.1.3", + "@next/env": "15.5.3", "@swc/helpers": "0.5.15", - "busboy": "1.6.0", "caniuse-lite": "^1.0.30001579", "postcss": "8.4.31", "styled-jsx": "5.1.6" @@ -11561,19 +11593,19 @@ "node": "^18.18.0 || ^19.8.0 || >= 20.0.0" }, "optionalDependencies": { - "@next/swc-darwin-arm64": "15.3.3", - "@next/swc-darwin-x64": "15.3.3", - "@next/swc-linux-arm64-gnu": "15.3.3", - "@next/swc-linux-arm64-musl": "15.3.3", - "@next/swc-linux-x64-gnu": "15.3.3", - "@next/swc-linux-x64-musl": "15.3.3", - "@next/swc-win32-arm64-msvc": "15.3.3", - "@next/swc-win32-x64-msvc": "15.3.3", - "sharp": "^0.34.1" + "@next/swc-darwin-arm64": "15.5.3", + "@next/swc-darwin-x64": "15.5.3", + "@next/swc-linux-arm64-gnu": "15.5.3", + "@next/swc-linux-arm64-musl": "15.5.3", + "@next/swc-linux-x64-gnu": "15.5.3", + "@next/swc-linux-x64-musl": "15.5.3", + "@next/swc-win32-arm64-msvc": "15.5.3", + "@next/swc-win32-x64-msvc": "15.5.3", + "sharp": "^0.34.3" }, "peerDependencies": { "@opentelemetry/api": "^1.1.0", - "@playwright/test": "^1.41.2", + "@playwright/test": "^1.51.1", "babel-plugin-react-compiler": "*", "react": "^18.2.0 || 19.0.0-rc-de68d2f4-20241204 || ^19.0.0", "react-dom": "^18.2.0 || 19.0.0-rc-de68d2f4-20241204 || ^19.0.0", @@ -13240,16 +13272,16 @@ "license": "ISC" }, "node_modules/sharp": { - "version": "0.34.1", - "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.34.1.tgz", - "integrity": "sha512-1j0w61+eVxu7DawFJtnfYcvSv6qPFvfTaqzTQ2BLknVhHTwGS8sc63ZBF4rzkWMBVKybo4S5OBtDdZahh2A1xg==", + "version": "0.34.3", + "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.34.3.tgz", + "integrity": "sha512-eX2IQ6nFohW4DbvHIOLRB3MHFpYqaqvXd3Tp5e/T/dSH83fxaNJQRvDMhASmkNTsNTVF2/OOopzRCt7xokgPfg==", "hasInstallScript": true, "license": "Apache-2.0", "optional": true, "dependencies": { "color": "^4.2.3", - "detect-libc": "^2.0.3", - "semver": "^7.7.1" + "detect-libc": "^2.0.4", + 
"semver": "^7.7.2" }, "engines": { "node": "^18.17.0 || ^20.3.0 || >=21.0.0" @@ -13258,26 +13290,28 @@ "url": "https://opencollective.com/libvips" }, "optionalDependencies": { - "@img/sharp-darwin-arm64": "0.34.1", - "@img/sharp-darwin-x64": "0.34.1", - "@img/sharp-libvips-darwin-arm64": "1.1.0", - "@img/sharp-libvips-darwin-x64": "1.1.0", - "@img/sharp-libvips-linux-arm": "1.1.0", - "@img/sharp-libvips-linux-arm64": "1.1.0", - "@img/sharp-libvips-linux-ppc64": "1.1.0", - "@img/sharp-libvips-linux-s390x": "1.1.0", - "@img/sharp-libvips-linux-x64": "1.1.0", - "@img/sharp-libvips-linuxmusl-arm64": "1.1.0", - "@img/sharp-libvips-linuxmusl-x64": "1.1.0", - "@img/sharp-linux-arm": "0.34.1", - "@img/sharp-linux-arm64": "0.34.1", - "@img/sharp-linux-s390x": "0.34.1", - "@img/sharp-linux-x64": "0.34.1", - "@img/sharp-linuxmusl-arm64": "0.34.1", - "@img/sharp-linuxmusl-x64": "0.34.1", - "@img/sharp-wasm32": "0.34.1", - "@img/sharp-win32-ia32": "0.34.1", - "@img/sharp-win32-x64": "0.34.1" + "@img/sharp-darwin-arm64": "0.34.3", + "@img/sharp-darwin-x64": "0.34.3", + "@img/sharp-libvips-darwin-arm64": "1.2.0", + "@img/sharp-libvips-darwin-x64": "1.2.0", + "@img/sharp-libvips-linux-arm": "1.2.0", + "@img/sharp-libvips-linux-arm64": "1.2.0", + "@img/sharp-libvips-linux-ppc64": "1.2.0", + "@img/sharp-libvips-linux-s390x": "1.2.0", + "@img/sharp-libvips-linux-x64": "1.2.0", + "@img/sharp-libvips-linuxmusl-arm64": "1.2.0", + "@img/sharp-libvips-linuxmusl-x64": "1.2.0", + "@img/sharp-linux-arm": "0.34.3", + "@img/sharp-linux-arm64": "0.34.3", + "@img/sharp-linux-ppc64": "0.34.3", + "@img/sharp-linux-s390x": "0.34.3", + "@img/sharp-linux-x64": "0.34.3", + "@img/sharp-linuxmusl-arm64": "0.34.3", + "@img/sharp-linuxmusl-x64": "0.34.3", + "@img/sharp-wasm32": "0.34.3", + "@img/sharp-win32-arm64": "0.34.3", + "@img/sharp-win32-ia32": "0.34.3", + "@img/sharp-win32-x64": "0.34.3" } }, "node_modules/shebang-command": { @@ -13403,9 +13437,9 @@ "license": "ISC" }, "node_modules/simple-swizzle": { - "version": "0.2.2", - "resolved": "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.2.tgz", - "integrity": "sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==", + "version": "0.2.4", + "resolved": "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.4.tgz", + "integrity": "sha512-nAu1WFPQSMNr2Zn9PGSZK9AGn4t/y97lEm+MXTtUDwfP0ksAIX4nO+6ruD9Jwut4C49SB1Ws+fbXsm/yScWOHw==", "license": "MIT", "optional": true, "dependencies": { @@ -13526,14 +13560,6 @@ "node": ">= 0.8" } }, - "node_modules/streamsearch": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/streamsearch/-/streamsearch-1.1.0.tgz", - "integrity": "sha512-Mcc5wHehp9aXz1ax6bZUyY5afg9u2rv5cqQI3mRrYkGC8rW2hM02jWuwjtL++LS5qinSyhj2QfLyNsuc+VsExg==", - "engines": { - "node": ">=10.0.0" - } - }, "node_modules/string-length": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/string-length/-/string-length-4.0.2.tgz", diff --git a/llama_stack/ui/package.json b/llama_stack/ui/package.json index e50401fa6..a0a8b2c7c 100644 --- a/llama_stack/ui/package.json +++ b/llama_stack/ui/package.json @@ -25,7 +25,7 @@ "framer-motion": "^12.23.12", "llama-stack-client": "^0.2.21", "lucide-react": "^0.542.0", - "next": "15.3.3", + "next": "15.5.3", "next-auth": "^4.24.11", "next-themes": "^0.4.6", "react": "^19.0.0", From b6cb8178976b941a1fdb3894b00bd13eaca91561 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 15 Sep 2025 09:46:14 
+0200 Subject: [PATCH 29/30] chore(ui-deps): bump @radix-ui/react-select from 2.2.5 to 2.2.6 in /llama_stack/ui (#3437) Bumps [@radix-ui/react-select](https://github.com/radix-ui/primitives) from 2.2.5 to 2.2.6.
[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=@radix-ui/react-select&package-manager=npm_and_yarn&previous-version=2.2.5&new-version=2.2.6)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. ---
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- llama_stack/ui/package-lock.json | 77 ++++++++++++++------------------ llama_stack/ui/package.json | 2 +- 2 files changed, 34 insertions(+), 45 deletions(-) diff --git a/llama_stack/ui/package-lock.json b/llama_stack/ui/package-lock.json index ff73fa2e8..f333aa809 100644 --- a/llama_stack/ui/package-lock.json +++ b/llama_stack/ui/package-lock.json @@ -11,7 +11,7 @@ "@radix-ui/react-collapsible": "^1.1.12", "@radix-ui/react-dialog": "^1.1.13", "@radix-ui/react-dropdown-menu": "^2.1.16", - "@radix-ui/react-select": "^2.2.5", + "@radix-ui/react-select": "^2.2.6", "@radix-ui/react-separator": "^1.1.7", "@radix-ui/react-slot": "^1.2.3", "@radix-ui/react-tooltip": "^1.2.8", @@ -2924,22 +2924,22 @@ } }, "node_modules/@radix-ui/react-select": { - "version": "2.2.5", - "resolved": "https://registry.npmjs.org/@radix-ui/react-select/-/react-select-2.2.5.tgz", - "integrity": "sha512-HnMTdXEVuuyzx63ME0ut4+sEMYW6oouHWNGUZc7ddvUWIcfCva/AMoqEW/3wnEllriMWBa0RHspCYnfCWJQYmA==", + "version": "2.2.6", + "resolved": "https://registry.npmjs.org/@radix-ui/react-select/-/react-select-2.2.6.tgz", + "integrity": "sha512-I30RydO+bnn2PQztvo25tswPH+wFBjehVGtmagkU78yMdwTwVf12wnAOF+AeP8S2N8xD+5UPbGhkUfPyvT+mwQ==", "license": "MIT", "dependencies": { "@radix-ui/number": "1.1.1", - "@radix-ui/primitive": "1.1.2", + "@radix-ui/primitive": "1.1.3", "@radix-ui/react-collection": "1.1.7", "@radix-ui/react-compose-refs": "1.1.2", "@radix-ui/react-context": "1.1.2", "@radix-ui/react-direction": "1.1.1", - "@radix-ui/react-dismissable-layer": "1.1.10", - "@radix-ui/react-focus-guards": "1.1.2", + "@radix-ui/react-dismissable-layer": "1.1.11", + "@radix-ui/react-focus-guards": "1.1.3", "@radix-ui/react-focus-scope": "1.1.7", "@radix-ui/react-id": "1.1.1", - "@radix-ui/react-popper": "1.2.7", + "@radix-ui/react-popper": "1.2.8", "@radix-ui/react-portal": "1.1.9", "@radix-ui/react-primitive": "2.1.3", "@radix-ui/react-slot": "1.2.3", @@ -2966,13 +2966,19 @@ } } }, + "node_modules/@radix-ui/react-select/node_modules/@radix-ui/primitive": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/@radix-ui/primitive/-/primitive-1.1.3.tgz", + "integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==", + "license": "MIT" + }, "node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-dismissable-layer": { - "version": "1.1.10", - "resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.10.tgz", - "integrity": "sha512-IM1zzRV4W3HtVgftdQiiOmA0AdJlCtMLe00FXaHwgt3rAnNsIyDqshvkIW3hj/iu5hu8ERP7KIYki6NkqDxAwQ==", + "version": "1.1.11", + "resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.11.tgz", + "integrity": "sha512-Nqcp+t5cTB8BinFkZgXiMJniQH0PsUt2k51FUhbdfeKvc4ACcG2uQniY/8+h1Yv6Kza4Q7lD7PQV0z0oicE0Mg==", "license": "MIT", "dependencies": { - "@radix-ui/primitive": "1.1.2", + "@radix-ui/primitive": "1.1.3", "@radix-ui/react-compose-refs": "1.1.2", "@radix-ui/react-primitive": "2.1.3", "@radix-ui/react-use-callback-ref": "1.1.1", @@ -2993,6 +2999,21 @@ } } }, + "node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-focus-guards": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/@radix-ui/react-focus-guards/-/react-focus-guards-1.1.3.tgz", + "integrity": 
"sha512-0rFg/Rj2Q62NCm62jZw0QX7a3sz6QCQU0LpZdNrJX8byRGaGVTqbrW9jAoIAHyMQqsNpeZ81YgSizOt5WXq0Pw==", + "license": "MIT", + "peerDependencies": { + "@types/react": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + } + } + }, "node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-focus-scope": { "version": "1.1.7", "resolved": "https://registry.npmjs.org/@radix-ui/react-focus-scope/-/react-focus-scope-1.1.7.tgz", @@ -3018,38 +3039,6 @@ } } }, - "node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-popper": { - "version": "1.2.7", - "resolved": "https://registry.npmjs.org/@radix-ui/react-popper/-/react-popper-1.2.7.tgz", - "integrity": "sha512-IUFAccz1JyKcf/RjB552PlWwxjeCJB8/4KxT7EhBHOJM+mN7LdW+B3kacJXILm32xawcMMjb2i0cIZpo+f9kiQ==", - "license": "MIT", - "dependencies": { - "@floating-ui/react-dom": "^2.0.0", - "@radix-ui/react-arrow": "1.1.7", - "@radix-ui/react-compose-refs": "1.1.2", - "@radix-ui/react-context": "1.1.2", - "@radix-ui/react-primitive": "2.1.3", - "@radix-ui/react-use-callback-ref": "1.1.1", - "@radix-ui/react-use-layout-effect": "1.1.1", - "@radix-ui/react-use-rect": "1.1.1", - "@radix-ui/react-use-size": "1.1.1", - "@radix-ui/rect": "1.1.1" - }, - "peerDependencies": { - "@types/react": "*", - "@types/react-dom": "*", - "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", - "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" - }, - "peerDependenciesMeta": { - "@types/react": { - "optional": true - }, - "@types/react-dom": { - "optional": true - } - } - }, "node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-portal": { "version": "1.1.9", "resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.9.tgz", diff --git a/llama_stack/ui/package.json b/llama_stack/ui/package.json index a0a8b2c7c..ccbc2a4c2 100644 --- a/llama_stack/ui/package.json +++ b/llama_stack/ui/package.json @@ -16,7 +16,7 @@ "@radix-ui/react-collapsible": "^1.1.12", "@radix-ui/react-dialog": "^1.1.13", "@radix-ui/react-dropdown-menu": "^2.1.16", - "@radix-ui/react-select": "^2.2.5", + "@radix-ui/react-select": "^2.2.6", "@radix-ui/react-separator": "^1.1.7", "@radix-ui/react-slot": "^1.2.3", "@radix-ui/react-tooltip": "^1.2.8", From 01bdcce4d2218754acfe960de58598bc50e32d21 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Mon, 15 Sep 2025 15:25:53 -0400 Subject: [PATCH 30/30] chore(recorder): update mocks to be closer to non-mock environment (#3442) # What does this PR do? the @required_args decorator in openai-python is masking the async nature of the {AsyncCompletions,chat.AsyncCompletions}.create method. see https://github.com/openai/openai-python/issues/996 this means two things - 0. we cannot use iscoroutine in the recorder to detect async vs non 1. our mocks are inappropriately introducing identifiable async for (0), we update the iscoroutine check w/ detection of /v1/models, which is the only non-async function we mock & record. for (1), we could leave everything as is and assume (0) will catch errors. to be defensive, we update the unit tests to mock below create methods, allowing the true openai-python create() methods to be tested. 
--- llama_stack/testing/inference_recorder.py | 14 +- .../distribution/test_inference_recordings.py | 208 +++++++++--------- 2 files changed, 113 insertions(+), 109 deletions(-) diff --git a/llama_stack/testing/inference_recorder.py b/llama_stack/testing/inference_recorder.py index f899d73d3..674016fb1 100644 --- a/llama_stack/testing/inference_recorder.py +++ b/llama_stack/testing/inference_recorder.py @@ -7,7 +7,6 @@ from __future__ import annotations # for forward references import hashlib -import inspect import json import os from collections.abc import Generator @@ -243,11 +242,10 @@ async def _patched_inference_method(original_method, self, client_type, endpoint global _current_mode, _current_storage if _current_mode == InferenceMode.LIVE or _current_storage is None: - # Normal operation - if inspect.iscoroutinefunction(original_method): - return await original_method(self, *args, **kwargs) - else: + if endpoint == "/v1/models": return original_method(self, *args, **kwargs) + else: + return await original_method(self, *args, **kwargs) # Get base URL based on client type if client_type == "openai": @@ -298,10 +296,10 @@ async def _patched_inference_method(original_method, self, client_type, endpoint ) elif _current_mode == InferenceMode.RECORD: - if inspect.iscoroutinefunction(original_method): - response = await original_method(self, *args, **kwargs) - else: + if endpoint == "/v1/models": response = original_method(self, *args, **kwargs) + else: + response = await original_method(self, *args, **kwargs) # we want to store the result of the iterator, not the iterator itself if endpoint == "/v1/models": diff --git a/tests/unit/distribution/test_inference_recordings.py b/tests/unit/distribution/test_inference_recordings.py index 4909bbe1e..5740357c1 100644 --- a/tests/unit/distribution/test_inference_recordings.py +++ b/tests/unit/distribution/test_inference_recordings.py @@ -155,27 +155,22 @@ class TestInferenceRecording: async def test_recording_mode(self, temp_storage_dir, real_openai_chat_response): """Test that recording mode captures and stores responses.""" - - async def mock_create(*args, **kwargs): - return real_openai_chat_response - temp_storage_dir = temp_storage_dir / "test_recording_mode" - with patch( - "openai.resources.chat.completions.AsyncCompletions.create", new_callable=AsyncMock, side_effect=mock_create - ): - with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): - client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): + client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + client.chat.completions._post = AsyncMock(return_value=real_openai_chat_response) - response = await client.chat.completions.create( - model="llama3.2:3b", - messages=[{"role": "user", "content": "Hello, how are you?"}], - temperature=0.7, - max_tokens=50, - user=NOT_GIVEN, - ) + response = await client.chat.completions.create( + model="llama3.2:3b", + messages=[{"role": "user", "content": "Hello, how are you?"}], + temperature=0.7, + max_tokens=50, + user=NOT_GIVEN, + ) - # Verify the response was returned correctly - assert response.choices[0].message.content == "Hello! I'm doing well, thank you for asking." + # Verify the response was returned correctly + assert response.choices[0].message.content == "Hello! I'm doing well, thank you for asking." 
+ client.chat.completions._post.assert_called_once() # Verify recording was stored storage = ResponseStorage(temp_storage_dir) @@ -183,43 +178,38 @@ class TestInferenceRecording: async def test_replay_mode(self, temp_storage_dir, real_openai_chat_response): """Test that replay mode returns stored responses without making real calls.""" - - async def mock_create(*args, **kwargs): - return real_openai_chat_response - temp_storage_dir = temp_storage_dir / "test_replay_mode" # First, record a response - with patch( - "openai.resources.chat.completions.AsyncCompletions.create", new_callable=AsyncMock, side_effect=mock_create - ): - with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): - client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): + client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + client.chat.completions._post = AsyncMock(return_value=real_openai_chat_response) - response = await client.chat.completions.create( - model="llama3.2:3b", - messages=[{"role": "user", "content": "Hello, how are you?"}], - temperature=0.7, - max_tokens=50, - user=NOT_GIVEN, - ) + response = await client.chat.completions.create( + model="llama3.2:3b", + messages=[{"role": "user", "content": "Hello, how are you?"}], + temperature=0.7, + max_tokens=50, + user=NOT_GIVEN, + ) + client.chat.completions._post.assert_called_once() # Now test replay mode - should not call the original method - with patch("openai.resources.chat.completions.AsyncCompletions.create") as mock_create_patch: - with inference_recording(mode=InferenceMode.REPLAY, storage_dir=str(temp_storage_dir)): - client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + with inference_recording(mode=InferenceMode.REPLAY, storage_dir=str(temp_storage_dir)): + client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + client.chat.completions._post = AsyncMock(return_value=real_openai_chat_response) - response = await client.chat.completions.create( - model="llama3.2:3b", - messages=[{"role": "user", "content": "Hello, how are you?"}], - temperature=0.7, - max_tokens=50, - ) + response = await client.chat.completions.create( + model="llama3.2:3b", + messages=[{"role": "user", "content": "Hello, how are you?"}], + temperature=0.7, + max_tokens=50, + ) - # Verify we got the recorded response - assert response.choices[0].message.content == "Hello! I'm doing well, thank you for asking." + # Verify we got the recorded response + assert response.choices[0].message.content == "Hello! I'm doing well, thank you for asking." 
- # Verify the original method was NOT called - mock_create_patch.assert_not_called() + # Verify the original method was NOT called + client.chat.completions._post.assert_not_called() async def test_replay_mode_models(self, temp_storage_dir): """Test that replay mode returns stored responses without making real model listing calls.""" @@ -272,43 +262,50 @@ class TestInferenceRecording: async def test_embeddings_recording(self, temp_storage_dir, real_embeddings_response): """Test recording and replay of embeddings calls.""" - async def mock_create(*args, **kwargs): - return real_embeddings_response + # baseline - mock works without recording + client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + client.embeddings._post = AsyncMock(return_value=real_embeddings_response) + response = await client.embeddings.create( + model=real_embeddings_response.model, + input=["Hello world", "Test embedding"], + encoding_format=NOT_GIVEN, + ) + assert len(response.data) == 2 + assert response.data[0].embedding == [0.1, 0.2, 0.3] + client.embeddings._post.assert_called_once() temp_storage_dir = temp_storage_dir / "test_embeddings_recording" # Record - with patch( - "openai.resources.embeddings.AsyncEmbeddings.create", new_callable=AsyncMock, side_effect=mock_create - ): - with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): - client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): + client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + client.embeddings._post = AsyncMock(return_value=real_embeddings_response) - response = await client.embeddings.create( - model=real_embeddings_response.model, - input=["Hello world", "Test embedding"], - encoding_format=NOT_GIVEN, - dimensions=NOT_GIVEN, - user=NOT_GIVEN, - ) + response = await client.embeddings.create( + model=real_embeddings_response.model, + input=["Hello world", "Test embedding"], + encoding_format=NOT_GIVEN, + dimensions=NOT_GIVEN, + user=NOT_GIVEN, + ) - assert len(response.data) == 2 + assert len(response.data) == 2 # Replay - with patch("openai.resources.embeddings.AsyncEmbeddings.create") as mock_create_patch: - with inference_recording(mode=InferenceMode.REPLAY, storage_dir=str(temp_storage_dir)): - client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + with inference_recording(mode=InferenceMode.REPLAY, storage_dir=str(temp_storage_dir)): + client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + client.embeddings._post = AsyncMock(return_value=real_embeddings_response) - response = await client.embeddings.create( - model=real_embeddings_response.model, - input=["Hello world", "Test embedding"], - ) + response = await client.embeddings.create( + model=real_embeddings_response.model, + input=["Hello world", "Test embedding"], + ) - # Verify we got the recorded response - assert len(response.data) == 2 - assert response.data[0].embedding == [0.1, 0.2, 0.3] + # Verify we got the recorded response + assert len(response.data) == 2 + assert response.data[0].embedding == [0.1, 0.2, 0.3] - # Verify original method was not called - mock_create_patch.assert_not_called() + # Verify original method was not called + client.embeddings._post.assert_not_called() async def test_completions_recording(self, temp_storage_dir): real_completions_response = OpenAICompletion( @@ -326,40 +323,49 @@ class TestInferenceRecording: ], ) - async 
def mock_create(*args, **kwargs): - return real_completions_response - temp_storage_dir = temp_storage_dir / "test_completions_recording" + # baseline - mock works without recording + client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + client.completions._post = AsyncMock(return_value=real_completions_response) + response = await client.completions.create( + model=real_completions_response.model, + prompt="Hello, how are you?", + temperature=0.7, + max_tokens=50, + user=NOT_GIVEN, + ) + assert response.choices[0].text == real_completions_response.choices[0].text + client.completions._post.assert_called_once() + # Record - with patch( - "openai.resources.completions.AsyncCompletions.create", new_callable=AsyncMock, side_effect=mock_create - ): - with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): - client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): + client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + client.completions._post = AsyncMock(return_value=real_completions_response) - response = await client.completions.create( - model=real_completions_response.model, - prompt="Hello, how are you?", - temperature=0.7, - max_tokens=50, - user=NOT_GIVEN, - ) + response = await client.completions.create( + model=real_completions_response.model, + prompt="Hello, how are you?", + temperature=0.7, + max_tokens=50, + user=NOT_GIVEN, + ) - assert response.choices[0].text == real_completions_response.choices[0].text + assert response.choices[0].text == real_completions_response.choices[0].text + client.completions._post.assert_called_once() # Replay - with patch("openai.resources.completions.AsyncCompletions.create") as mock_create_patch: - with inference_recording(mode=InferenceMode.REPLAY, storage_dir=str(temp_storage_dir)): - client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") - response = await client.completions.create( - model=real_completions_response.model, - prompt="Hello, how are you?", - temperature=0.7, - max_tokens=50, - ) - assert response.choices[0].text == real_completions_response.choices[0].text - mock_create_patch.assert_not_called() + with inference_recording(mode=InferenceMode.REPLAY, storage_dir=str(temp_storage_dir)): + client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + client.completions._post = AsyncMock(return_value=real_completions_response) + response = await client.completions.create( + model=real_completions_response.model, + prompt="Hello, how are you?", + temperature=0.7, + max_tokens=50, + ) + assert response.choices[0].text == real_completions_response.choices[0].text + client.completions._post.assert_not_called() async def test_live_mode(self, real_openai_chat_response): """Test that live mode passes through to original methods."""