From b4974d411d3a2557ad679822c2372e355cd49532 Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Fri, 19 Sep 2025 14:59:30 -0700
Subject: [PATCH 01/33] chore: simplify authorized sqlstore

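# What does this PR do?

Moves the access-control policy from a per-call `policy=` argument of
`AuthorizedSqlStore.fetch_all` / `fetch_one` to a constructor argument that is
stored as `self.policy`. The localfs and S3 files providers, `InferenceStore`
and `ResponsesStore` now pass their policy when constructing the store and no
longer pass it on each fetch.

## Test Plan

The unit and integration tests for `AuthorizedSqlStore` are updated to build
the store with a policy; the owner-only integration case constructs a second
store over the same backing `sql_store`.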
---
 .../providers/inline/files/localfs/files.py   |  5 ++---
 .../providers/remote/files/s3/files.py        |  5 ++---
 .../utils/inference/inference_store.py        |  4 +---
 .../utils/responses/responses_store.py        |  7 ++-----
 .../utils/sqlstore/authorized_sqlstore.py     | 11 +++++------
 .../sqlstore/test_authorized_sqlstore.py      | 19 +++++++++++--------
 tests/unit/utils/test_authorized_sqlstore.py  | 18 +++++++++---------
 7 files changed, 32 insertions(+), 37 deletions(-)

diff --git a/llama_stack/providers/inline/files/localfs/files.py b/llama_stack/providers/inline/files/localfs/files.py
index 9c610c1ba..65cf8d815 100644
--- a/llama_stack/providers/inline/files/localfs/files.py
+++ b/llama_stack/providers/inline/files/localfs/files.py
@@ -44,7 +44,7 @@ class LocalfsFilesImpl(Files):
         storage_path.mkdir(parents=True, exist_ok=True)
 
         # Initialize SQL store for metadata
-        self.sql_store = AuthorizedSqlStore(sqlstore_impl(self.config.metadata_store))
+        self.sql_store = AuthorizedSqlStore(sqlstore_impl(self.config.metadata_store), self.policy)
         await self.sql_store.create_table(
             "openai_files",
             {
@@ -74,7 +74,7 @@ class LocalfsFilesImpl(Files):
         if not self.sql_store:
             raise RuntimeError("Files provider not initialized")
 
-        row = await self.sql_store.fetch_one("openai_files", policy=self.policy, where={"id": file_id})
+        row = await self.sql_store.fetch_one("openai_files", where={"id": file_id})
         if not row:
             raise ResourceNotFoundError(file_id, "File", "client.files.list()")
 
@@ -150,7 +150,6 @@ class LocalfsFilesImpl(Files):
 
         paginated_result = await self.sql_store.fetch_all(
             table="openai_files",
-            policy=self.policy,
             where=where_conditions if where_conditions else None,
             order_by=[("created_at", order.value)],
             cursor=("id", after) if after else None,
diff --git a/llama_stack/providers/remote/files/s3/files.py b/llama_stack/providers/remote/files/s3/files.py
index 54742d900..8ea96af9e 100644
--- a/llama_stack/providers/remote/files/s3/files.py
+++ b/llama_stack/providers/remote/files/s3/files.py
@@ -137,7 +137,7 @@ class S3FilesImpl(Files):
         where: dict[str, str | dict] = {"id": file_id}
         if not return_expired:
             where["expires_at"] = {">": self._now()}
-        if not (row := await self.sql_store.fetch_one("openai_files", policy=self.policy, where=where)):
+        if not (row := await self.sql_store.fetch_one("openai_files", where=where)):
             raise ResourceNotFoundError(file_id, "File", "files.list()")
         return row
 
@@ -164,7 +164,7 @@ class S3FilesImpl(Files):
         self._client = _create_s3_client(self._config)
         await _create_bucket_if_not_exists(self._client, self._config)
 
-        self._sql_store = AuthorizedSqlStore(sqlstore_impl(self._config.metadata_store))
+        self._sql_store = AuthorizedSqlStore(sqlstore_impl(self._config.metadata_store), self.policy)
         await self._sql_store.create_table(
             "openai_files",
             {
@@ -268,7 +268,6 @@ class S3FilesImpl(Files):
 
         paginated_result = await self.sql_store.fetch_all(
             table="openai_files",
-            policy=self.policy,
             where=where_conditions,
             order_by=[("created_at", order.value)],
             cursor=("id", after) if after else None,
diff --git a/llama_stack/providers/utils/inference/inference_store.py b/llama_stack/providers/utils/inference/inference_store.py
index 17f4c6268..ffc9f3e11 100644
--- a/llama_stack/providers/utils/inference/inference_store.py
+++ b/llama_stack/providers/utils/inference/inference_store.py
@@ -54,7 +54,7 @@ class InferenceStore:
 
     async def initialize(self):
         """Create the necessary tables if they don't exist."""
-        self.sql_store = AuthorizedSqlStore(sqlstore_impl(self.sql_store_config))
+        self.sql_store = AuthorizedSqlStore(sqlstore_impl(self.sql_store_config), self.policy)
         await self.sql_store.create_table(
             "chat_completions",
             {
@@ -202,7 +202,6 @@ class InferenceStore:
             order_by=[("created", order.value)],
             cursor=("id", after) if after else None,
             limit=limit,
-            policy=self.policy,
         )
 
         data = [
@@ -229,7 +228,6 @@ class InferenceStore:
         row = await self.sql_store.fetch_one(
             table="chat_completions",
             where={"id": completion_id},
-            policy=self.policy,
         )
 
         if not row:
diff --git a/llama_stack/providers/utils/responses/responses_store.py b/llama_stack/providers/utils/responses/responses_store.py
index 04778ed1c..829cd8a62 100644
--- a/llama_stack/providers/utils/responses/responses_store.py
+++ b/llama_stack/providers/utils/responses/responses_store.py
@@ -28,8 +28,7 @@ class ResponsesStore:
             sql_store_config = SqliteSqlStoreConfig(
                 db_path=(RUNTIME_BASE_DIR / "sqlstore.db").as_posix(),
             )
-        self.sql_store = AuthorizedSqlStore(sqlstore_impl(sql_store_config))
-        self.policy = policy
+        self.sql_store = AuthorizedSqlStore(sqlstore_impl(sql_store_config), policy)
 
     async def initialize(self):
         """Create the necessary tables if they don't exist."""
@@ -87,7 +86,6 @@ class ResponsesStore:
             order_by=[("created_at", order.value)],
             cursor=("id", after) if after else None,
             limit=limit,
-            policy=self.policy,
         )
 
         data = [OpenAIResponseObjectWithInput(**row["response_object"]) for row in paginated_result.data]
@@ -105,7 +103,6 @@ class ResponsesStore:
         row = await self.sql_store.fetch_one(
             "openai_responses",
             where={"id": response_id},
-            policy=self.policy,
         )
 
         if not row:
@@ -116,7 +113,7 @@ class ResponsesStore:
         return OpenAIResponseObjectWithInput(**row["response_object"])
 
     async def delete_response_object(self, response_id: str) -> OpenAIDeleteResponseObject:
-        row = await self.sql_store.fetch_one("openai_responses", where={"id": response_id}, policy=self.policy)
+        row = await self.sql_store.fetch_one("openai_responses", where={"id": response_id})
         if not row:
             raise ValueError(f"Response with id {response_id} not found")
         await self.sql_store.delete("openai_responses", where={"id": response_id})
diff --git a/llama_stack/providers/utils/sqlstore/authorized_sqlstore.py b/llama_stack/providers/utils/sqlstore/authorized_sqlstore.py
index acb688f96..ab67f7052 100644
--- a/llama_stack/providers/utils/sqlstore/authorized_sqlstore.py
+++ b/llama_stack/providers/utils/sqlstore/authorized_sqlstore.py
@@ -53,13 +53,15 @@ class AuthorizedSqlStore:
     access control policies, user attribute capture, and SQL filtering optimization.
     """
 
-    def __init__(self, sql_store: SqlStore):
+    def __init__(self, sql_store: SqlStore, policy: list[AccessRule]):
         """
         Initialize the authorization layer.
 
         :param sql_store: Base SqlStore implementation to wrap
+        :param policy: Access control policy to use for authorization
         """
         self.sql_store = sql_store
+        self.policy = policy
         self._detect_database_type()
         self._validate_sql_optimized_policy()
 
@@ -117,14 +119,13 @@ class AuthorizedSqlStore:
     async def fetch_all(
         self,
         table: str,
-        policy: list[AccessRule],
         where: Mapping[str, Any] | None = None,
         limit: int | None = None,
         order_by: list[tuple[str, Literal["asc", "desc"]]] | None = None,
         cursor: tuple[str, str] | None = None,
     ) -> PaginatedResponse:
         """Fetch all rows with automatic access control filtering."""
-        access_where = self._build_access_control_where_clause(policy)
+        access_where = self._build_access_control_where_clause(self.policy)
         rows = await self.sql_store.fetch_all(
             table=table,
             where=where,
@@ -146,7 +147,7 @@ class AuthorizedSqlStore:
                 str(record_id), table, User(principal=stored_owner_principal, attributes=stored_access_attrs)
             )
 
-            if is_action_allowed(policy, Action.READ, sql_record, current_user):
+            if is_action_allowed(self.policy, Action.READ, sql_record, current_user):
                 filtered_rows.append(row)
 
         return PaginatedResponse(
@@ -157,14 +158,12 @@ class AuthorizedSqlStore:
     async def fetch_one(
         self,
         table: str,
-        policy: list[AccessRule],
         where: Mapping[str, Any] | None = None,
         order_by: list[tuple[str, Literal["asc", "desc"]]] | None = None,
     ) -> dict[str, Any] | None:
         """Fetch one row with automatic access control checking."""
         results = await self.fetch_all(
             table=table,
-            policy=policy,
             where=where,
             limit=1,
             order_by=order_by,
diff --git a/tests/integration/providers/utils/sqlstore/test_authorized_sqlstore.py b/tests/integration/providers/utils/sqlstore/test_authorized_sqlstore.py
index 4002f2e1f..98bef0f2c 100644
--- a/tests/integration/providers/utils/sqlstore/test_authorized_sqlstore.py
+++ b/tests/integration/providers/utils/sqlstore/test_authorized_sqlstore.py
@@ -57,7 +57,7 @@ def authorized_store(backend_config):
     config = config_func()
 
     base_sqlstore = sqlstore_impl(config)
-    authorized_store = AuthorizedSqlStore(base_sqlstore)
+    authorized_store = AuthorizedSqlStore(base_sqlstore, default_policy())
 
     yield authorized_store
 
@@ -106,7 +106,7 @@ async def test_authorized_store_attributes(mock_get_authenticated_user, authoriz
         await authorized_store.insert(table_name, {"id": "1", "data": "public_data"})
 
         # Test fetching with no user - should not error on JSON comparison
-        result = await authorized_store.fetch_all(table_name, policy=default_policy())
+        result = await authorized_store.fetch_all(table_name)
         assert len(result.data) == 1
         assert result.data[0]["id"] == "1"
         assert result.data[0]["access_attributes"] is None
@@ -119,7 +119,7 @@ async def test_authorized_store_attributes(mock_get_authenticated_user, authoriz
         await authorized_store.insert(table_name, {"id": "2", "data": "admin_data"})
 
         # Fetch all - admin should see both
-        result = await authorized_store.fetch_all(table_name, policy=default_policy())
+        result = await authorized_store.fetch_all(table_name)
         assert len(result.data) == 2
 
         # Test with non-admin user
@@ -127,7 +127,7 @@ async def test_authorized_store_attributes(mock_get_authenticated_user, authoriz
         mock_get_authenticated_user.return_value = regular_user
 
         # Should only see public record
-        result = await authorized_store.fetch_all(table_name, policy=default_policy())
+        result = await authorized_store.fetch_all(table_name)
         assert len(result.data) == 1
         assert result.data[0]["id"] == "1"
 
@@ -156,7 +156,7 @@ async def test_authorized_store_attributes(mock_get_authenticated_user, authoriz
 
         # Now test with the multi-user who has both roles=admin and teams=dev
         mock_get_authenticated_user.return_value = multi_user
-        result = await authorized_store.fetch_all(table_name, policy=default_policy())
+        result = await authorized_store.fetch_all(table_name)
 
         # Should see:
         # - public record (1) - no access_attributes
@@ -217,21 +217,24 @@ async def test_user_ownership_policy(mock_get_authenticated_user, authorized_sto
             ),
         ]
 
+        # Create a new authorized store with the owner-only policy
+        owner_only_store = AuthorizedSqlStore(authorized_store.sql_store, owner_only_policy)
+
         # Test user1 access - should only see their own record
         mock_get_authenticated_user.return_value = user1
-        result = await authorized_store.fetch_all(table_name, policy=owner_only_policy)
+        result = await owner_only_store.fetch_all(table_name)
         assert len(result.data) == 1, f"Expected user1 to see 1 record, got {len(result.data)}"
         assert result.data[0]["id"] == "1", f"Expected user1's record, got {result.data[0]['id']}"
 
         # Test user2 access - should only see their own record
         mock_get_authenticated_user.return_value = user2
-        result = await authorized_store.fetch_all(table_name, policy=owner_only_policy)
+        result = await owner_only_store.fetch_all(table_name)
         assert len(result.data) == 1, f"Expected user2 to see 1 record, got {len(result.data)}"
         assert result.data[0]["id"] == "2", f"Expected user2's record, got {result.data[0]['id']}"
 
         # Test with anonymous user - should see no records
         mock_get_authenticated_user.return_value = None
-        result = await authorized_store.fetch_all(table_name, policy=owner_only_policy)
+        result = await owner_only_store.fetch_all(table_name)
         assert len(result.data) == 0, f"Expected anonymous user to see 0 records, got {len(result.data)}"
 
     finally:
diff --git a/tests/unit/utils/test_authorized_sqlstore.py b/tests/unit/utils/test_authorized_sqlstore.py
index 90eb706e4..d85e784a9 100644
--- a/tests/unit/utils/test_authorized_sqlstore.py
+++ b/tests/unit/utils/test_authorized_sqlstore.py
@@ -26,7 +26,7 @@ async def test_authorized_fetch_with_where_sql_access_control(mock_get_authentic
                 db_path=tmp_dir + "/" + db_name,
             )
         )
-        sqlstore = AuthorizedSqlStore(base_sqlstore)
+        sqlstore = AuthorizedSqlStore(base_sqlstore, default_policy())
 
         # Create table with access control
         await sqlstore.create_table(
@@ -56,24 +56,24 @@ async def test_authorized_fetch_with_where_sql_access_control(mock_get_authentic
         mock_get_authenticated_user.return_value = admin_user
 
         # Admin should see both documents
-        result = await sqlstore.fetch_all("documents", policy=default_policy(), where={"id": 1})
+        result = await sqlstore.fetch_all("documents", where={"id": 1})
         assert len(result.data) == 1
         assert result.data[0]["title"] == "Admin Document"
 
         # User should only see their document
         mock_get_authenticated_user.return_value = regular_user
 
-        result = await sqlstore.fetch_all("documents", policy=default_policy(), where={"id": 1})
+        result = await sqlstore.fetch_all("documents", where={"id": 1})
         assert len(result.data) == 0
 
-        result = await sqlstore.fetch_all("documents", policy=default_policy(), where={"id": 2})
+        result = await sqlstore.fetch_all("documents", where={"id": 2})
         assert len(result.data) == 1
         assert result.data[0]["title"] == "User Document"
 
-        row = await sqlstore.fetch_one("documents", policy=default_policy(), where={"id": 1})
+        row = await sqlstore.fetch_one("documents", where={"id": 1})
         assert row is None
 
-        row = await sqlstore.fetch_one("documents", policy=default_policy(), where={"id": 2})
+        row = await sqlstore.fetch_one("documents", where={"id": 2})
         assert row is not None
         assert row["title"] == "User Document"
 
@@ -88,7 +88,7 @@ async def test_sql_policy_consistency(mock_get_authenticated_user):
                 db_path=tmp_dir + "/" + db_name,
             )
         )
-        sqlstore = AuthorizedSqlStore(base_sqlstore)
+        sqlstore = AuthorizedSqlStore(base_sqlstore, default_policy())
 
         await sqlstore.create_table(
             table="resources",
@@ -144,7 +144,7 @@ async def test_sql_policy_consistency(mock_get_authenticated_user):
             user = User(principal=user_data["principal"], attributes=user_data["attributes"])
             mock_get_authenticated_user.return_value = user
 
-            sql_results = await sqlstore.fetch_all("resources", policy=policy)
+            sql_results = await sqlstore.fetch_all("resources")
             sql_ids = {row["id"] for row in sql_results.data}
             policy_ids = set()
             for scenario in test_scenarios:
@@ -174,7 +174,7 @@ async def test_authorized_store_user_attribute_capture(mock_get_authenticated_us
                 db_path=tmp_dir + "/" + db_name,
             )
         )
-        authorized_store = AuthorizedSqlStore(base_sqlstore)
+        authorized_store = AuthorizedSqlStore(base_sqlstore, default_policy())
 
         await authorized_store.create_table(
             table="user_data",

From f0da887e793dce0a084a2239c1cef4ab11cd0156 Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Fri, 19 Sep 2025 15:49:40 -0700
Subject: [PATCH 02/33] chore: introduce write queue for response_store

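# What does this PR do?

Adds an asynchronous write queue to `ResponsesStore`, configured by a new
`ResponsesStoreConfig` (`sql_store_config`, `max_write_queue_size`,
`num_writers`). `store_response_object` enqueues the write and background
worker tasks persist it. The queue is disabled when the backing store is
SQLite, in which case writes stay synchronous. `flush()` and `shutdown()` wait
for queued writes to drain. The constructor still accepts a bare
`SqlStoreConfig`, and the `AuthorizedSqlStore` is now created in
`initialize()` instead of `__init__`.

## Test Plan

The `ResponsesStore` unit tests call `await store.flush()` after storing and
before reading back.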
---
 llama_stack/core/datatypes.py                 |   6 ++
 .../utils/responses/responses_store.py        | 102 ++++++++++++++++--
 .../utils/responses/test_responses_store.py   |  21 ++++
 3 files changed, 123 insertions(+), 6 deletions(-)

diff --git a/llama_stack/core/datatypes.py b/llama_stack/core/datatypes.py
index b5558c66f..6a297f012 100644
--- a/llama_stack/core/datatypes.py
+++ b/llama_stack/core/datatypes.py
@@ -433,6 +433,12 @@ class InferenceStoreConfig(BaseModel):
     num_writers: int = Field(default=4, description="Number of concurrent background writers")
 
 
+class ResponsesStoreConfig(BaseModel):
+    sql_store_config: SqlStoreConfig
+    max_write_queue_size: int = Field(default=10000, description="Max queued writes for responses store")
+    num_writers: int = Field(default=4, description="Number of concurrent background writers")
+
+
 class StackRunConfig(BaseModel):
     version: int = LLAMA_STACK_RUN_CONFIG_VERSION
 
diff --git a/llama_stack/providers/utils/responses/responses_store.py b/llama_stack/providers/utils/responses/responses_store.py
index 04778ed1c..367b8aa94 100644
--- a/llama_stack/providers/utils/responses/responses_store.py
+++ b/llama_stack/providers/utils/responses/responses_store.py
@@ -3,6 +3,9 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+import asyncio
+from typing import Any
+
 from llama_stack.apis.agents import (
     Order,
 )
@@ -14,25 +17,51 @@ from llama_stack.apis.agents.openai_responses import (
     OpenAIResponseObject,
     OpenAIResponseObjectWithInput,
 )
-from llama_stack.core.datatypes import AccessRule
+from llama_stack.core.datatypes import AccessRule, ResponsesStoreConfig
 from llama_stack.core.utils.config_dirs import RUNTIME_BASE_DIR
+from llama_stack.log import get_logger
 
 from ..sqlstore.api import ColumnDefinition, ColumnType
 from ..sqlstore.authorized_sqlstore import AuthorizedSqlStore
-from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, sqlstore_impl
+from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, SqlStoreType, sqlstore_impl
+
+logger = get_logger(name=__name__, category="responses_store")
 
 
 class ResponsesStore:
-    def __init__(self, sql_store_config: SqlStoreConfig, policy: list[AccessRule]):
-        if not sql_store_config:
-            sql_store_config = SqliteSqlStoreConfig(
+    def __init__(
+        self,
+        config: ResponsesStoreConfig | SqlStoreConfig,
+        policy: list[AccessRule],
+    ):
+        # Backward compatibility: callers may still pass a bare SqlStoreConfig
+        # (or a falsy placeholder) instead of a ResponsesStoreConfig.
+        sql_store_config = config.sql_store_config if isinstance(config, ResponsesStoreConfig) else config
+        if not sql_store_config:
+            # Pick the SQLite default *before* wrapping: ResponsesStoreConfig declares
+            # sql_store_config as required, so wrapping a falsy value fails validation.
+            sql_store_config = SqliteSqlStoreConfig(
                 db_path=(RUNTIME_BASE_DIR / "sqlstore.db").as_posix(),
             )
-        self.sql_store = AuthorizedSqlStore(sqlstore_impl(sql_store_config))
+        if not isinstance(config, ResponsesStoreConfig):
+            config = ResponsesStoreConfig(sql_store_config=sql_store_config)
+        self.config = config
+        self.sql_store_config = sql_store_config
+        self.sql_store = None
         self.policy = policy
 
+        # Disable write queue for SQLite to avoid concurrency issues
+        self.enable_write_queue = self.sql_store_config.type != SqlStoreType.sqlite
+
+        # Async write queue and worker control
+        self._queue: asyncio.Queue[tuple[OpenAIResponseObject, list[OpenAIResponseInput]]] | None = None
+        self._worker_tasks: list[asyncio.Task[Any]] = []
+        self._max_write_queue_size: int = config.max_write_queue_size
+        self._num_writers: int = max(1, config.num_writers)
+
     async def initialize(self):
         """Create the necessary tables if they don't exist."""
+        self.sql_store = AuthorizedSqlStore(sqlstore_impl(self.sql_store_config))
         await self.sql_store.create_table(
             "openai_responses",
             {
@@ -43,9 +72,70 @@ class ResponsesStore:
             },
         )
 
+        if self.enable_write_queue:
+            self._queue = asyncio.Queue(maxsize=self._max_write_queue_size)
+            for _ in range(self._num_writers):
+                self._worker_tasks.append(asyncio.create_task(self._worker_loop()))
+        else:
+            logger.info("Write queue disabled for SQLite to avoid concurrency issues")
+
+    async def shutdown(self) -> None:
+        """Drain queued writes, then cancel and reap the background writers."""
+        if not self._worker_tasks:
+            return
+        if self._queue is not None:
+            # Let pending writes finish before the workers are cancelled.
+            await self._queue.join()
+        for task in self._worker_tasks:
+            if not task.done():
+                task.cancel()
+        # gather() reaps every worker without swallowing a cancellation aimed at
+        # shutdown() itself, which a bare `except CancelledError: pass` would do.
+        await asyncio.gather(*self._worker_tasks, return_exceptions=True)
+        self._worker_tasks.clear()
+
+    async def flush(self) -> None:
+        """Wait for all queued writes to complete. Useful for testing."""
+        if self.enable_write_queue and self._queue is not None:
+            await self._queue.join()
+
     async def store_response_object(
         self, response_object: OpenAIResponseObject, input: list[OpenAIResponseInput]
     ) -> None:
+        if self.enable_write_queue:
+            if self._queue is None:
+                raise ValueError("Responses store is not initialized")
+            try:
+                self._queue.put_nowait((response_object, input))
+            except asyncio.QueueFull:
+                logger.warning(
+                    f"Write queue full; adding response id={getattr(response_object, 'id', '<unknown>')}"
+                )
+                await self._queue.put((response_object, input))
+        else:
+            await self._write_response_object(response_object, input)
+
+    async def _worker_loop(self) -> None:
+        assert self._queue is not None
+        while True:
+            try:
+                item = await self._queue.get()
+            except asyncio.CancelledError:
+                break
+            response_object, input = item
+            try:
+                await self._write_response_object(response_object, input)
+            except Exception as e:  # noqa: BLE001
+                logger.error(f"Error writing response object: {e}")
+            finally:
+                self._queue.task_done()
+
+    async def _write_response_object(
+        self, response_object: OpenAIResponseObject, input: list[OpenAIResponseInput]
+    ) -> None:
+        if self.sql_store is None:
+            raise ValueError("Responses store is not initialized")
+
         data = response_object.model_dump()
         data["input"] = [input_item.model_dump() for input_item in input]
 
diff --git a/tests/unit/utils/responses/test_responses_store.py b/tests/unit/utils/responses/test_responses_store.py
index 44d4b30da..4e5256c1b 100644
--- a/tests/unit/utils/responses/test_responses_store.py
+++ b/tests/unit/utils/responses/test_responses_store.py
@@ -67,6 +67,9 @@ async def test_responses_store_pagination_basic():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test 1: First page with limit=2, descending order (default)
         result = await store.list_responses(limit=2, order=Order.desc)
         assert len(result.data) == 2
@@ -110,6 +113,9 @@ async def test_responses_store_pagination_ascending():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test ascending order pagination
         result = await store.list_responses(limit=1, order=Order.asc)
         assert len(result.data) == 1
@@ -145,6 +151,9 @@ async def test_responses_store_pagination_with_model_filter():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test pagination with model filter
         result = await store.list_responses(limit=1, model="model-a", order=Order.desc)
         assert len(result.data) == 1
@@ -192,6 +201,9 @@ async def test_responses_store_pagination_no_limit():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test without limit (should use default of 50)
         result = await store.list_responses(order=Order.desc)
         assert len(result.data) == 2
@@ -212,6 +224,9 @@ async def test_responses_store_get_response_object():
         input_list = [create_test_response_input("Test input content", "input-test-resp")]
         await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Retrieve the response
         retrieved = await store.get_response_object("test-resp")
         assert retrieved.id == "test-resp"
@@ -242,6 +257,9 @@ async def test_responses_store_input_items_pagination():
         ]
         await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Verify all items are stored correctly with explicit IDs
         all_items = await store.list_response_input_items("test-resp", order=Order.desc)
         assert len(all_items.data) == 5
@@ -319,6 +337,9 @@ async def test_responses_store_input_items_before_pagination():
         ]
         await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test before pagination with descending order
         # In desc order: [Fifth, Fourth, Third, Second, First]
         # before="before-3" should return [Fifth, Fourth]

From b93b7798adb380db33a8f0eeb5f742b279339e26 Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Fri, 19 Sep 2025 15:53:26 -0700
Subject: [PATCH 03/33] chore: introduce write queue for response_store

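# What does this PR do?

Adds an asynchronous write queue to `ResponsesStore`, configured by a new
`ResponsesStoreConfig` (`sql_store_config`, `max_write_queue_size`,
`num_writers`). `store_response_object` enqueues the write and background
worker tasks persist it. The queue is disabled when the backing store is
SQLite, in which case writes stay synchronous. `flush()` and `shutdown()` wait
for queued writes to drain. The constructor still accepts a bare
`SqlStoreConfig`, and the `AuthorizedSqlStore` is now created in
`initialize()` instead of `__init__`.

## Test Plan

The `ResponsesStore` unit tests call `await store.flush()` after storing and
before reading back. The diffstat also lists a one-line change to
`meta_reference/test_openai_responses.py`.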
---
 llama_stack/core/datatypes.py                 |   6 ++
 .../utils/responses/responses_store.py        | 102 ++++++++++++++++--
 .../meta_reference/test_openai_responses.py   |   2 +-
 .../utils/responses/test_responses_store.py   |  21 ++++
 4 files changed, 124 insertions(+), 7 deletions(-)

diff --git a/llama_stack/core/datatypes.py b/llama_stack/core/datatypes.py
index b5558c66f..6a297f012 100644
--- a/llama_stack/core/datatypes.py
+++ b/llama_stack/core/datatypes.py
@@ -433,6 +433,12 @@ class InferenceStoreConfig(BaseModel):
     num_writers: int = Field(default=4, description="Number of concurrent background writers")
 
 
+class ResponsesStoreConfig(BaseModel):
+    sql_store_config: SqlStoreConfig
+    max_write_queue_size: int = Field(default=10000, description="Max queued writes for responses store")
+    num_writers: int = Field(default=4, description="Number of concurrent background writers")
+
+
 class StackRunConfig(BaseModel):
     version: int = LLAMA_STACK_RUN_CONFIG_VERSION
 
diff --git a/llama_stack/providers/utils/responses/responses_store.py b/llama_stack/providers/utils/responses/responses_store.py
index 04778ed1c..367b8aa94 100644
--- a/llama_stack/providers/utils/responses/responses_store.py
+++ b/llama_stack/providers/utils/responses/responses_store.py
@@ -3,6 +3,9 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+import asyncio
+from typing import Any
+
 from llama_stack.apis.agents import (
     Order,
 )
@@ -14,25 +17,51 @@ from llama_stack.apis.agents.openai_responses import (
     OpenAIResponseObject,
     OpenAIResponseObjectWithInput,
 )
-from llama_stack.core.datatypes import AccessRule
+from llama_stack.core.datatypes import AccessRule, ResponsesStoreConfig
 from llama_stack.core.utils.config_dirs import RUNTIME_BASE_DIR
+from llama_stack.log import get_logger
 
 from ..sqlstore.api import ColumnDefinition, ColumnType
 from ..sqlstore.authorized_sqlstore import AuthorizedSqlStore
-from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, sqlstore_impl
+from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, SqlStoreType, sqlstore_impl
+
+logger = get_logger(name=__name__, category="responses_store")
 
 
 class ResponsesStore:
-    def __init__(self, sql_store_config: SqlStoreConfig, policy: list[AccessRule]):
-        if not sql_store_config:
-            sql_store_config = SqliteSqlStoreConfig(
+    def __init__(
+        self,
+        config: ResponsesStoreConfig | SqlStoreConfig,
+        policy: list[AccessRule],
+    ):
+        # Backward compatibility: callers may still pass a bare SqlStoreConfig
+        # (or a falsy placeholder) instead of a ResponsesStoreConfig.
+        sql_store_config = config.sql_store_config if isinstance(config, ResponsesStoreConfig) else config
+        if not sql_store_config:
+            # Pick the SQLite default *before* wrapping: ResponsesStoreConfig declares
+            # sql_store_config as required, so wrapping a falsy value fails validation.
+            sql_store_config = SqliteSqlStoreConfig(
                 db_path=(RUNTIME_BASE_DIR / "sqlstore.db").as_posix(),
             )
-        self.sql_store = AuthorizedSqlStore(sqlstore_impl(sql_store_config))
+        if not isinstance(config, ResponsesStoreConfig):
+            config = ResponsesStoreConfig(sql_store_config=sql_store_config)
+        self.config = config
+        self.sql_store_config = sql_store_config
+        self.sql_store = None
         self.policy = policy
 
+        # Disable write queue for SQLite to avoid concurrency issues
+        self.enable_write_queue = self.sql_store_config.type != SqlStoreType.sqlite
+
+        # Async write queue and worker control
+        self._queue: asyncio.Queue[tuple[OpenAIResponseObject, list[OpenAIResponseInput]]] | None = None
+        self._worker_tasks: list[asyncio.Task[Any]] = []
+        self._max_write_queue_size: int = config.max_write_queue_size
+        self._num_writers: int = max(1, config.num_writers)
+
     async def initialize(self):
         """Create the necessary tables if they don't exist."""
+        self.sql_store = AuthorizedSqlStore(sqlstore_impl(self.sql_store_config))
         await self.sql_store.create_table(
             "openai_responses",
             {
@@ -43,9 +72,70 @@ class ResponsesStore:
             },
         )
 
+        if self.enable_write_queue:
+            self._queue = asyncio.Queue(maxsize=self._max_write_queue_size)
+            for _ in range(self._num_writers):
+                self._worker_tasks.append(asyncio.create_task(self._worker_loop()))
+        else:
+            logger.info("Write queue disabled for SQLite to avoid concurrency issues")
+
+    async def shutdown(self) -> None:
+        if not self._worker_tasks:
+            return
+        if self._queue is not None:
+            await self._queue.join()
+        for t in self._worker_tasks:
+            if not t.done():
+                t.cancel()
+        for t in self._worker_tasks:
+            try:
+                await t
+            except asyncio.CancelledError:
+                pass
+        self._worker_tasks.clear()
+
+    async def flush(self) -> None:
+        """Wait for all queued writes to complete. Useful for testing."""
+        if self.enable_write_queue and self._queue is not None:
+            await self._queue.join()
+
     async def store_response_object(
         self, response_object: OpenAIResponseObject, input: list[OpenAIResponseInput]
     ) -> None:
+        if self.enable_write_queue:
+            if self._queue is None:
+                raise ValueError("Responses store is not initialized")
+            try:
+                self._queue.put_nowait((response_object, input))
+            except asyncio.QueueFull:
+                logger.warning(
+                    f"Write queue full; adding response id={getattr(response_object, 'id', '<unknown>')}"
+                )
+                await self._queue.put((response_object, input))
+        else:
+            await self._write_response_object(response_object, input)
+
+    async def _worker_loop(self) -> None:
+        assert self._queue is not None
+        while True:
+            try:
+                item = await self._queue.get()
+            except asyncio.CancelledError:
+                break
+            response_object, input = item
+            try:
+                await self._write_response_object(response_object, input)
+            except Exception as e:  # noqa: BLE001
+                logger.error(f"Error writing response object: {e}")
+            finally:
+                self._queue.task_done()
+
+    async def _write_response_object(
+        self, response_object: OpenAIResponseObject, input: list[OpenAIResponseInput]
+    ) -> None:
+        if self.sql_store is None:
+            raise ValueError("Responses store is not initialized")
+
         data = response_object.model_dump()
         data["input"] = [input_item.model_dump() for input_item in input]
 
diff --git a/tests/unit/providers/agents/meta_reference/test_openai_responses.py b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
index a964bc219..fd128f585 100644
--- a/tests/unit/providers/agents/meta_reference/test_openai_responses.py
+++ b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
@@ -677,7 +677,7 @@ async def test_responses_store_list_input_items_logic():
 
     # Create mock store and response store
     mock_sql_store = AsyncMock()
-    responses_store = ResponsesStore(sql_store_config=None, policy=default_policy())
+    responses_store = ResponsesStore(None, policy=default_policy())
     responses_store.sql_store = mock_sql_store
 
     # Setup test data - multiple input items
diff --git a/tests/unit/utils/responses/test_responses_store.py b/tests/unit/utils/responses/test_responses_store.py
index 44d4b30da..4e5256c1b 100644
--- a/tests/unit/utils/responses/test_responses_store.py
+++ b/tests/unit/utils/responses/test_responses_store.py
@@ -67,6 +67,9 @@ async def test_responses_store_pagination_basic():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test 1: First page with limit=2, descending order (default)
         result = await store.list_responses(limit=2, order=Order.desc)
         assert len(result.data) == 2
@@ -110,6 +113,9 @@ async def test_responses_store_pagination_ascending():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test ascending order pagination
         result = await store.list_responses(limit=1, order=Order.asc)
         assert len(result.data) == 1
@@ -145,6 +151,9 @@ async def test_responses_store_pagination_with_model_filter():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test pagination with model filter
         result = await store.list_responses(limit=1, model="model-a", order=Order.desc)
         assert len(result.data) == 1
@@ -192,6 +201,9 @@ async def test_responses_store_pagination_no_limit():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test without limit (should use default of 50)
         result = await store.list_responses(order=Order.desc)
         assert len(result.data) == 2
@@ -212,6 +224,9 @@ async def test_responses_store_get_response_object():
         input_list = [create_test_response_input("Test input content", "input-test-resp")]
         await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Retrieve the response
         retrieved = await store.get_response_object("test-resp")
         assert retrieved.id == "test-resp"
@@ -242,6 +257,9 @@ async def test_responses_store_input_items_pagination():
         ]
         await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Verify all items are stored correctly with explicit IDs
         all_items = await store.list_response_input_items("test-resp", order=Order.desc)
         assert len(all_items.data) == 5
@@ -319,6 +337,9 @@ async def test_responses_store_input_items_before_pagination():
         ]
         await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test before pagination with descending order
         # In desc order: [Fifth, Fourth, Third, Second, First]
         # before="before-3" should return [Fifth, Fourth]

From b0115674a4e0100c1449720e9e5e8fa4c0fc5f46 Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Fri, 19 Sep 2025 15:59:36 -0700
Subject: [PATCH 04/33] chore: introduce write queue for response_store

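# What does this PR do?

Adds an asynchronous write queue to `ResponsesStore` so that
`store_response_object` no longer performs the database write inline:

- Adds `ResponsesStoreConfig` (`sql_store_config`,
  `max_write_queue_size`, `num_writers`) to
  `llama_stack/core/datatypes.py`.
- `ResponsesStore.__init__` accepts either a `ResponsesStoreConfig` or,
  for backward compatibility, a bare `SqlStoreConfig`, which is wrapped
  in a `ResponsesStoreConfig`.
- Creation of the SQL store moves from `__init__` to `initialize()`,
  which also starts `num_writers` background worker tasks that drain an
  `asyncio.Queue` bounded by `max_write_queue_size`.
- When the queue is full, `store_response_object` logs a warning and
  awaits free space instead of dropping the write.
- The write queue is disabled when the SQL store type is SQLite; in
  that case writes are performed inline as before.
- Adds `shutdown()` (waits for the queue to drain, then cancels the
  workers) and `flush()` (waits for all queued writes to complete).

## Test Plan
Covered by the unit tests touched in this patch:

- `tests/unit/utils/responses/test_responses_store.py` awaits
  `store.flush()` after storing and before reading back.
- `tests/unit/providers/agents/meta_reference/test_openai_responses.py`
  constructs the store from a `ResponsesStoreConfig`.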
---
 llama_stack/core/datatypes.py                 |   6 ++
 .../utils/responses/responses_store.py        | 102 ++++++++++++++++--
 .../meta_reference/test_openai_responses.py   |   3 +-
 .../utils/responses/test_responses_store.py   |  21 ++++
 4 files changed, 125 insertions(+), 7 deletions(-)

diff --git a/llama_stack/core/datatypes.py b/llama_stack/core/datatypes.py
index b5558c66f..6a297f012 100644
--- a/llama_stack/core/datatypes.py
+++ b/llama_stack/core/datatypes.py
@@ -433,6 +433,12 @@ class InferenceStoreConfig(BaseModel):
     num_writers: int = Field(default=4, description="Number of concurrent background writers")
 
 
+class ResponsesStoreConfig(BaseModel):
+    sql_store_config: SqlStoreConfig
+    max_write_queue_size: int = Field(default=10000, description="Max queued writes for responses store")
+    num_writers: int = Field(default=4, description="Number of concurrent background writers")
+
+
 class StackRunConfig(BaseModel):
     version: int = LLAMA_STACK_RUN_CONFIG_VERSION
 
diff --git a/llama_stack/providers/utils/responses/responses_store.py b/llama_stack/providers/utils/responses/responses_store.py
index 04778ed1c..367b8aa94 100644
--- a/llama_stack/providers/utils/responses/responses_store.py
+++ b/llama_stack/providers/utils/responses/responses_store.py
@@ -3,6 +3,9 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+import asyncio
+from typing import Any
+
 from llama_stack.apis.agents import (
     Order,
 )
@@ -14,25 +17,51 @@ from llama_stack.apis.agents.openai_responses import (
     OpenAIResponseObject,
     OpenAIResponseObjectWithInput,
 )
-from llama_stack.core.datatypes import AccessRule
+from llama_stack.core.datatypes import AccessRule, ResponsesStoreConfig
 from llama_stack.core.utils.config_dirs import RUNTIME_BASE_DIR
+from llama_stack.log import get_logger
 
 from ..sqlstore.api import ColumnDefinition, ColumnType
 from ..sqlstore.authorized_sqlstore import AuthorizedSqlStore
-from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, sqlstore_impl
+from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, SqlStoreType, sqlstore_impl
+
+logger = get_logger(name=__name__, category="responses_store")
 
 
 class ResponsesStore:
-    def __init__(self, sql_store_config: SqlStoreConfig, policy: list[AccessRule]):
-        if not sql_store_config:
-            sql_store_config = SqliteSqlStoreConfig(
+    def __init__(
+        self,
+        config: ResponsesStoreConfig | SqlStoreConfig,
+        policy: list[AccessRule],
+    ):
+        # Handle backward compatibility
+        if not isinstance(config, ResponsesStoreConfig):
+            # Legacy: SqlStoreConfig passed directly as config
+            config = ResponsesStoreConfig(
+                sql_store_config=config,
+            )
+
+        self.config = config
+        self.sql_store_config = config.sql_store_config
+        if not self.sql_store_config:
+            self.sql_store_config = SqliteSqlStoreConfig(
                 db_path=(RUNTIME_BASE_DIR / "sqlstore.db").as_posix(),
             )
-        self.sql_store = AuthorizedSqlStore(sqlstore_impl(sql_store_config))
+        self.sql_store = None
         self.policy = policy
 
+        # Disable write queue for SQLite to avoid concurrency issues
+        self.enable_write_queue = self.sql_store_config.type != SqlStoreType.sqlite
+
+        # Async write queue and worker control
+        self._queue: asyncio.Queue[tuple[OpenAIResponseObject, list[OpenAIResponseInput]]] | None = None
+        self._worker_tasks: list[asyncio.Task[Any]] = []
+        self._max_write_queue_size: int = config.max_write_queue_size
+        self._num_writers: int = max(1, config.num_writers)
+
     async def initialize(self):
         """Create the necessary tables if they don't exist."""
+        self.sql_store = AuthorizedSqlStore(sqlstore_impl(self.sql_store_config))
         await self.sql_store.create_table(
             "openai_responses",
             {
@@ -43,9 +72,70 @@ class ResponsesStore:
             },
         )
 
+        if self.enable_write_queue:
+            self._queue = asyncio.Queue(maxsize=self._max_write_queue_size)
+            for _ in range(self._num_writers):
+                self._worker_tasks.append(asyncio.create_task(self._worker_loop()))
+        else:
+            logger.info("Write queue disabled for SQLite to avoid concurrency issues")
+
+    async def shutdown(self) -> None:
+        if not self._worker_tasks:
+            return
+        if self._queue is not None:
+            await self._queue.join()
+        for t in self._worker_tasks:
+            if not t.done():
+                t.cancel()
+        for t in self._worker_tasks:
+            try:
+                await t
+            except asyncio.CancelledError:
+                pass
+        self._worker_tasks.clear()
+
+    async def flush(self) -> None:
+        """Wait for all queued writes to complete. Useful for testing."""
+        if self.enable_write_queue and self._queue is not None:
+            await self._queue.join()
+
     async def store_response_object(
         self, response_object: OpenAIResponseObject, input: list[OpenAIResponseInput]
     ) -> None:
+        if self.enable_write_queue:
+            if self._queue is None:
+                raise ValueError("Responses store is not initialized")
+            try:
+                self._queue.put_nowait((response_object, input))
+            except asyncio.QueueFull:
+                logger.warning(
+                    f"Write queue full; adding response id={getattr(response_object, 'id', '<unknown>')}"
+                )
+                await self._queue.put((response_object, input))
+        else:
+            await self._write_response_object(response_object, input)
+
+    async def _worker_loop(self) -> None:
+        assert self._queue is not None
+        while True:
+            try:
+                item = await self._queue.get()
+            except asyncio.CancelledError:
+                break
+            response_object, input = item
+            try:
+                await self._write_response_object(response_object, input)
+            except Exception as e:  # noqa: BLE001
+                logger.error(f"Error writing response object: {e}")
+            finally:
+                self._queue.task_done()
+
+    async def _write_response_object(
+        self, response_object: OpenAIResponseObject, input: list[OpenAIResponseInput]
+    ) -> None:
+        if self.sql_store is None:
+            raise ValueError("Responses store is not initialized")
+
         data = response_object.model_dump()
         data["input"] = [input_item.model_dump() for input_item in input]
 
diff --git a/tests/unit/providers/agents/meta_reference/test_openai_responses.py b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
index a964bc219..df89986af 100644
--- a/tests/unit/providers/agents/meta_reference/test_openai_responses.py
+++ b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
@@ -42,6 +42,7 @@ from llama_stack.apis.inference import (
 )
 from llama_stack.apis.tools.tools import Tool, ToolGroups, ToolInvocationResult, ToolParameter, ToolRuntime
 from llama_stack.core.access_control.access_control import default_policy
+from llama_stack.core.datatypes import ResponsesStoreConfig
 from llama_stack.providers.inline.agents.meta_reference.responses.openai_responses import (
     OpenAIResponsesImpl,
 )
@@ -677,7 +678,7 @@ async def test_responses_store_list_input_items_logic():
 
     # Create mock store and response store
     mock_sql_store = AsyncMock()
-    responses_store = ResponsesStore(sql_store_config=None, policy=default_policy())
+    responses_store = ResponsesStore(ResponsesStoreConfig(), policy=default_policy())
     responses_store.sql_store = mock_sql_store
 
     # Setup test data - multiple input items
diff --git a/tests/unit/utils/responses/test_responses_store.py b/tests/unit/utils/responses/test_responses_store.py
index 44d4b30da..4e5256c1b 100644
--- a/tests/unit/utils/responses/test_responses_store.py
+++ b/tests/unit/utils/responses/test_responses_store.py
@@ -67,6 +67,9 @@ async def test_responses_store_pagination_basic():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test 1: First page with limit=2, descending order (default)
         result = await store.list_responses(limit=2, order=Order.desc)
         assert len(result.data) == 2
@@ -110,6 +113,9 @@ async def test_responses_store_pagination_ascending():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test ascending order pagination
         result = await store.list_responses(limit=1, order=Order.asc)
         assert len(result.data) == 1
@@ -145,6 +151,9 @@ async def test_responses_store_pagination_with_model_filter():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test pagination with model filter
         result = await store.list_responses(limit=1, model="model-a", order=Order.desc)
         assert len(result.data) == 1
@@ -192,6 +201,9 @@ async def test_responses_store_pagination_no_limit():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test without limit (should use default of 50)
         result = await store.list_responses(order=Order.desc)
         assert len(result.data) == 2
@@ -212,6 +224,9 @@ async def test_responses_store_get_response_object():
         input_list = [create_test_response_input("Test input content", "input-test-resp")]
         await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Retrieve the response
         retrieved = await store.get_response_object("test-resp")
         assert retrieved.id == "test-resp"
@@ -242,6 +257,9 @@ async def test_responses_store_input_items_pagination():
         ]
         await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Verify all items are stored correctly with explicit IDs
         all_items = await store.list_response_input_items("test-resp", order=Order.desc)
         assert len(all_items.data) == 5
@@ -319,6 +337,9 @@ async def test_responses_store_input_items_before_pagination():
         ]
         await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test before pagination with descending order
         # In desc order: [Fifth, Fourth, Third, Second, First]
         # before="before-3" should return [Fifth, Fourth]

From 7660ba844f64d01d40bbf9631f309acb05067cfd Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Fri, 19 Sep 2025 16:02:02 -0700
Subject: [PATCH 05/33] chore: introduce write queue for response_store

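# What does this PR do?

Adds an asynchronous write queue to `ResponsesStore` so that
`store_response_object` no longer performs the database write inline:

- Adds `ResponsesStoreConfig` (`sql_store_config`,
  `max_write_queue_size`, `num_writers`) to
  `llama_stack/core/datatypes.py`.
- `ResponsesStore.__init__` accepts either a `ResponsesStoreConfig` or,
  for backward compatibility, a bare `SqlStoreConfig`, which is wrapped
  in a `ResponsesStoreConfig`.
- Creation of the SQL store moves from `__init__` to `initialize()`,
  which also starts `num_writers` background worker tasks that drain an
  `asyncio.Queue` bounded by `max_write_queue_size`.
- When the queue is full, `store_response_object` logs a warning and
  awaits free space instead of dropping the write.
- The write queue is disabled when the SQL store type is SQLite; in
  that case writes are performed inline as before.
- Adds `shutdown()` (waits for the queue to drain, then cancels the
  workers) and `flush()` (waits for all queued writes to complete).

## Test Plan
Covered by the unit tests touched in this patch:

- `tests/unit/utils/responses/test_responses_store.py` awaits
  `store.flush()` after storing and before reading back.
- `tests/unit/providers/agents/meta_reference/test_openai_responses.py`
  constructs the store from a `ResponsesStoreConfig` that wraps a
  `SqliteSqlStoreConfig`.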
---
 llama_stack/core/datatypes.py                 |   6 ++
 .../utils/responses/responses_store.py        | 102 ++++++++++++++++--
 .../meta_reference/test_openai_responses.py   |   6 +-
 .../utils/responses/test_responses_store.py   |  21 ++++
 4 files changed, 128 insertions(+), 7 deletions(-)

diff --git a/llama_stack/core/datatypes.py b/llama_stack/core/datatypes.py
index b5558c66f..6a297f012 100644
--- a/llama_stack/core/datatypes.py
+++ b/llama_stack/core/datatypes.py
@@ -433,6 +433,12 @@ class InferenceStoreConfig(BaseModel):
     num_writers: int = Field(default=4, description="Number of concurrent background writers")
 
 
+class ResponsesStoreConfig(BaseModel):
+    sql_store_config: SqlStoreConfig
+    max_write_queue_size: int = Field(default=10000, description="Max queued writes for responses store")
+    num_writers: int = Field(default=4, description="Number of concurrent background writers")
+
+
 class StackRunConfig(BaseModel):
     version: int = LLAMA_STACK_RUN_CONFIG_VERSION
 
diff --git a/llama_stack/providers/utils/responses/responses_store.py b/llama_stack/providers/utils/responses/responses_store.py
index 04778ed1c..367b8aa94 100644
--- a/llama_stack/providers/utils/responses/responses_store.py
+++ b/llama_stack/providers/utils/responses/responses_store.py
@@ -3,6 +3,9 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+import asyncio
+from typing import Any
+
 from llama_stack.apis.agents import (
     Order,
 )
@@ -14,25 +17,51 @@ from llama_stack.apis.agents.openai_responses import (
     OpenAIResponseObject,
     OpenAIResponseObjectWithInput,
 )
-from llama_stack.core.datatypes import AccessRule
+from llama_stack.core.datatypes import AccessRule, ResponsesStoreConfig
 from llama_stack.core.utils.config_dirs import RUNTIME_BASE_DIR
+from llama_stack.log import get_logger
 
 from ..sqlstore.api import ColumnDefinition, ColumnType
 from ..sqlstore.authorized_sqlstore import AuthorizedSqlStore
-from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, sqlstore_impl
+from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, SqlStoreType, sqlstore_impl
+
+logger = get_logger(name=__name__, category="responses_store")
 
 
 class ResponsesStore:
-    def __init__(self, sql_store_config: SqlStoreConfig, policy: list[AccessRule]):
-        if not sql_store_config:
-            sql_store_config = SqliteSqlStoreConfig(
+    def __init__(
+        self,
+        config: ResponsesStoreConfig | SqlStoreConfig,
+        policy: list[AccessRule],
+    ):
+        # Handle backward compatibility
+        if not isinstance(config, ResponsesStoreConfig):
+            # Legacy: SqlStoreConfig passed directly as config
+            config = ResponsesStoreConfig(
+                sql_store_config=config,
+            )
+
+        self.config = config
+        self.sql_store_config = config.sql_store_config
+        if not self.sql_store_config:
+            self.sql_store_config = SqliteSqlStoreConfig(
                 db_path=(RUNTIME_BASE_DIR / "sqlstore.db").as_posix(),
             )
-        self.sql_store = AuthorizedSqlStore(sqlstore_impl(sql_store_config))
+        self.sql_store = None
         self.policy = policy
 
+        # Disable write queue for SQLite to avoid concurrency issues
+        self.enable_write_queue = self.sql_store_config.type != SqlStoreType.sqlite
+
+        # Async write queue and worker control
+        self._queue: asyncio.Queue[tuple[OpenAIResponseObject, list[OpenAIResponseInput]]] | None = None
+        self._worker_tasks: list[asyncio.Task[Any]] = []
+        self._max_write_queue_size: int = config.max_write_queue_size
+        self._num_writers: int = max(1, config.num_writers)
+
     async def initialize(self):
         """Create the necessary tables if they don't exist."""
+        self.sql_store = AuthorizedSqlStore(sqlstore_impl(self.sql_store_config))
         await self.sql_store.create_table(
             "openai_responses",
             {
@@ -43,9 +72,70 @@ class ResponsesStore:
             },
         )
 
+        if self.enable_write_queue:
+            self._queue = asyncio.Queue(maxsize=self._max_write_queue_size)
+            for _ in range(self._num_writers):
+                self._worker_tasks.append(asyncio.create_task(self._worker_loop()))
+        else:
+            logger.info("Write queue disabled for SQLite to avoid concurrency issues")
+
+    async def shutdown(self) -> None:
+        if not self._worker_tasks:
+            return
+        if self._queue is not None:
+            await self._queue.join()
+        for t in self._worker_tasks:
+            if not t.done():
+                t.cancel()
+        for t in self._worker_tasks:
+            try:
+                await t
+            except asyncio.CancelledError:
+                pass
+        self._worker_tasks.clear()
+
+    async def flush(self) -> None:
+        """Wait for all queued writes to complete. Useful for testing."""
+        if self.enable_write_queue and self._queue is not None:
+            await self._queue.join()
+
     async def store_response_object(
         self, response_object: OpenAIResponseObject, input: list[OpenAIResponseInput]
     ) -> None:
+        if self.enable_write_queue:
+            if self._queue is None:
+                raise ValueError("Responses store is not initialized")
+            try:
+                self._queue.put_nowait((response_object, input))
+            except asyncio.QueueFull:
+                logger.warning(
+                    f"Write queue full; adding response id={getattr(response_object, 'id', '<unknown>')}"
+                )
+                await self._queue.put((response_object, input))
+        else:
+            await self._write_response_object(response_object, input)
+
+    async def _worker_loop(self) -> None:
+        assert self._queue is not None
+        while True:
+            try:
+                item = await self._queue.get()
+            except asyncio.CancelledError:
+                break
+            response_object, input = item
+            try:
+                await self._write_response_object(response_object, input)
+            except Exception as e:  # noqa: BLE001
+                logger.error(f"Error writing response object: {e}")
+            finally:
+                self._queue.task_done()
+
+    async def _write_response_object(
+        self, response_object: OpenAIResponseObject, input: list[OpenAIResponseInput]
+    ) -> None:
+        if self.sql_store is None:
+            raise ValueError("Responses store is not initialized")
+
         data = response_object.model_dump()
         data["input"] = [input_item.model_dump() for input_item in input]
 
diff --git a/tests/unit/providers/agents/meta_reference/test_openai_responses.py b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
index a964bc219..e467e910d 100644
--- a/tests/unit/providers/agents/meta_reference/test_openai_responses.py
+++ b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
@@ -42,10 +42,12 @@ from llama_stack.apis.inference import (
 )
 from llama_stack.apis.tools.tools import Tool, ToolGroups, ToolInvocationResult, ToolParameter, ToolRuntime
 from llama_stack.core.access_control.access_control import default_policy
+from llama_stack.core.datatypes import ResponsesStoreConfig
 from llama_stack.providers.inline.agents.meta_reference.responses.openai_responses import (
     OpenAIResponsesImpl,
 )
 from llama_stack.providers.utils.responses.responses_store import ResponsesStore
+from llama_stack.providers.utils.sqlstore.sqlstore import SqliteSqlStoreConfig
 from tests.unit.providers.agents.meta_reference.fixtures import load_chat_completion_fixture
 
 
@@ -677,7 +679,9 @@ async def test_responses_store_list_input_items_logic():
 
     # Create mock store and response store
     mock_sql_store = AsyncMock()
-    responses_store = ResponsesStore(sql_store_config=None, policy=default_policy())
+    responses_store = ResponsesStore(
+        ResponsesStoreConfig(SqliteSqlStoreConfig(db_path="mock_db_path")), policy=default_policy()
+    )
     responses_store.sql_store = mock_sql_store
 
     # Setup test data - multiple input items
diff --git a/tests/unit/utils/responses/test_responses_store.py b/tests/unit/utils/responses/test_responses_store.py
index 44d4b30da..4e5256c1b 100644
--- a/tests/unit/utils/responses/test_responses_store.py
+++ b/tests/unit/utils/responses/test_responses_store.py
@@ -67,6 +67,9 @@ async def test_responses_store_pagination_basic():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test 1: First page with limit=2, descending order (default)
         result = await store.list_responses(limit=2, order=Order.desc)
         assert len(result.data) == 2
@@ -110,6 +113,9 @@ async def test_responses_store_pagination_ascending():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test ascending order pagination
         result = await store.list_responses(limit=1, order=Order.asc)
         assert len(result.data) == 1
@@ -145,6 +151,9 @@ async def test_responses_store_pagination_with_model_filter():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test pagination with model filter
         result = await store.list_responses(limit=1, model="model-a", order=Order.desc)
         assert len(result.data) == 1
@@ -192,6 +201,9 @@ async def test_responses_store_pagination_no_limit():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test without limit (should use default of 50)
         result = await store.list_responses(order=Order.desc)
         assert len(result.data) == 2
@@ -212,6 +224,9 @@ async def test_responses_store_get_response_object():
         input_list = [create_test_response_input("Test input content", "input-test-resp")]
         await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Retrieve the response
         retrieved = await store.get_response_object("test-resp")
         assert retrieved.id == "test-resp"
@@ -242,6 +257,9 @@ async def test_responses_store_input_items_pagination():
         ]
         await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Verify all items are stored correctly with explicit IDs
         all_items = await store.list_response_input_items("test-resp", order=Order.desc)
         assert len(all_items.data) == 5
@@ -319,6 +337,9 @@ async def test_responses_store_input_items_before_pagination():
         ]
         await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test before pagination with descending order
         # In desc order: [Fifth, Fourth, Third, Second, First]
         # before="before-3" should return [Fifth, Fourth]

From 04fd837d2fc26a26e1655c4ba80cc3652eab3d2b Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Fri, 19 Sep 2025 16:13:43 -0700
Subject: [PATCH 06/33] chore: introduce write queue for response_store

# What does this PR do?


## Test Plan
---
 llama_stack/core/datatypes.py                 |   6 ++
 .../utils/responses/responses_store.py        | 102 ++++++++++++++++--
 .../meta_reference/test_openai_responses.py   |   7 +-
 .../utils/responses/test_responses_store.py   |  21 ++++
 4 files changed, 129 insertions(+), 7 deletions(-)

diff --git a/llama_stack/core/datatypes.py b/llama_stack/core/datatypes.py
index b5558c66f..6a297f012 100644
--- a/llama_stack/core/datatypes.py
+++ b/llama_stack/core/datatypes.py
@@ -433,6 +433,12 @@ class InferenceStoreConfig(BaseModel):
     num_writers: int = Field(default=4, description="Number of concurrent background writers")
 
 
+class ResponsesStoreConfig(BaseModel):
+    sql_store_config: SqlStoreConfig
+    max_write_queue_size: int = Field(default=10000, description="Max queued writes for responses store")
+    num_writers: int = Field(default=4, description="Number of concurrent background writers")
+
+
 class StackRunConfig(BaseModel):
     version: int = LLAMA_STACK_RUN_CONFIG_VERSION
 
diff --git a/llama_stack/providers/utils/responses/responses_store.py b/llama_stack/providers/utils/responses/responses_store.py
index 04778ed1c..367b8aa94 100644
--- a/llama_stack/providers/utils/responses/responses_store.py
+++ b/llama_stack/providers/utils/responses/responses_store.py
@@ -3,6 +3,9 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+import asyncio
+from typing import Any
+
 from llama_stack.apis.agents import (
     Order,
 )
@@ -14,25 +17,51 @@ from llama_stack.apis.agents.openai_responses import (
     OpenAIResponseObject,
     OpenAIResponseObjectWithInput,
 )
-from llama_stack.core.datatypes import AccessRule
+from llama_stack.core.datatypes import AccessRule, ResponsesStoreConfig
 from llama_stack.core.utils.config_dirs import RUNTIME_BASE_DIR
+from llama_stack.log import get_logger
 
 from ..sqlstore.api import ColumnDefinition, ColumnType
 from ..sqlstore.authorized_sqlstore import AuthorizedSqlStore
-from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, sqlstore_impl
+from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, SqlStoreType, sqlstore_impl
+
+logger = get_logger(name=__name__, category="responses_store")
 
 
 class ResponsesStore:
-    def __init__(self, sql_store_config: SqlStoreConfig, policy: list[AccessRule]):
-        if not sql_store_config:
-            sql_store_config = SqliteSqlStoreConfig(
+    def __init__(
+        self,
+        config: ResponsesStoreConfig | SqlStoreConfig,
+        policy: list[AccessRule],
+    ):
+        # Handle backward compatibility
+        if not isinstance(config, ResponsesStoreConfig):
+            # Legacy: SqlStoreConfig passed directly as config
+            config = ResponsesStoreConfig(
+                sql_store_config=config,
+            )
+
+        self.config = config
+        self.sql_store_config = config.sql_store_config
+        if not self.sql_store_config:
+            self.sql_store_config = SqliteSqlStoreConfig(
                 db_path=(RUNTIME_BASE_DIR / "sqlstore.db").as_posix(),
             )
-        self.sql_store = AuthorizedSqlStore(sqlstore_impl(sql_store_config))
+        self.sql_store = None
         self.policy = policy
 
+        # Disable write queue for SQLite to avoid concurrency issues
+        self.enable_write_queue = self.sql_store_config.type != SqlStoreType.sqlite
+
+        # Async write queue and worker control
+        self._queue: asyncio.Queue[tuple[OpenAIResponseObject, list[OpenAIResponseInput]]] | None = None
+        self._worker_tasks: list[asyncio.Task[Any]] = []
+        self._max_write_queue_size: int = config.max_write_queue_size
+        self._num_writers: int = max(1, config.num_writers)
+
     async def initialize(self):
         """Create the necessary tables if they don't exist."""
+        self.sql_store = AuthorizedSqlStore(sqlstore_impl(self.sql_store_config))
         await self.sql_store.create_table(
             "openai_responses",
             {
@@ -43,9 +72,70 @@ class ResponsesStore:
             },
         )
 
+        if self.enable_write_queue:
+            self._queue = asyncio.Queue(maxsize=self._max_write_queue_size)
+            for _ in range(self._num_writers):
+                self._worker_tasks.append(asyncio.create_task(self._worker_loop()))
+        else:
+            logger.info("Write queue disabled for SQLite to avoid concurrency issues")
+
+    async def shutdown(self) -> None:
+        if not self._worker_tasks:
+            return
+        if self._queue is not None:
+            await self._queue.join()
+        for t in self._worker_tasks:
+            if not t.done():
+                t.cancel()
+        for t in self._worker_tasks:
+            try:
+                await t
+            except asyncio.CancelledError:
+                pass
+        self._worker_tasks.clear()
+
+    async def flush(self) -> None:
+        """Wait for all queued writes to complete. Useful for testing."""
+        if self.enable_write_queue and self._queue is not None:
+            await self._queue.join()
+
     async def store_response_object(
         self, response_object: OpenAIResponseObject, input: list[OpenAIResponseInput]
     ) -> None:
+        if self.enable_write_queue:
+            if self._queue is None:
+                raise ValueError("Responses store is not initialized")
+            try:
+                self._queue.put_nowait((response_object, input))
+            except asyncio.QueueFull:
+                logger.warning(
+                    f"Write queue full; adding response id={getattr(response_object, 'id', '<unknown>')}"
+                )
+                await self._queue.put((response_object, input))
+        else:
+            await self._write_response_object(response_object, input)
+
+    async def _worker_loop(self) -> None:
+        assert self._queue is not None
+        while True:
+            try:
+                item = await self._queue.get()
+            except asyncio.CancelledError:
+                break
+            response_object, input = item
+            try:
+                await self._write_response_object(response_object, input)
+            except Exception as e:  # noqa: BLE001
+                logger.error(f"Error writing response object: {e}")
+            finally:
+                self._queue.task_done()
+
+    async def _write_response_object(
+        self, response_object: OpenAIResponseObject, input: list[OpenAIResponseInput]
+    ) -> None:
+        if self.sql_store is None:
+            raise ValueError("Responses store is not initialized")
+
         data = response_object.model_dump()
         data["input"] = [input_item.model_dump() for input_item in input]
 
diff --git a/tests/unit/providers/agents/meta_reference/test_openai_responses.py b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
index a964bc219..67ab87504 100644
--- a/tests/unit/providers/agents/meta_reference/test_openai_responses.py
+++ b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
@@ -42,10 +42,12 @@ from llama_stack.apis.inference import (
 )
 from llama_stack.apis.tools.tools import Tool, ToolGroups, ToolInvocationResult, ToolParameter, ToolRuntime
 from llama_stack.core.access_control.access_control import default_policy
+from llama_stack.core.datatypes import ResponsesStoreConfig
 from llama_stack.providers.inline.agents.meta_reference.responses.openai_responses import (
     OpenAIResponsesImpl,
 )
 from llama_stack.providers.utils.responses.responses_store import ResponsesStore
+from llama_stack.providers.utils.sqlstore.sqlstore import SqliteSqlStoreConfig
 from tests.unit.providers.agents.meta_reference.fixtures import load_chat_completion_fixture
 
 
@@ -677,7 +679,10 @@ async def test_responses_store_list_input_items_logic():
 
     # Create mock store and response store
     mock_sql_store = AsyncMock()
-    responses_store = ResponsesStore(sql_store_config=None, policy=default_policy())
+    responses_store = ResponsesStore(
+        ResponsesStoreConfig(sql_store_config=SqliteSqlStoreConfig(db_path="mock_db_path")),
+        policy=default_policy()
+    )
     responses_store.sql_store = mock_sql_store
 
     # Setup test data - multiple input items
diff --git a/tests/unit/utils/responses/test_responses_store.py b/tests/unit/utils/responses/test_responses_store.py
index 44d4b30da..4e5256c1b 100644
--- a/tests/unit/utils/responses/test_responses_store.py
+++ b/tests/unit/utils/responses/test_responses_store.py
@@ -67,6 +67,9 @@ async def test_responses_store_pagination_basic():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test 1: First page with limit=2, descending order (default)
         result = await store.list_responses(limit=2, order=Order.desc)
         assert len(result.data) == 2
@@ -110,6 +113,9 @@ async def test_responses_store_pagination_ascending():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test ascending order pagination
         result = await store.list_responses(limit=1, order=Order.asc)
         assert len(result.data) == 1
@@ -145,6 +151,9 @@ async def test_responses_store_pagination_with_model_filter():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test pagination with model filter
         result = await store.list_responses(limit=1, model="model-a", order=Order.desc)
         assert len(result.data) == 1
@@ -192,6 +201,9 @@ async def test_responses_store_pagination_no_limit():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test without limit (should use default of 50)
         result = await store.list_responses(order=Order.desc)
         assert len(result.data) == 2
@@ -212,6 +224,9 @@ async def test_responses_store_get_response_object():
         input_list = [create_test_response_input("Test input content", "input-test-resp")]
         await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Retrieve the response
         retrieved = await store.get_response_object("test-resp")
         assert retrieved.id == "test-resp"
@@ -242,6 +257,9 @@ async def test_responses_store_input_items_pagination():
         ]
         await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Verify all items are stored correctly with explicit IDs
         all_items = await store.list_response_input_items("test-resp", order=Order.desc)
         assert len(all_items.data) == 5
@@ -319,6 +337,9 @@ async def test_responses_store_input_items_before_pagination():
         ]
         await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test before pagination with descending order
         # In desc order: [Fifth, Fourth, Third, Second, First]
         # before="before-3" should return [Fifth, Fourth]

From ce9a62aa840933e83c47825a0d207da02c4fc153 Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Sun, 21 Sep 2025 20:37:58 -0700
Subject: [PATCH 07/33] chore: introduce write queue for response_store

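# What does this PR do?

Introduces an asynchronous write queue for `ResponsesStore`:

* Adds `ResponsesStoreConfig` (`sql_store_config`, `max_write_queue_size`
  defaulting to 10000, `num_writers` defaulting to 4) to
  `llama_stack/core/datatypes.py`.
* `ResponsesStore.__init__` now takes a `ResponsesStoreConfig`; a bare
  `SqlStoreConfig` is still accepted and wrapped for backward compatibility.
  The `AuthorizedSqlStore` is now created in `initialize()` rather than in
  the constructor.
* `store_response_object` enqueues the write on an `asyncio.Queue` that is
  drained by `num_writers` background worker tasks. When the queue is full a
  warning is logged and the caller waits for free space. Errors raised while
  writing are logged by the worker and not propagated to the caller.
* The write queue is disabled for SQLite; in that case writes happen inline.
* Adds `flush()` (wait for queued writes) and `shutdown()` (drain the queue,
  then cancel the workers).

## Test Plan

Unit tests updated:

* `tests/unit/utils/responses/test_responses_store.py` calls
  `await store.flush()` after storing and before reading back.
* `tests/unit/providers/agents/meta_reference/test_openai_responses.py`
  builds `ResponsesStore` from a `ResponsesStoreConfig` wrapping a
  `SqliteSqlStoreConfig`.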
---
 llama_stack/core/datatypes.py                 |   6 ++
 .../utils/responses/responses_store.py        | 100 ++++++++++++++++--
 .../meta_reference/test_openai_responses.py   |   6 +-
 .../utils/responses/test_responses_store.py   |  21 ++++
 4 files changed, 126 insertions(+), 7 deletions(-)

diff --git a/llama_stack/core/datatypes.py b/llama_stack/core/datatypes.py
index b5558c66f..6a297f012 100644
--- a/llama_stack/core/datatypes.py
+++ b/llama_stack/core/datatypes.py
@@ -433,6 +433,12 @@ class InferenceStoreConfig(BaseModel):
     num_writers: int = Field(default=4, description="Number of concurrent background writers")
 
 
+class ResponsesStoreConfig(BaseModel):
+    sql_store_config: SqlStoreConfig
+    max_write_queue_size: int = Field(default=10000, description="Max queued writes for responses store")
+    num_writers: int = Field(default=4, description="Number of concurrent background writers")
+
+
 class StackRunConfig(BaseModel):
     version: int = LLAMA_STACK_RUN_CONFIG_VERSION
 
diff --git a/llama_stack/providers/utils/responses/responses_store.py b/llama_stack/providers/utils/responses/responses_store.py
index 04778ed1c..f952d0880 100644
--- a/llama_stack/providers/utils/responses/responses_store.py
+++ b/llama_stack/providers/utils/responses/responses_store.py
@@ -3,6 +3,9 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+import asyncio
+from typing import Any
+
 from llama_stack.apis.agents import (
     Order,
 )
@@ -14,25 +17,51 @@ from llama_stack.apis.agents.openai_responses import (
     OpenAIResponseObject,
     OpenAIResponseObjectWithInput,
 )
-from llama_stack.core.datatypes import AccessRule
+from llama_stack.core.datatypes import AccessRule, ResponsesStoreConfig
 from llama_stack.core.utils.config_dirs import RUNTIME_BASE_DIR
+from llama_stack.log import get_logger
 
 from ..sqlstore.api import ColumnDefinition, ColumnType
 from ..sqlstore.authorized_sqlstore import AuthorizedSqlStore
-from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, sqlstore_impl
+from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, SqlStoreType, sqlstore_impl
+
+logger = get_logger(name=__name__, category="responses_store")
 
 
 class ResponsesStore:
-    def __init__(self, sql_store_config: SqlStoreConfig, policy: list[AccessRule]):
-        if not sql_store_config:
-            sql_store_config = SqliteSqlStoreConfig(
+    def __init__(
+        self,
+        config: ResponsesStoreConfig | SqlStoreConfig,
+        policy: list[AccessRule],
+    ):
+        # Handle backward compatibility
+        if not isinstance(config, ResponsesStoreConfig):
+            # Legacy: SqlStoreConfig passed directly as config
+            config = ResponsesStoreConfig(
+                sql_store_config=config,
+            )
+
+        self.config = config
+        self.sql_store_config = config.sql_store_config
+        if not self.sql_store_config:
+            self.sql_store_config = SqliteSqlStoreConfig(
                 db_path=(RUNTIME_BASE_DIR / "sqlstore.db").as_posix(),
             )
-        self.sql_store = AuthorizedSqlStore(sqlstore_impl(sql_store_config))
+        self.sql_store = None
         self.policy = policy
 
+        # Disable write queue for SQLite to avoid concurrency issues
+        self.enable_write_queue = self.sql_store_config.type != SqlStoreType.sqlite
+
+        # Async write queue and worker control
+        self._queue: asyncio.Queue[tuple[OpenAIResponseObject, list[OpenAIResponseInput]]] | None = None
+        self._worker_tasks: list[asyncio.Task[Any]] = []
+        self._max_write_queue_size: int = config.max_write_queue_size
+        self._num_writers: int = max(1, config.num_writers)
+
     async def initialize(self):
         """Create the necessary tables if they don't exist."""
+        self.sql_store = AuthorizedSqlStore(sqlstore_impl(self.sql_store_config))
         await self.sql_store.create_table(
             "openai_responses",
             {
@@ -43,9 +72,68 @@ class ResponsesStore:
             },
         )
 
+        if self.enable_write_queue:
+            self._queue = asyncio.Queue(maxsize=self._max_write_queue_size)
+            for _ in range(self._num_writers):
+                self._worker_tasks.append(asyncio.create_task(self._worker_loop()))
+        else:
+            logger.info("Write queue disabled for SQLite to avoid concurrency issues")
+
+    async def shutdown(self) -> None:
+        if not self._worker_tasks:
+            return
+        if self._queue is not None:
+            await self._queue.join()
+        for t in self._worker_tasks:
+            if not t.done():
+                t.cancel()
+        for t in self._worker_tasks:
+            try:
+                await t
+            except asyncio.CancelledError:
+                pass
+        self._worker_tasks.clear()
+
+    async def flush(self) -> None:
+        """Wait for all queued writes to complete. Useful for testing."""
+        if self.enable_write_queue and self._queue is not None:
+            await self._queue.join()
+
     async def store_response_object(
         self, response_object: OpenAIResponseObject, input: list[OpenAIResponseInput]
     ) -> None:
+        if self.enable_write_queue:
+            if self._queue is None:
+                raise ValueError("Responses store is not initialized")
+            try:
+                self._queue.put_nowait((response_object, input))
+            except asyncio.QueueFull:
+                logger.warning(f"Write queue full; adding response id={getattr(response_object, 'id', '<unknown>')}")
+                await self._queue.put((response_object, input))
+        else:
+            await self._write_response_object(response_object, input)
+
+    async def _worker_loop(self) -> None:
+        assert self._queue is not None
+        while True:
+            try:
+                item = await self._queue.get()
+            except asyncio.CancelledError:
+                break
+            response_object, input = item
+            try:
+                await self._write_response_object(response_object, input)
+            except Exception as e:  # noqa: BLE001
+                logger.error(f"Error writing response object: {e}")
+            finally:
+                self._queue.task_done()
+
+    async def _write_response_object(
+        self, response_object: OpenAIResponseObject, input: list[OpenAIResponseInput]
+    ) -> None:
+        if self.sql_store is None:
+            raise ValueError("Responses store is not initialized")
+
         data = response_object.model_dump()
         data["input"] = [input_item.model_dump() for input_item in input]
 
diff --git a/tests/unit/providers/agents/meta_reference/test_openai_responses.py b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
index a964bc219..38ce365c1 100644
--- a/tests/unit/providers/agents/meta_reference/test_openai_responses.py
+++ b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
@@ -42,10 +42,12 @@ from llama_stack.apis.inference import (
 )
 from llama_stack.apis.tools.tools import Tool, ToolGroups, ToolInvocationResult, ToolParameter, ToolRuntime
 from llama_stack.core.access_control.access_control import default_policy
+from llama_stack.core.datatypes import ResponsesStoreConfig
 from llama_stack.providers.inline.agents.meta_reference.responses.openai_responses import (
     OpenAIResponsesImpl,
 )
 from llama_stack.providers.utils.responses.responses_store import ResponsesStore
+from llama_stack.providers.utils.sqlstore.sqlstore import SqliteSqlStoreConfig
 from tests.unit.providers.agents.meta_reference.fixtures import load_chat_completion_fixture
 
 
@@ -677,7 +679,9 @@ async def test_responses_store_list_input_items_logic():
 
     # Create mock store and response store
     mock_sql_store = AsyncMock()
-    responses_store = ResponsesStore(sql_store_config=None, policy=default_policy())
+    responses_store = ResponsesStore(
+        ResponsesStoreConfig(sql_store_config=SqliteSqlStoreConfig(db_path="mock_db_path")), policy=default_policy()
+    )
     responses_store.sql_store = mock_sql_store
 
     # Setup test data - multiple input items
diff --git a/tests/unit/utils/responses/test_responses_store.py b/tests/unit/utils/responses/test_responses_store.py
index 44d4b30da..4e5256c1b 100644
--- a/tests/unit/utils/responses/test_responses_store.py
+++ b/tests/unit/utils/responses/test_responses_store.py
@@ -67,6 +67,9 @@ async def test_responses_store_pagination_basic():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test 1: First page with limit=2, descending order (default)
         result = await store.list_responses(limit=2, order=Order.desc)
         assert len(result.data) == 2
@@ -110,6 +113,9 @@ async def test_responses_store_pagination_ascending():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test ascending order pagination
         result = await store.list_responses(limit=1, order=Order.asc)
         assert len(result.data) == 1
@@ -145,6 +151,9 @@ async def test_responses_store_pagination_with_model_filter():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test pagination with model filter
         result = await store.list_responses(limit=1, model="model-a", order=Order.desc)
         assert len(result.data) == 1
@@ -192,6 +201,9 @@ async def test_responses_store_pagination_no_limit():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test without limit (should use default of 50)
         result = await store.list_responses(order=Order.desc)
         assert len(result.data) == 2
@@ -212,6 +224,9 @@ async def test_responses_store_get_response_object():
         input_list = [create_test_response_input("Test input content", "input-test-resp")]
         await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Retrieve the response
         retrieved = await store.get_response_object("test-resp")
         assert retrieved.id == "test-resp"
@@ -242,6 +257,9 @@ async def test_responses_store_input_items_pagination():
         ]
         await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Verify all items are stored correctly with explicit IDs
         all_items = await store.list_response_input_items("test-resp", order=Order.desc)
         assert len(all_items.data) == 5
@@ -319,6 +337,9 @@ async def test_responses_store_input_items_before_pagination():
         ]
         await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test before pagination with descending order
         # In desc order: [Fifth, Fourth, Third, Second, First]
         # before="before-3" should return [Fifth, Fourth]

From c0b6c9d7179dc83da9efeef70a20260a30f64e30 Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Sun, 21 Sep 2025 20:40:25 -0700
Subject: [PATCH 08/33] chore: introduce write queue for response_store

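# What does this PR do?

Introduces an asynchronous write queue for `ResponsesStore`:

* Adds `ResponsesStoreConfig` (`sql_store_config`, `max_write_queue_size`
  defaulting to 10000, `num_writers` defaulting to 4) to
  `llama_stack/core/datatypes.py`.
* `ResponsesStore.__init__` now takes a `ResponsesStoreConfig`; a bare
  `SqlStoreConfig` is still accepted and wrapped for backward compatibility.
  The `AuthorizedSqlStore` is now created in `initialize()`, with the access
  policy passed to it there, rather than in the constructor.
* `store_response_object` enqueues the write on an `asyncio.Queue` that is
  drained by `num_writers` background worker tasks. When the queue is full a
  warning is logged and the caller waits for free space. Errors raised while
  writing are logged by the worker and not propagated to the caller.
* The write queue is disabled for SQLite; in that case writes happen inline.
* Adds `flush()` (wait for queued writes) and `shutdown()` (drain the queue,
  then cancel the workers).

## Test Plan

Unit tests updated:

* `tests/unit/utils/responses/test_responses_store.py` calls
  `await store.flush()` after storing and before reading back.
* `tests/unit/providers/agents/meta_reference/test_openai_responses.py`
  builds `ResponsesStore` from a `ResponsesStoreConfig` wrapping a
  `SqliteSqlStoreConfig`.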
---
 llama_stack/core/datatypes.py                 |   6 ++
 .../utils/responses/responses_store.py        | 101 ++++++++++++++++--
 .../meta_reference/test_openai_responses.py   |   6 +-
 .../utils/responses/test_responses_store.py   |  21 ++++
 4 files changed, 127 insertions(+), 7 deletions(-)

diff --git a/llama_stack/core/datatypes.py b/llama_stack/core/datatypes.py
index b5558c66f..6a297f012 100644
--- a/llama_stack/core/datatypes.py
+++ b/llama_stack/core/datatypes.py
@@ -433,6 +433,12 @@ class InferenceStoreConfig(BaseModel):
     num_writers: int = Field(default=4, description="Number of concurrent background writers")
 
 
+class ResponsesStoreConfig(BaseModel):
+    sql_store_config: SqlStoreConfig
+    max_write_queue_size: int = Field(default=10000, description="Max queued writes for responses store")
+    num_writers: int = Field(default=4, description="Number of concurrent background writers")
+
+
 class StackRunConfig(BaseModel):
     version: int = LLAMA_STACK_RUN_CONFIG_VERSION
 
diff --git a/llama_stack/providers/utils/responses/responses_store.py b/llama_stack/providers/utils/responses/responses_store.py
index 829cd8a62..8dec807a3 100644
--- a/llama_stack/providers/utils/responses/responses_store.py
+++ b/llama_stack/providers/utils/responses/responses_store.py
@@ -3,6 +3,9 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+import asyncio
+from typing import Any
+
 from llama_stack.apis.agents import (
     Order,
 )
@@ -14,24 +17,51 @@ from llama_stack.apis.agents.openai_responses import (
     OpenAIResponseObject,
     OpenAIResponseObjectWithInput,
 )
-from llama_stack.core.datatypes import AccessRule
+from llama_stack.core.datatypes import AccessRule, ResponsesStoreConfig
 from llama_stack.core.utils.config_dirs import RUNTIME_BASE_DIR
+from llama_stack.log import get_logger
 
 from ..sqlstore.api import ColumnDefinition, ColumnType
 from ..sqlstore.authorized_sqlstore import AuthorizedSqlStore
-from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, sqlstore_impl
+from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, SqlStoreType, sqlstore_impl
+
+logger = get_logger(name=__name__, category="responses_store")
 
 
 class ResponsesStore:
-    def __init__(self, sql_store_config: SqlStoreConfig, policy: list[AccessRule]):
-        if not sql_store_config:
-            sql_store_config = SqliteSqlStoreConfig(
+    def __init__(
+        self,
+        config: ResponsesStoreConfig | SqlStoreConfig,
+        policy: list[AccessRule],
+    ):
+        # Handle backward compatibility
+        if not isinstance(config, ResponsesStoreConfig):
+            # Legacy: SqlStoreConfig passed directly as config
+            config = ResponsesStoreConfig(
+                sql_store_config=config,
+            )
+
+        self.config = config
+        self.sql_store_config = config.sql_store_config
+        if not self.sql_store_config:
+            self.sql_store_config = SqliteSqlStoreConfig(
                 db_path=(RUNTIME_BASE_DIR / "sqlstore.db").as_posix(),
             )
-        self.sql_store = AuthorizedSqlStore(sqlstore_impl(sql_store_config), policy)
+        self.sql_store = None
+        self.policy = policy
+
+        # Disable write queue for SQLite to avoid concurrency issues
+        self.enable_write_queue = self.sql_store_config.type != SqlStoreType.sqlite
+
+        # Async write queue and worker control
+        self._queue: asyncio.Queue[tuple[OpenAIResponseObject, list[OpenAIResponseInput]]] | None = None
+        self._worker_tasks: list[asyncio.Task[Any]] = []
+        self._max_write_queue_size: int = config.max_write_queue_size
+        self._num_writers: int = max(1, config.num_writers)
 
     async def initialize(self):
         """Create the necessary tables if they don't exist."""
+        self.sql_store = AuthorizedSqlStore(sqlstore_impl(self.sql_store_config), self.policy)
         await self.sql_store.create_table(
             "openai_responses",
             {
@@ -42,9 +72,68 @@ class ResponsesStore:
             },
         )
 
+        if self.enable_write_queue:
+            self._queue = asyncio.Queue(maxsize=self._max_write_queue_size)
+            for _ in range(self._num_writers):
+                self._worker_tasks.append(asyncio.create_task(self._worker_loop()))
+        else:
+            logger.info("Write queue disabled for SQLite to avoid concurrency issues")
+
+    async def shutdown(self) -> None:
+        if not self._worker_tasks:
+            return
+        if self._queue is not None:
+            await self._queue.join()
+        for t in self._worker_tasks:
+            if not t.done():
+                t.cancel()
+        for t in self._worker_tasks:
+            try:
+                await t
+            except asyncio.CancelledError:
+                pass
+        self._worker_tasks.clear()
+
+    async def flush(self) -> None:
+        """Wait for all queued writes to complete. Useful for testing."""
+        if self.enable_write_queue and self._queue is not None:
+            await self._queue.join()
+
     async def store_response_object(
         self, response_object: OpenAIResponseObject, input: list[OpenAIResponseInput]
     ) -> None:
+        if self.enable_write_queue:
+            if self._queue is None:
+                raise ValueError("Responses store is not initialized")
+            try:
+                self._queue.put_nowait((response_object, input))
+            except asyncio.QueueFull:
+                logger.warning(f"Write queue full; adding response id={getattr(response_object, 'id', '<unknown>')}")
+                await self._queue.put((response_object, input))
+        else:
+            await self._write_response_object(response_object, input)
+
+    async def _worker_loop(self) -> None:
+        assert self._queue is not None
+        while True:
+            try:
+                item = await self._queue.get()
+            except asyncio.CancelledError:
+                break
+            response_object, input = item
+            try:
+                await self._write_response_object(response_object, input)
+            except Exception as e:  # noqa: BLE001
+                logger.error(f"Error writing response object: {e}")
+            finally:
+                self._queue.task_done()
+
+    async def _write_response_object(
+        self, response_object: OpenAIResponseObject, input: list[OpenAIResponseInput]
+    ) -> None:
+        if self.sql_store is None:
+            raise ValueError("Responses store is not initialized")
+
         data = response_object.model_dump()
         data["input"] = [input_item.model_dump() for input_item in input]
 
diff --git a/tests/unit/providers/agents/meta_reference/test_openai_responses.py b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
index a964bc219..38ce365c1 100644
--- a/tests/unit/providers/agents/meta_reference/test_openai_responses.py
+++ b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
@@ -42,10 +42,12 @@ from llama_stack.apis.inference import (
 )
 from llama_stack.apis.tools.tools import Tool, ToolGroups, ToolInvocationResult, ToolParameter, ToolRuntime
 from llama_stack.core.access_control.access_control import default_policy
+from llama_stack.core.datatypes import ResponsesStoreConfig
 from llama_stack.providers.inline.agents.meta_reference.responses.openai_responses import (
     OpenAIResponsesImpl,
 )
 from llama_stack.providers.utils.responses.responses_store import ResponsesStore
+from llama_stack.providers.utils.sqlstore.sqlstore import SqliteSqlStoreConfig
 from tests.unit.providers.agents.meta_reference.fixtures import load_chat_completion_fixture
 
 
@@ -677,7 +679,9 @@ async def test_responses_store_list_input_items_logic():
 
     # Create mock store and response store
     mock_sql_store = AsyncMock()
-    responses_store = ResponsesStore(sql_store_config=None, policy=default_policy())
+    responses_store = ResponsesStore(
+        ResponsesStoreConfig(sql_store_config=SqliteSqlStoreConfig(db_path="mock_db_path")), policy=default_policy()
+    )
     responses_store.sql_store = mock_sql_store
 
     # Setup test data - multiple input items
diff --git a/tests/unit/utils/responses/test_responses_store.py b/tests/unit/utils/responses/test_responses_store.py
index 44d4b30da..4e5256c1b 100644
--- a/tests/unit/utils/responses/test_responses_store.py
+++ b/tests/unit/utils/responses/test_responses_store.py
@@ -67,6 +67,9 @@ async def test_responses_store_pagination_basic():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test 1: First page with limit=2, descending order (default)
         result = await store.list_responses(limit=2, order=Order.desc)
         assert len(result.data) == 2
@@ -110,6 +113,9 @@ async def test_responses_store_pagination_ascending():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test ascending order pagination
         result = await store.list_responses(limit=1, order=Order.asc)
         assert len(result.data) == 1
@@ -145,6 +151,9 @@ async def test_responses_store_pagination_with_model_filter():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test pagination with model filter
         result = await store.list_responses(limit=1, model="model-a", order=Order.desc)
         assert len(result.data) == 1
@@ -192,6 +201,9 @@ async def test_responses_store_pagination_no_limit():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test without limit (should use default of 50)
         result = await store.list_responses(order=Order.desc)
         assert len(result.data) == 2
@@ -212,6 +224,9 @@ async def test_responses_store_get_response_object():
         input_list = [create_test_response_input("Test input content", "input-test-resp")]
         await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Retrieve the response
         retrieved = await store.get_response_object("test-resp")
         assert retrieved.id == "test-resp"
@@ -242,6 +257,9 @@ async def test_responses_store_input_items_pagination():
         ]
         await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Verify all items are stored correctly with explicit IDs
         all_items = await store.list_response_input_items("test-resp", order=Order.desc)
         assert len(all_items.data) == 5
@@ -319,6 +337,9 @@ async def test_responses_store_input_items_before_pagination():
         ]
         await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test before pagination with descending order
         # In desc order: [Fifth, Fourth, Third, Second, First]
         # before="before-3" should return [Fifth, Fourth]

From a772f0a42dc165ee9993fc1eaebfc422e4666b85 Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Sun, 21 Sep 2025 20:46:34 -0700
Subject: [PATCH 09/33] chore: introduce write queue for response_store

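# What does this PR do?

Adds `ResponsesStoreConfig` (`sql_store_config`, `max_write_queue_size`,
`num_writers`) and routes `ResponsesStore.store_response_object()` through a
bounded `asyncio.Queue` that is drained by background worker tasks.

- When the queue is full, the call logs a warning and waits for space.
- A write that fails in a worker is logged and the item is dropped.
- The queue is disabled for SQLite; writes there are still awaited inline.
- `ResponsesStore` still accepts a bare `SqlStoreConfig` and wraps it in a
  `ResponsesStoreConfig`.
- The SQL store is now created in `initialize()` instead of `__init__()`, so
  the read/delete methods raise `ValueError` if called before `initialize()`.
- New `flush()` waits for queued writes to finish; new `shutdown()` does the
  same and then cancels the worker tasks.

## Test Plan

The unit tests in `tests/unit/utils/responses/test_responses_store.py` now call
`await store.flush()` before reading back stored responses, and
`test_openai_responses.py` constructs the store from a `ResponsesStoreConfig`.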
---
 llama_stack/core/datatypes.py                 |   6 +
 .../utils/responses/responses_store.py        | 110 +++++++++++++++++-
 .../meta_reference/test_openai_responses.py   |   6 +-
 .../utils/responses/test_responses_store.py   |  21 ++++
 4 files changed, 136 insertions(+), 7 deletions(-)

diff --git a/llama_stack/core/datatypes.py b/llama_stack/core/datatypes.py
index b5558c66f..6a297f012 100644
--- a/llama_stack/core/datatypes.py
+++ b/llama_stack/core/datatypes.py
@@ -433,6 +433,12 @@ class InferenceStoreConfig(BaseModel):
     num_writers: int = Field(default=4, description="Number of concurrent background writers")
 
 
+class ResponsesStoreConfig(BaseModel):
+    sql_store_config: SqlStoreConfig
+    max_write_queue_size: int = Field(default=10000, description="Max queued writes for responses store")
+    num_writers: int = Field(default=4, description="Number of concurrent background writers")
+
+
 class StackRunConfig(BaseModel):
     version: int = LLAMA_STACK_RUN_CONFIG_VERSION
 
diff --git a/llama_stack/providers/utils/responses/responses_store.py b/llama_stack/providers/utils/responses/responses_store.py
index 829cd8a62..b9fceb1ab 100644
--- a/llama_stack/providers/utils/responses/responses_store.py
+++ b/llama_stack/providers/utils/responses/responses_store.py
@@ -3,6 +3,9 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+import asyncio
+from typing import Any
+
 from llama_stack.apis.agents import (
     Order,
 )
@@ -14,24 +17,51 @@ from llama_stack.apis.agents.openai_responses import (
     OpenAIResponseObject,
     OpenAIResponseObjectWithInput,
 )
-from llama_stack.core.datatypes import AccessRule
+from llama_stack.core.datatypes import AccessRule, ResponsesStoreConfig
 from llama_stack.core.utils.config_dirs import RUNTIME_BASE_DIR
+from llama_stack.log import get_logger
 
 from ..sqlstore.api import ColumnDefinition, ColumnType
 from ..sqlstore.authorized_sqlstore import AuthorizedSqlStore
-from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, sqlstore_impl
+from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, SqlStoreType, sqlstore_impl
+
+logger = get_logger(name=__name__, category="responses_store")
 
 
 class ResponsesStore:
-    def __init__(self, sql_store_config: SqlStoreConfig, policy: list[AccessRule]):
-        if not sql_store_config:
-            sql_store_config = SqliteSqlStoreConfig(
+    def __init__(
+        self,
+        config: ResponsesStoreConfig | SqlStoreConfig,
+        policy: list[AccessRule],
+    ):
+        # Handle backward compatibility
+        if not isinstance(config, ResponsesStoreConfig):
+            # Legacy: SqlStoreConfig passed directly as config
+            config = ResponsesStoreConfig(
+                sql_store_config=config,
+            )
+
+        self.config = config
+        self.sql_store_config = config.sql_store_config
+        if not self.sql_store_config:
+            self.sql_store_config = SqliteSqlStoreConfig(
                 db_path=(RUNTIME_BASE_DIR / "sqlstore.db").as_posix(),
             )
-        self.sql_store = AuthorizedSqlStore(sqlstore_impl(sql_store_config), policy)
+        self.sql_store = None
+        self.policy = policy
+
+        # Disable write queue for SQLite to avoid concurrency issues
+        self.enable_write_queue = self.sql_store_config.type != SqlStoreType.sqlite
+
+        # Async write queue and worker control
+        self._queue: asyncio.Queue[tuple[OpenAIResponseObject, list[OpenAIResponseInput]]] | None = None
+        self._worker_tasks: list[asyncio.Task[Any]] = []
+        self._max_write_queue_size: int = config.max_write_queue_size
+        self._num_writers: int = max(1, config.num_writers)
 
     async def initialize(self):
         """Create the necessary tables if they don't exist."""
+        self.sql_store = AuthorizedSqlStore(sqlstore_impl(self.sql_store_config), self.policy)
         await self.sql_store.create_table(
             "openai_responses",
             {
@@ -42,9 +72,68 @@ class ResponsesStore:
             },
         )
 
+        if self.enable_write_queue:
+            self._queue = asyncio.Queue(maxsize=self._max_write_queue_size)
+            for _ in range(self._num_writers):
+                self._worker_tasks.append(asyncio.create_task(self._worker_loop()))
+        else:
+            logger.info("Write queue disabled for SQLite to avoid concurrency issues")
+
+    async def shutdown(self) -> None:
+        if not self._worker_tasks:
+            return
+        if self._queue is not None:
+            await self._queue.join()
+        for t in self._worker_tasks:
+            if not t.done():
+                t.cancel()
+        for t in self._worker_tasks:
+            try:
+                await t
+            except asyncio.CancelledError:
+                pass
+        self._worker_tasks.clear()
+
+    async def flush(self) -> None:
+        """Wait for all queued writes to complete. Useful for testing."""
+        if self.enable_write_queue and self._queue is not None:
+            await self._queue.join()
+
     async def store_response_object(
         self, response_object: OpenAIResponseObject, input: list[OpenAIResponseInput]
     ) -> None:
+        if self.enable_write_queue:
+            if self._queue is None:
+                raise ValueError("Responses store is not initialized")
+            try:
+                self._queue.put_nowait((response_object, input))
+            except asyncio.QueueFull:
+                logger.warning(f"Write queue full; adding response id={getattr(response_object, 'id', '<unknown>')}")
+                await self._queue.put((response_object, input))
+        else:
+            await self._write_response_object(response_object, input)
+
+    async def _worker_loop(self) -> None:
+        assert self._queue is not None
+        while True:
+            try:
+                item = await self._queue.get()
+            except asyncio.CancelledError:
+                break
+            response_object, input = item
+            try:
+                await self._write_response_object(response_object, input)
+            except Exception as e:  # noqa: BLE001
+                logger.error(f"Error writing response object: {e}")
+            finally:
+                self._queue.task_done()
+
+    async def _write_response_object(
+        self, response_object: OpenAIResponseObject, input: list[OpenAIResponseInput]
+    ) -> None:
+        if self.sql_store is None:
+            raise ValueError("Responses store is not initialized")
+
         data = response_object.model_dump()
         data["input"] = [input_item.model_dump() for input_item in input]
 
@@ -73,6 +162,9 @@ class ResponsesStore:
         :param model: The model to filter by.
         :param order: The order to sort the responses by.
         """
+        if not self.sql_store:
+            raise ValueError("Responses store is not initialized")
+
         if not order:
             order = Order.desc
 
@@ -100,6 +192,9 @@ class ResponsesStore:
         """
         Get a response object with automatic access control checking.
         """
+        if not self.sql_store:
+            raise ValueError("Responses store is not initialized")
+
         row = await self.sql_store.fetch_one(
             "openai_responses",
             where={"id": response_id},
@@ -113,6 +208,9 @@ class ResponsesStore:
         return OpenAIResponseObjectWithInput(**row["response_object"])
 
     async def delete_response_object(self, response_id: str) -> OpenAIDeleteResponseObject:
+        if not self.sql_store:
+            raise ValueError("Responses store is not initialized")
+
         row = await self.sql_store.fetch_one("openai_responses", where={"id": response_id})
         if not row:
             raise ValueError(f"Response with id {response_id} not found")
diff --git a/tests/unit/providers/agents/meta_reference/test_openai_responses.py b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
index a964bc219..38ce365c1 100644
--- a/tests/unit/providers/agents/meta_reference/test_openai_responses.py
+++ b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
@@ -42,10 +42,12 @@ from llama_stack.apis.inference import (
 )
 from llama_stack.apis.tools.tools import Tool, ToolGroups, ToolInvocationResult, ToolParameter, ToolRuntime
 from llama_stack.core.access_control.access_control import default_policy
+from llama_stack.core.datatypes import ResponsesStoreConfig
 from llama_stack.providers.inline.agents.meta_reference.responses.openai_responses import (
     OpenAIResponsesImpl,
 )
 from llama_stack.providers.utils.responses.responses_store import ResponsesStore
+from llama_stack.providers.utils.sqlstore.sqlstore import SqliteSqlStoreConfig
 from tests.unit.providers.agents.meta_reference.fixtures import load_chat_completion_fixture
 
 
@@ -677,7 +679,9 @@ async def test_responses_store_list_input_items_logic():
 
     # Create mock store and response store
     mock_sql_store = AsyncMock()
-    responses_store = ResponsesStore(sql_store_config=None, policy=default_policy())
+    responses_store = ResponsesStore(
+        ResponsesStoreConfig(sql_store_config=SqliteSqlStoreConfig(db_path="mock_db_path")), policy=default_policy()
+    )
     responses_store.sql_store = mock_sql_store
 
     # Setup test data - multiple input items
diff --git a/tests/unit/utils/responses/test_responses_store.py b/tests/unit/utils/responses/test_responses_store.py
index 44d4b30da..4e5256c1b 100644
--- a/tests/unit/utils/responses/test_responses_store.py
+++ b/tests/unit/utils/responses/test_responses_store.py
@@ -67,6 +67,9 @@ async def test_responses_store_pagination_basic():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test 1: First page with limit=2, descending order (default)
         result = await store.list_responses(limit=2, order=Order.desc)
         assert len(result.data) == 2
@@ -110,6 +113,9 @@ async def test_responses_store_pagination_ascending():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test ascending order pagination
         result = await store.list_responses(limit=1, order=Order.asc)
         assert len(result.data) == 1
@@ -145,6 +151,9 @@ async def test_responses_store_pagination_with_model_filter():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test pagination with model filter
         result = await store.list_responses(limit=1, model="model-a", order=Order.desc)
         assert len(result.data) == 1
@@ -192,6 +201,9 @@ async def test_responses_store_pagination_no_limit():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test without limit (should use default of 50)
         result = await store.list_responses(order=Order.desc)
         assert len(result.data) == 2
@@ -212,6 +224,9 @@ async def test_responses_store_get_response_object():
         input_list = [create_test_response_input("Test input content", "input-test-resp")]
         await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Retrieve the response
         retrieved = await store.get_response_object("test-resp")
         assert retrieved.id == "test-resp"
@@ -242,6 +257,9 @@ async def test_responses_store_input_items_pagination():
         ]
         await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Verify all items are stored correctly with explicit IDs
         all_items = await store.list_response_input_items("test-resp", order=Order.desc)
         assert len(all_items.data) == 5
@@ -319,6 +337,9 @@ async def test_responses_store_input_items_before_pagination():
         ]
         await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test before pagination with descending order
         # In desc order: [Fifth, Fourth, Third, Second, First]
         # before="before-3" should return [Fifth, Fourth]

From f0211ffb7004e1aec58fd88c85741317d08b46fc Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Mon, 22 Sep 2025 21:25:09 -0700
Subject: [PATCH 10/33] chore: fix build

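# What does this PR do?

Installs `g++` alongside `gcc` in the `apt-get install` step of
`llama_stack/core/build_container.sh`.

## Test Plan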
---
 llama_stack/core/build_container.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama_stack/core/build_container.sh b/llama_stack/core/build_container.sh
index 424b40a9d..29964f324 100755
--- a/llama_stack/core/build_container.sh
+++ b/llama_stack/core/build_container.sh
@@ -164,7 +164,7 @@ RUN apt-get update && apt-get install -y \
        procps psmisc lsof \
        traceroute \
        bubblewrap \
-       gcc \
+       gcc g++ \
        && rm -rf /var/lib/apt/lists/*
 
 ENV UV_SYSTEM_PYTHON=1

From 7650d2c96a122479ebe629c613d6f11fc3e8c9f7 Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Mon, 22 Sep 2025 21:33:14 -0700
Subject: [PATCH 11/33] chore: fix build

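# What does this PR do?

Adds a C++ compiler to both package-install steps in
`llama_stack/core/build_container.sh`: `gcc-c++` in the `dnf install` step and
`g++` in the `apt-get install` step.

## Test Plan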
---
 llama_stack/core/build_container.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llama_stack/core/build_container.sh b/llama_stack/core/build_container.sh
index 424b40a9d..8e47fc592 100755
--- a/llama_stack/core/build_container.sh
+++ b/llama_stack/core/build_container.sh
@@ -147,7 +147,7 @@ WORKDIR /app
 
 RUN dnf -y update && dnf install -y iputils git net-tools wget \
     vim-minimal python3.12 python3.12-pip python3.12-wheel \
-    python3.12-setuptools python3.12-devel gcc make && \
+    python3.12-setuptools python3.12-devel gcc gcc-c++ make && \
     ln -s /bin/pip3.12 /bin/pip && ln -s /bin/python3.12 /bin/python && dnf clean all
 
 ENV UV_SYSTEM_PYTHON=1
@@ -164,7 +164,7 @@ RUN apt-get update && apt-get install -y \
        procps psmisc lsof \
        traceroute \
        bubblewrap \
-       gcc \
+       gcc g++ \
        && rm -rf /var/lib/apt/lists/*
 
 ENV UV_SYSTEM_PYTHON=1

From 88ad5d6d7319667eae1c0a04d8e99349cb98c13b Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Fri, 26 Sep 2025 10:35:11 -0700
Subject: [PATCH 12/33] chore: introduce write queue for response_store

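# What does this PR do?

Adds `ResponsesStoreConfig` (`sql_store_config`, `max_write_queue_size`,
`num_writers`) and routes `ResponsesStore.store_response_object()` through a
bounded `asyncio.Queue` that is drained by background worker tasks.

- When the queue is full, the call logs a warning and waits for space.
- A write that fails in a worker is logged and the item is dropped.
- The queue is disabled for SQLite; writes there are still awaited inline.
- `ResponsesStore` still accepts a bare `SqlStoreConfig` and wraps it in a
  `ResponsesStoreConfig`.
- The SQL store is now created in `initialize()` instead of `__init__()`, so
  the read/delete methods raise `ValueError` if called before `initialize()`.
- New `flush()` waits for queued writes to finish; new `shutdown()` does the
  same and then cancels the worker tasks.

## Test Plan

The unit tests in `tests/unit/utils/responses/test_responses_store.py` now call
`await store.flush()` before reading back stored responses, and
`test_openai_responses.py` constructs the store from a `ResponsesStoreConfig`.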
---
 llama_stack/core/datatypes.py                 |   6 +
 .../utils/responses/responses_store.py        | 110 +++++++++++++++++-
 .../meta_reference/test_openai_responses.py   |   6 +-
 .../utils/responses/test_responses_store.py   |  21 ++++
 4 files changed, 136 insertions(+), 7 deletions(-)

diff --git a/llama_stack/core/datatypes.py b/llama_stack/core/datatypes.py
index b5558c66f..6a297f012 100644
--- a/llama_stack/core/datatypes.py
+++ b/llama_stack/core/datatypes.py
@@ -433,6 +433,12 @@ class InferenceStoreConfig(BaseModel):
     num_writers: int = Field(default=4, description="Number of concurrent background writers")
 
 
+class ResponsesStoreConfig(BaseModel):
+    sql_store_config: SqlStoreConfig
+    max_write_queue_size: int = Field(default=10000, description="Max queued writes for responses store")
+    num_writers: int = Field(default=4, description="Number of concurrent background writers")
+
+
 class StackRunConfig(BaseModel):
     version: int = LLAMA_STACK_RUN_CONFIG_VERSION
 
diff --git a/llama_stack/providers/utils/responses/responses_store.py b/llama_stack/providers/utils/responses/responses_store.py
index 829cd8a62..b9fceb1ab 100644
--- a/llama_stack/providers/utils/responses/responses_store.py
+++ b/llama_stack/providers/utils/responses/responses_store.py
@@ -3,6 +3,9 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+import asyncio
+from typing import Any
+
 from llama_stack.apis.agents import (
     Order,
 )
@@ -14,24 +17,51 @@ from llama_stack.apis.agents.openai_responses import (
     OpenAIResponseObject,
     OpenAIResponseObjectWithInput,
 )
-from llama_stack.core.datatypes import AccessRule
+from llama_stack.core.datatypes import AccessRule, ResponsesStoreConfig
 from llama_stack.core.utils.config_dirs import RUNTIME_BASE_DIR
+from llama_stack.log import get_logger
 
 from ..sqlstore.api import ColumnDefinition, ColumnType
 from ..sqlstore.authorized_sqlstore import AuthorizedSqlStore
-from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, sqlstore_impl
+from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, SqlStoreType, sqlstore_impl
+
+logger = get_logger(name=__name__, category="responses_store")
 
 
 class ResponsesStore:
-    def __init__(self, sql_store_config: SqlStoreConfig, policy: list[AccessRule]):
-        if not sql_store_config:
-            sql_store_config = SqliteSqlStoreConfig(
+    def __init__(
+        self,
+        config: ResponsesStoreConfig | SqlStoreConfig,
+        policy: list[AccessRule],
+    ):
+        # Handle backward compatibility
+        if not isinstance(config, ResponsesStoreConfig):
+            # Legacy: SqlStoreConfig passed directly as config
+            config = ResponsesStoreConfig(
+                sql_store_config=config,
+            )
+
+        self.config = config
+        self.sql_store_config = config.sql_store_config
+        if not self.sql_store_config:
+            self.sql_store_config = SqliteSqlStoreConfig(
                 db_path=(RUNTIME_BASE_DIR / "sqlstore.db").as_posix(),
             )
-        self.sql_store = AuthorizedSqlStore(sqlstore_impl(sql_store_config), policy)
+        self.sql_store = None
+        self.policy = policy
+
+        # Disable write queue for SQLite to avoid concurrency issues
+        self.enable_write_queue = self.sql_store_config.type != SqlStoreType.sqlite
+
+        # Async write queue and worker control
+        self._queue: asyncio.Queue[tuple[OpenAIResponseObject, list[OpenAIResponseInput]]] | None = None
+        self._worker_tasks: list[asyncio.Task[Any]] = []
+        self._max_write_queue_size: int = config.max_write_queue_size
+        self._num_writers: int = max(1, config.num_writers)
 
     async def initialize(self):
         """Create the necessary tables if they don't exist."""
+        self.sql_store = AuthorizedSqlStore(sqlstore_impl(self.sql_store_config), self.policy)
         await self.sql_store.create_table(
             "openai_responses",
             {
@@ -42,9 +72,68 @@ class ResponsesStore:
             },
         )
 
+        if self.enable_write_queue:
+            self._queue = asyncio.Queue(maxsize=self._max_write_queue_size)
+            for _ in range(self._num_writers):
+                self._worker_tasks.append(asyncio.create_task(self._worker_loop()))
+        else:
+            logger.info("Write queue disabled for SQLite to avoid concurrency issues")
+
+    async def shutdown(self) -> None:
+        if not self._worker_tasks:
+            return
+        if self._queue is not None:
+            await self._queue.join()
+        for t in self._worker_tasks:
+            if not t.done():
+                t.cancel()
+        for t in self._worker_tasks:
+            try:
+                await t
+            except asyncio.CancelledError:
+                pass
+        self._worker_tasks.clear()
+
+    async def flush(self) -> None:
+        """Wait for all queued writes to complete. Useful for testing."""
+        if self.enable_write_queue and self._queue is not None:
+            await self._queue.join()
+
     async def store_response_object(
         self, response_object: OpenAIResponseObject, input: list[OpenAIResponseInput]
     ) -> None:
+        if self.enable_write_queue:
+            if self._queue is None:
+                raise ValueError("Responses store is not initialized")
+            try:
+                self._queue.put_nowait((response_object, input))
+            except asyncio.QueueFull:
+                logger.warning(f"Write queue full; adding response id={getattr(response_object, 'id', '<unknown>')}")
+                await self._queue.put((response_object, input))
+        else:
+            await self._write_response_object(response_object, input)
+
+    async def _worker_loop(self) -> None:
+        assert self._queue is not None
+        while True:
+            try:
+                item = await self._queue.get()
+            except asyncio.CancelledError:
+                break
+            response_object, input = item
+            try:
+                await self._write_response_object(response_object, input)
+            except Exception as e:  # noqa: BLE001
+                logger.error(f"Error writing response object: {e}")
+            finally:
+                self._queue.task_done()
+
+    async def _write_response_object(
+        self, response_object: OpenAIResponseObject, input: list[OpenAIResponseInput]
+    ) -> None:
+        if self.sql_store is None:
+            raise ValueError("Responses store is not initialized")
+
         data = response_object.model_dump()
         data["input"] = [input_item.model_dump() for input_item in input]
 
@@ -73,6 +162,9 @@ class ResponsesStore:
         :param model: The model to filter by.
         :param order: The order to sort the responses by.
         """
+        if not self.sql_store:
+            raise ValueError("Responses store is not initialized")
+
         if not order:
             order = Order.desc
 
@@ -100,6 +192,9 @@ class ResponsesStore:
         """
         Get a response object with automatic access control checking.
         """
+        if not self.sql_store:
+            raise ValueError("Responses store is not initialized")
+
         row = await self.sql_store.fetch_one(
             "openai_responses",
             where={"id": response_id},
@@ -113,6 +208,9 @@ class ResponsesStore:
         return OpenAIResponseObjectWithInput(**row["response_object"])
 
     async def delete_response_object(self, response_id: str) -> OpenAIDeleteResponseObject:
+        if not self.sql_store:
+            raise ValueError("Responses store is not initialized")
+
         row = await self.sql_store.fetch_one("openai_responses", where={"id": response_id})
         if not row:
             raise ValueError(f"Response with id {response_id} not found")
diff --git a/tests/unit/providers/agents/meta_reference/test_openai_responses.py b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
index a964bc219..38ce365c1 100644
--- a/tests/unit/providers/agents/meta_reference/test_openai_responses.py
+++ b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
@@ -42,10 +42,12 @@ from llama_stack.apis.inference import (
 )
 from llama_stack.apis.tools.tools import Tool, ToolGroups, ToolInvocationResult, ToolParameter, ToolRuntime
 from llama_stack.core.access_control.access_control import default_policy
+from llama_stack.core.datatypes import ResponsesStoreConfig
 from llama_stack.providers.inline.agents.meta_reference.responses.openai_responses import (
     OpenAIResponsesImpl,
 )
 from llama_stack.providers.utils.responses.responses_store import ResponsesStore
+from llama_stack.providers.utils.sqlstore.sqlstore import SqliteSqlStoreConfig
 from tests.unit.providers.agents.meta_reference.fixtures import load_chat_completion_fixture
 
 
@@ -677,7 +679,9 @@ async def test_responses_store_list_input_items_logic():
 
     # Create mock store and response store
     mock_sql_store = AsyncMock()
-    responses_store = ResponsesStore(sql_store_config=None, policy=default_policy())
+    responses_store = ResponsesStore(
+        ResponsesStoreConfig(sql_store_config=SqliteSqlStoreConfig(db_path="mock_db_path")), policy=default_policy()
+    )
     responses_store.sql_store = mock_sql_store
 
     # Setup test data - multiple input items
diff --git a/tests/unit/utils/responses/test_responses_store.py b/tests/unit/utils/responses/test_responses_store.py
index 44d4b30da..4e5256c1b 100644
--- a/tests/unit/utils/responses/test_responses_store.py
+++ b/tests/unit/utils/responses/test_responses_store.py
@@ -67,6 +67,9 @@ async def test_responses_store_pagination_basic():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test 1: First page with limit=2, descending order (default)
         result = await store.list_responses(limit=2, order=Order.desc)
         assert len(result.data) == 2
@@ -110,6 +113,9 @@ async def test_responses_store_pagination_ascending():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test ascending order pagination
         result = await store.list_responses(limit=1, order=Order.asc)
         assert len(result.data) == 1
@@ -145,6 +151,9 @@ async def test_responses_store_pagination_with_model_filter():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test pagination with model filter
         result = await store.list_responses(limit=1, model="model-a", order=Order.desc)
         assert len(result.data) == 1
@@ -192,6 +201,9 @@ async def test_responses_store_pagination_no_limit():
             input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
             await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test without limit (should use default of 50)
         result = await store.list_responses(order=Order.desc)
         assert len(result.data) == 2
@@ -212,6 +224,9 @@ async def test_responses_store_get_response_object():
         input_list = [create_test_response_input("Test input content", "input-test-resp")]
         await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Retrieve the response
         retrieved = await store.get_response_object("test-resp")
         assert retrieved.id == "test-resp"
@@ -242,6 +257,9 @@ async def test_responses_store_input_items_pagination():
         ]
         await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Verify all items are stored correctly with explicit IDs
         all_items = await store.list_response_input_items("test-resp", order=Order.desc)
         assert len(all_items.data) == 5
@@ -319,6 +337,9 @@ async def test_responses_store_input_items_before_pagination():
         ]
         await store.store_response_object(response, input_list)
 
+        # Wait for all queued writes to complete
+        await store.flush()
+
         # Test before pagination with descending order
         # In desc order: [Fifth, Fourth, Third, Second, First]
         # before="before-3" should return [Fifth, Fourth]

From 7004ac27b51c7b59effe74fcfc1ec7a65db9573d Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Fri, 26 Sep 2025 14:44:17 -0700
Subject: [PATCH 13/33] chore: remove extra logging

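# What does this PR do?

Removes the debug-level console log of every metric event from
`TelemetryAdapter._log_metric`. The code that adds the metric as an event
to the current span is unchanged.

## Test Plan

No tests added; the change only deletes a guarded `logger.debug` call.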
---
 .../providers/inline/telemetry/meta_reference/telemetry.py    | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py
index 9224c3792..2a4032543 100644
--- a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py
+++ b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py
@@ -224,10 +224,6 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
         return _GLOBAL_STORAGE["gauges"][name]
 
     def _log_metric(self, event: MetricEvent) -> None:
-        # Always log to console if console sink is enabled (debug)
-        if TelemetrySink.CONSOLE in self.config.sinks:
-            logger.debug(f"METRIC: {event.metric}={event.value} {event.unit} {event.attributes}")
-
         # Add metric as an event to the current span
         try:
             with self._lock:

From b1cbfe99f96fae717593fe67d878804eea40d175 Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Mon, 29 Sep 2025 15:52:44 -0700
Subject: [PATCH 14/33] fix: mcp tool with array type should include items

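# What does this PR do?

Passes `items` through when `StreamingResponseOrchestrator` builds a
`ToolParamDefinition` from a tool parameter, so array-typed parameters keep
their item schema when the tool is converted to the OpenAI tool format.

## Test Plan

Adds unit tests in
tests/unit/providers/agents/meta_reference/responses/test_streaming.py
covering an array parameter with a string item schema, an array parameter
with an object item schema, and a non-array parameter.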
---
 .../meta_reference/responses/streaming.py     |   1 +
 .../meta_reference/responses/__init__.py      |   5 +
 .../responses/test_streaming.py               | 147 ++++++++++++++++++
 3 files changed, 153 insertions(+)
 create mode 100644 tests/unit/providers/agents/meta_reference/responses/__init__.py
 create mode 100644 tests/unit/providers/agents/meta_reference/responses/test_streaming.py

diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
index 3e69fa5cd..b6ffb1471 100644
--- a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
+++ b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
@@ -568,6 +568,7 @@ class StreamingResponseOrchestrator:
                                 description=param.description,
                                 required=param.required,
                                 default=param.default,
+                                items=param.items,
                             )
                             for param in t.parameters
                         },
diff --git a/tests/unit/providers/agents/meta_reference/responses/__init__.py b/tests/unit/providers/agents/meta_reference/responses/__init__.py
new file mode 100644
index 000000000..6f3c1df03
--- /dev/null
+++ b/tests/unit/providers/agents/meta_reference/responses/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
\ No newline at end of file
diff --git a/tests/unit/providers/agents/meta_reference/responses/test_streaming.py b/tests/unit/providers/agents/meta_reference/responses/test_streaming.py
new file mode 100644
index 000000000..5807dd17e
--- /dev/null
+++ b/tests/unit/providers/agents/meta_reference/responses/test_streaming.py
@@ -0,0 +1,147 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""
+Unit tests for MCP tool parameter conversion in streaming responses.
+
+This tests the fix for handling array-type parameters with 'items' field
+when converting MCP tool definitions to OpenAI format.
+"""
+
+from llama_stack.apis.tools import ToolDef, ToolParameter
+from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
+from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool
+
+
+def test_mcp_tool_conversion_with_array_items():
+    """
+    Test that MCP tool parameters with array type and items field are properly converted.
+
+    This is a regression test for the bug where array parameters without 'items'
+    caused OpenAI API validation errors like:
+    "Invalid schema for function 'pods_exec': In context=('properties', 'command'),
+    array schema missing items."
+    """
+    # Create a tool parameter with array type and items specification
+    # This mimics what kubernetes-mcp-server's pods_exec tool has
+    tool_param = ToolParameter(
+        name="command",
+        parameter_type="array",
+        description="Command to execute in the pod",
+        required=True,
+        items={"type": "string"},  # This is the crucial field
+    )
+
+    # Convert to ToolDefinition format (as done in streaming.py)
+    tool_def = ToolDefinition(
+        tool_name="test_tool",
+        description="Test tool with array parameter",
+        parameters={
+            "command": ToolParamDefinition(
+                param_type=tool_param.parameter_type,
+                description=tool_param.description,
+                required=tool_param.required,
+                default=tool_param.default,
+                items=tool_param.items,  # The fix: ensure items is passed through
+            )
+        },
+    )
+
+    # Convert to OpenAI format
+    openai_tool = convert_tooldef_to_openai_tool(tool_def)
+
+    # Verify the conversion includes the items field
+    assert openai_tool["type"] == "function"
+    assert openai_tool["function"]["name"] == "test_tool"
+    assert "parameters" in openai_tool["function"]
+
+    parameters = openai_tool["function"]["parameters"]
+    assert "properties" in parameters
+    assert "command" in parameters["properties"]
+
+    command_param = parameters["properties"]["command"]
+    assert command_param["type"] == "array"
+    assert "items" in command_param, "Array parameter must have 'items' field for OpenAI API"
+    assert command_param["items"] == {"type": "string"}
+
+
+def test_mcp_tool_conversion_without_array():
+    """Test that non-array parameters work correctly without items field."""
+    tool_param = ToolParameter(
+        name="name",
+        parameter_type="string",
+        description="Name parameter",
+        required=True,
+    )
+
+    tool_def = ToolDefinition(
+        tool_name="test_tool",
+        description="Test tool with string parameter",
+        parameters={
+            "name": ToolParamDefinition(
+                param_type=tool_param.parameter_type,
+                description=tool_param.description,
+                required=tool_param.required,
+                items=tool_param.items,  # Will be None for non-array types
+            )
+        },
+    )
+
+    openai_tool = convert_tooldef_to_openai_tool(tool_def)
+
+    # Verify basic structure
+    assert openai_tool["type"] == "function"
+    parameters = openai_tool["function"]["parameters"]
+    assert "name" in parameters["properties"]
+
+    name_param = parameters["properties"]["name"]
+    assert name_param["type"] == "string"
+    # items should not be present for non-array types
+    assert "items" not in name_param or name_param.get("items") is None
+
+
+def test_mcp_tool_conversion_complex_array_items():
+    """Test array parameter with complex items schema (object type)."""
+    tool_param = ToolParameter(
+        name="configs",
+        parameter_type="array",
+        description="Array of configuration objects",
+        required=False,
+        items={
+            "type": "object",
+            "properties": {
+                "key": {"type": "string"},
+                "value": {"type": "string"},
+            },
+            "required": ["key"],
+        },
+    )
+
+    tool_def = ToolDefinition(
+        tool_name="test_tool",
+        description="Test tool with complex array parameter",
+        parameters={
+            "configs": ToolParamDefinition(
+                param_type=tool_param.parameter_type,
+                description=tool_param.description,
+                required=tool_param.required,
+                items=tool_param.items,
+            )
+        },
+    )
+
+    openai_tool = convert_tooldef_to_openai_tool(tool_def)
+
+    # Verify complex items schema is preserved
+    parameters = openai_tool["function"]["parameters"]
+    configs_param = parameters["properties"]["configs"]
+
+    assert configs_param["type"] == "array"
+    assert "items" in configs_param
+    assert configs_param["items"]["type"] == "object"
+    assert "properties" in configs_param["items"]
+    assert "key" in configs_param["items"]["properties"]
+    assert "value" in configs_param["items"]["properties"]
\ No newline at end of file

From cd1f6410ceb1f90e80f490fffab46b710b5d74b9 Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Mon, 29 Sep 2025 15:53:13 -0700
Subject: [PATCH 15/33] fix: mcp tool with array type should include items

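# What does this PR do?

Passes `items` through when `StreamingResponseOrchestrator` builds a
`ToolParamDefinition` from a tool parameter, so array-typed parameters keep
their item schema when the tool is converted to the OpenAI tool format.

## Test Plan

Adds unit tests in
tests/unit/providers/agents/meta_reference/responses/test_streaming.py
covering an array parameter with a string item schema, an array parameter
with an object item schema, and a non-array parameter.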
---
 .../meta_reference/responses/streaming.py     |   1 +
 .../meta_reference/responses/__init__.py      |   5 +
 .../responses/test_streaming.py               | 147 ++++++++++++++++++
 3 files changed, 153 insertions(+)
 create mode 100644 tests/unit/providers/agents/meta_reference/responses/__init__.py
 create mode 100644 tests/unit/providers/agents/meta_reference/responses/test_streaming.py

diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
index 3e69fa5cd..b6ffb1471 100644
--- a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
+++ b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
@@ -568,6 +568,7 @@ class StreamingResponseOrchestrator:
                                 description=param.description,
                                 required=param.required,
                                 default=param.default,
+                                items=param.items,
                             )
                             for param in t.parameters
                         },
diff --git a/tests/unit/providers/agents/meta_reference/responses/__init__.py b/tests/unit/providers/agents/meta_reference/responses/__init__.py
new file mode 100644
index 000000000..6f3c1df03
--- /dev/null
+++ b/tests/unit/providers/agents/meta_reference/responses/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
\ No newline at end of file
diff --git a/tests/unit/providers/agents/meta_reference/responses/test_streaming.py b/tests/unit/providers/agents/meta_reference/responses/test_streaming.py
new file mode 100644
index 000000000..f4bba613e
--- /dev/null
+++ b/tests/unit/providers/agents/meta_reference/responses/test_streaming.py
@@ -0,0 +1,147 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""
+Unit tests for MCP tool parameter conversion in streaming responses.
+
+This tests the fix for handling array-type parameters with 'items' field
+when converting MCP tool definitions to OpenAI format.
+"""
+
+from llama_stack.apis.tools import ToolParameter
+from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
+from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool
+
+
+def test_mcp_tool_conversion_with_array_items():
+    """
+    Test that MCP tool parameters with array type and items field are properly converted.
+
+    This is a regression test for the bug where array parameters without 'items'
+    caused OpenAI API validation errors like:
+    "Invalid schema for function 'pods_exec': In context=('properties', 'command'),
+    array schema missing items."
+    """
+    # Create a tool parameter with array type and items specification
+    # This mimics what kubernetes-mcp-server's pods_exec tool has
+    tool_param = ToolParameter(
+        name="command",
+        parameter_type="array",
+        description="Command to execute in the pod",
+        required=True,
+        items={"type": "string"},  # This is the crucial field
+    )
+
+    # Convert to ToolDefinition format (as done in streaming.py)
+    tool_def = ToolDefinition(
+        tool_name="test_tool",
+        description="Test tool with array parameter",
+        parameters={
+            "command": ToolParamDefinition(
+                param_type=tool_param.parameter_type,
+                description=tool_param.description,
+                required=tool_param.required,
+                default=tool_param.default,
+                items=tool_param.items,  # The fix: ensure items is passed through
+            )
+        },
+    )
+
+    # Convert to OpenAI format
+    openai_tool = convert_tooldef_to_openai_tool(tool_def)
+
+    # Verify the conversion includes the items field
+    assert openai_tool["type"] == "function"
+    assert openai_tool["function"]["name"] == "test_tool"
+    assert "parameters" in openai_tool["function"]
+
+    parameters = openai_tool["function"]["parameters"]
+    assert "properties" in parameters
+    assert "command" in parameters["properties"]
+
+    command_param = parameters["properties"]["command"]
+    assert command_param["type"] == "array"
+    assert "items" in command_param, "Array parameter must have 'items' field for OpenAI API"
+    assert command_param["items"] == {"type": "string"}
+
+
+def test_mcp_tool_conversion_without_array():
+    """Test that non-array parameters work correctly without items field."""
+    tool_param = ToolParameter(
+        name="name",
+        parameter_type="string",
+        description="Name parameter",
+        required=True,
+    )
+
+    tool_def = ToolDefinition(
+        tool_name="test_tool",
+        description="Test tool with string parameter",
+        parameters={
+            "name": ToolParamDefinition(
+                param_type=tool_param.parameter_type,
+                description=tool_param.description,
+                required=tool_param.required,
+                items=tool_param.items,  # Will be None for non-array types
+            )
+        },
+    )
+
+    openai_tool = convert_tooldef_to_openai_tool(tool_def)
+
+    # Verify basic structure
+    assert openai_tool["type"] == "function"
+    parameters = openai_tool["function"]["parameters"]
+    assert "name" in parameters["properties"]
+
+    name_param = parameters["properties"]["name"]
+    assert name_param["type"] == "string"
+    # items should not be present for non-array types
+    assert "items" not in name_param or name_param.get("items") is None
+
+
+def test_mcp_tool_conversion_complex_array_items():
+    """Test array parameter with complex items schema (object type)."""
+    tool_param = ToolParameter(
+        name="configs",
+        parameter_type="array",
+        description="Array of configuration objects",
+        required=False,
+        items={
+            "type": "object",
+            "properties": {
+                "key": {"type": "string"},
+                "value": {"type": "string"},
+            },
+            "required": ["key"],
+        },
+    )
+
+    tool_def = ToolDefinition(
+        tool_name="test_tool",
+        description="Test tool with complex array parameter",
+        parameters={
+            "configs": ToolParamDefinition(
+                param_type=tool_param.parameter_type,
+                description=tool_param.description,
+                required=tool_param.required,
+                items=tool_param.items,
+            )
+        },
+    )
+
+    openai_tool = convert_tooldef_to_openai_tool(tool_def)
+
+    # Verify complex items schema is preserved
+    parameters = openai_tool["function"]["parameters"]
+    configs_param = parameters["properties"]["configs"]
+
+    assert configs_param["type"] == "array"
+    assert "items" in configs_param
+    assert configs_param["items"]["type"] == "object"
+    assert "properties" in configs_param["items"]
+    assert "key" in configs_param["items"]["properties"]
+    assert "value" in configs_param["items"]["properties"]

From 1b308fd87237ee3bc45431c4f7d28d36551a0d95 Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Mon, 29 Sep 2025 15:53:13 -0700
Subject: [PATCH 16/33] fix: mcp tool with array type should include items

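# What does this PR do?

Passes `items` through when `StreamingResponseOrchestrator` builds a
`ToolParamDefinition` from a tool parameter, so array-typed parameters keep
their item schema when the tool is converted to the OpenAI tool format.

## Test Plan

Adds unit tests in
tests/unit/providers/agents/meta_reference/responses/test_streaming.py
covering an array parameter with a string item schema, an array parameter
with an object item schema, and a non-array parameter.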
---
 .../meta_reference/responses/streaming.py     |   1 +
 .../meta_reference/responses/__init__.py      |   5 +
 .../responses/test_streaming.py               | 147 ++++++++++++++++++
 3 files changed, 153 insertions(+)
 create mode 100644 tests/unit/providers/agents/meta_reference/responses/__init__.py
 create mode 100644 tests/unit/providers/agents/meta_reference/responses/test_streaming.py

diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
index 3e69fa5cd..b6ffb1471 100644
--- a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
+++ b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
@@ -568,6 +568,7 @@ class StreamingResponseOrchestrator:
                                 description=param.description,
                                 required=param.required,
                                 default=param.default,
+                                items=param.items,
                             )
                             for param in t.parameters
                         },
diff --git a/tests/unit/providers/agents/meta_reference/responses/__init__.py b/tests/unit/providers/agents/meta_reference/responses/__init__.py
new file mode 100644
index 000000000..756f351d8
--- /dev/null
+++ b/tests/unit/providers/agents/meta_reference/responses/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
diff --git a/tests/unit/providers/agents/meta_reference/responses/test_streaming.py b/tests/unit/providers/agents/meta_reference/responses/test_streaming.py
new file mode 100644
index 000000000..f4bba613e
--- /dev/null
+++ b/tests/unit/providers/agents/meta_reference/responses/test_streaming.py
@@ -0,0 +1,147 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""
+Unit tests for MCP tool parameter conversion in streaming responses.
+
+This tests the fix for handling array-type parameters with 'items' field
+when converting MCP tool definitions to OpenAI format.
+"""
+
+from llama_stack.apis.tools import ToolParameter
+from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
+from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool
+
+
+def test_mcp_tool_conversion_with_array_items():
+    """
+    Test that MCP tool parameters with array type and items field are properly converted.
+
+    This is a regression test for the bug where array parameters without 'items'
+    caused OpenAI API validation errors like:
+    "Invalid schema for function 'pods_exec': In context=('properties', 'command'),
+    array schema missing items."
+    """
+    # Create a tool parameter with array type and items specification
+    # This mimics what kubernetes-mcp-server's pods_exec tool has
+    tool_param = ToolParameter(
+        name="command",
+        parameter_type="array",
+        description="Command to execute in the pod",
+        required=True,
+        items={"type": "string"},  # This is the crucial field
+    )
+
+    # Convert to ToolDefinition format (as done in streaming.py)
+    tool_def = ToolDefinition(
+        tool_name="test_tool",
+        description="Test tool with array parameter",
+        parameters={
+            "command": ToolParamDefinition(
+                param_type=tool_param.parameter_type,
+                description=tool_param.description,
+                required=tool_param.required,
+                default=tool_param.default,
+                items=tool_param.items,  # The fix: ensure items is passed through
+            )
+        },
+    )
+
+    # Convert to OpenAI format
+    openai_tool = convert_tooldef_to_openai_tool(tool_def)
+
+    # Verify the conversion includes the items field
+    assert openai_tool["type"] == "function"
+    assert openai_tool["function"]["name"] == "test_tool"
+    assert "parameters" in openai_tool["function"]
+
+    parameters = openai_tool["function"]["parameters"]
+    assert "properties" in parameters
+    assert "command" in parameters["properties"]
+
+    command_param = parameters["properties"]["command"]
+    assert command_param["type"] == "array"
+    assert "items" in command_param, "Array parameter must have 'items' field for OpenAI API"
+    assert command_param["items"] == {"type": "string"}
+
+
+def test_mcp_tool_conversion_without_array():
+    """Test that non-array parameters work correctly without items field."""
+    tool_param = ToolParameter(
+        name="name",
+        parameter_type="string",
+        description="Name parameter",
+        required=True,
+    )
+
+    tool_def = ToolDefinition(
+        tool_name="test_tool",
+        description="Test tool with string parameter",
+        parameters={
+            "name": ToolParamDefinition(
+                param_type=tool_param.parameter_type,
+                description=tool_param.description,
+                required=tool_param.required,
+                items=tool_param.items,  # Will be None for non-array types
+            )
+        },
+    )
+
+    openai_tool = convert_tooldef_to_openai_tool(tool_def)
+
+    # Verify basic structure
+    assert openai_tool["type"] == "function"
+    parameters = openai_tool["function"]["parameters"]
+    assert "name" in parameters["properties"]
+
+    name_param = parameters["properties"]["name"]
+    assert name_param["type"] == "string"
+    # items should not be present for non-array types
+    assert "items" not in name_param or name_param.get("items") is None
+
+
+def test_mcp_tool_conversion_complex_array_items():
+    """Test array parameter with complex items schema (object type)."""
+    tool_param = ToolParameter(
+        name="configs",
+        parameter_type="array",
+        description="Array of configuration objects",
+        required=False,
+        items={
+            "type": "object",
+            "properties": {
+                "key": {"type": "string"},
+                "value": {"type": "string"},
+            },
+            "required": ["key"],
+        },
+    )
+
+    tool_def = ToolDefinition(
+        tool_name="test_tool",
+        description="Test tool with complex array parameter",
+        parameters={
+            "configs": ToolParamDefinition(
+                param_type=tool_param.parameter_type,
+                description=tool_param.description,
+                required=tool_param.required,
+                items=tool_param.items,
+            )
+        },
+    )
+
+    openai_tool = convert_tooldef_to_openai_tool(tool_def)
+
+    # Verify complex items schema is preserved
+    parameters = openai_tool["function"]["parameters"]
+    configs_param = parameters["properties"]["configs"]
+
+    assert configs_param["type"] == "array"
+    assert "items" in configs_param
+    assert configs_param["items"]["type"] == "object"
+    assert "properties" in configs_param["items"]
+    assert "key" in configs_param["items"]["properties"]
+    assert "value" in configs_param["items"]["properties"]

From fad9f6c4c9f2487a867c1434624ac37838dc6d84 Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Mon, 29 Sep 2025 22:11:20 -0700
Subject: [PATCH 17/33] fix: mcp tool with array type should include items

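# What does this PR do?

Extracts the inline ToolDef-to-OpenAI-tool conversion in
`StreamingResponseOrchestrator` into a module-level helper,
`convert_tooldef_to_chat_tool`, in streaming.py. The helper passes `items`
through to `ToolParamDefinition`, so array-typed parameters keep their item
schema.

## Test Plan

Adds `test_convert_tooldef_to_chat_tool_preserves_items_field` in
tests/unit/providers/inline/agents/meta_reference/responses/test_streaming.py,
which converts a ToolDef with an array parameter and checks that `items`
is present in the resulting OpenAI tool parameters.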
---
 .../meta_reference/responses/streaming.py     | 49 ++++++++++++-------
 .../meta_reference/responses/__init__.py      |  5 ++
 .../responses/test_streaming.py               | 42 ++++++++++++++++
 3 files changed, 79 insertions(+), 17 deletions(-)
 create mode 100644 tests/unit/providers/inline/agents/meta_reference/responses/__init__.py
 create mode 100644 tests/unit/providers/inline/agents/meta_reference/responses/test_streaming.py

diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
index 3e69fa5cd..059d240f1 100644
--- a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
+++ b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
@@ -50,6 +50,37 @@ from .utils import convert_chat_choice_to_response_message, is_function_tool_cal
 logger = get_logger(name=__name__, category="agents::meta_reference")
 
 
+def convert_tooldef_to_chat_tool(tool_def):
+    """Convert a ToolDef to OpenAI ChatCompletionToolParam format.
+
+    Args:
+        tool_def: ToolDef from the tools API
+
+    Returns:
+        ChatCompletionToolParam suitable for OpenAI chat completion
+    """
+    from openai.types.chat import ChatCompletionToolParam
+
+    from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
+    from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool
+
+    internal_tool_def = ToolDefinition(
+        tool_name=tool_def.name,
+        description=tool_def.description,
+        parameters={
+            param.name: ToolParamDefinition(
+                param_type=param.parameter_type,
+                description=param.description,
+                required=param.required,
+                default=param.default,
+                items=param.items,
+            )
+            for param in tool_def.parameters
+        },
+    )
+    return convert_tooldef_to_openai_tool(internal_tool_def)
+
+
 class StreamingResponseOrchestrator:
     def __init__(
         self,
@@ -556,23 +587,7 @@ class StreamingResponseOrchestrator:
                     continue
                 if not always_allowed or t.name in always_allowed:
                     # Add to chat tools for inference
-                    from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
-                    from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool
-
-                    tool_def = ToolDefinition(
-                        tool_name=t.name,
-                        description=t.description,
-                        parameters={
-                            param.name: ToolParamDefinition(
-                                param_type=param.parameter_type,
-                                description=param.description,
-                                required=param.required,
-                                default=param.default,
-                            )
-                            for param in t.parameters
-                        },
-                    )
-                    openai_tool = convert_tooldef_to_openai_tool(tool_def)
+                    openai_tool = convert_tooldef_to_chat_tool(t)
                     if self.ctx.chat_tools is None:
                         self.ctx.chat_tools = []
                     self.ctx.chat_tools.append(openai_tool)
diff --git a/tests/unit/providers/inline/agents/meta_reference/responses/__init__.py b/tests/unit/providers/inline/agents/meta_reference/responses/__init__.py
new file mode 100644
index 000000000..6f3c1df03
--- /dev/null
+++ b/tests/unit/providers/inline/agents/meta_reference/responses/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
\ No newline at end of file
diff --git a/tests/unit/providers/inline/agents/meta_reference/responses/test_streaming.py b/tests/unit/providers/inline/agents/meta_reference/responses/test_streaming.py
new file mode 100644
index 000000000..6fda2b508
--- /dev/null
+++ b/tests/unit/providers/inline/agents/meta_reference/responses/test_streaming.py
@@ -0,0 +1,42 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.tools import ToolDef, ToolParameter
+from llama_stack.providers.inline.agents.meta_reference.responses.streaming import (
+    convert_tooldef_to_chat_tool,
+)
+
+
+def test_convert_tooldef_to_chat_tool_preserves_items_field():
+    """Test that array parameters preserve the items field during conversion.
+
+    This test ensures that when converting ToolDef with array-type parameters
+    to OpenAI ChatCompletionToolParam format, the 'items' field is preserved.
+    Without this fix, array parameters would be missing schema information about their items.
+    """
+    tool_def = ToolDef(
+        name="test_tool",
+        description="A test tool with array parameter",
+        parameters=[
+            ToolParameter(
+                name="tags",
+                parameter_type="array",
+                description="List of tags",
+                required=True,
+                items={"type": "string"},
+            )
+        ],
+    )
+
+    result = convert_tooldef_to_chat_tool(tool_def)
+
+    assert result["type"] == "function"
+    assert result["function"]["name"] == "test_tool"
+
+    tags_param = result["function"]["parameters"]["properties"]["tags"]
+    assert tags_param["type"] == "array"
+    assert "items" in tags_param, "items field should be preserved for array parameters"
+    assert tags_param["items"] == {"type": "string"}

From be97c9f9dfc184653cad974b3736da63af27ad24 Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Mon, 29 Sep 2025 22:11:20 -0700
Subject: [PATCH 18/33] fix: mcp tool with array type should include items

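# What does this PR do?

Adds a module-level helper, `convert_tooldef_to_chat_tool`, in streaming.py
that builds the `ToolDefinition` for a tool (passing `items` through to
`ToolParamDefinition`) and converts it to the OpenAI tool format, replacing
the inline conversion in `StreamingResponseOrchestrator`.

## Test Plan

Adds unit tests in
tests/unit/providers/inline/agents/meta_reference/responses/test_streaming.py.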
---
 .../meta_reference/responses/streaming.py     | 48 ++++++++++++-------
 .../meta_reference/responses/__init__.py      |  5 ++
 .../responses/test_streaming.py               | 42 ++++++++++++++++
 3 files changed, 78 insertions(+), 17 deletions(-)
 create mode 100644 tests/unit/providers/inline/agents/meta_reference/responses/__init__.py
 create mode 100644 tests/unit/providers/inline/agents/meta_reference/responses/test_streaming.py

diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
index 3e69fa5cd..2f45ad2a3 100644
--- a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
+++ b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
@@ -50,6 +50,36 @@ from .utils import convert_chat_choice_to_response_message, is_function_tool_cal
 logger = get_logger(name=__name__, category="agents::meta_reference")
 
 
+def convert_tooldef_to_chat_tool(tool_def):
+    """Convert a ToolDef to OpenAI ChatCompletionToolParam format.
+
+    Args:
+        tool_def: ToolDef from the tools API
+
+    Returns:
+        ChatCompletionToolParam suitable for OpenAI chat completion
+    """
+
+    from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
+    from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool
+
+    internal_tool_def = ToolDefinition(
+        tool_name=tool_def.name,
+        description=tool_def.description,
+        parameters={
+            param.name: ToolParamDefinition(
+                param_type=param.parameter_type,
+                description=param.description,
+                required=param.required,
+                default=param.default,
+                items=param.items,
+            )
+            for param in tool_def.parameters
+        },
+    )
+    return convert_tooldef_to_openai_tool(internal_tool_def)
+
+
 class StreamingResponseOrchestrator:
     def __init__(
         self,
@@ -556,23 +586,7 @@ class StreamingResponseOrchestrator:
                     continue
                 if not always_allowed or t.name in always_allowed:
                     # Add to chat tools for inference
-                    from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
-                    from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool
-
-                    tool_def = ToolDefinition(
-                        tool_name=t.name,
-                        description=t.description,
-                        parameters={
-                            param.name: ToolParamDefinition(
-                                param_type=param.parameter_type,
-                                description=param.description,
-                                required=param.required,
-                                default=param.default,
-                            )
-                            for param in t.parameters
-                        },
-                    )
-                    openai_tool = convert_tooldef_to_openai_tool(tool_def)
+                    openai_tool = convert_tooldef_to_chat_tool(t)
                     if self.ctx.chat_tools is None:
                         self.ctx.chat_tools = []
                     self.ctx.chat_tools.append(openai_tool)
diff --git a/tests/unit/providers/inline/agents/meta_reference/responses/__init__.py b/tests/unit/providers/inline/agents/meta_reference/responses/__init__.py
new file mode 100644
index 000000000..756f351d8
--- /dev/null
+++ b/tests/unit/providers/inline/agents/meta_reference/responses/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
diff --git a/tests/unit/providers/inline/agents/meta_reference/responses/test_streaming.py b/tests/unit/providers/inline/agents/meta_reference/responses/test_streaming.py
new file mode 100644
index 000000000..6fda2b508
--- /dev/null
+++ b/tests/unit/providers/inline/agents/meta_reference/responses/test_streaming.py
@@ -0,0 +1,42 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.tools import ToolDef, ToolParameter
+from llama_stack.providers.inline.agents.meta_reference.responses.streaming import (
+    convert_tooldef_to_chat_tool,
+)
+
+
+def test_convert_tooldef_to_chat_tool_preserves_items_field():
+    """Test that array parameters preserve the items field during conversion.
+
+    This test ensures that when converting ToolDef with array-type parameters
+    to OpenAI ChatCompletionToolParam format, the 'items' field is preserved.
+    Without this fix, array parameters would be missing schema information about their items.
+    """
+    tool_def = ToolDef(
+        name="test_tool",
+        description="A test tool with array parameter",
+        parameters=[
+            ToolParameter(
+                name="tags",
+                parameter_type="array",
+                description="List of tags",
+                required=True,
+                items={"type": "string"},
+            )
+        ],
+    )
+
+    result = convert_tooldef_to_chat_tool(tool_def)
+
+    assert result["type"] == "function"
+    assert result["function"]["name"] == "test_tool"
+
+    tags_param = result["function"]["parameters"]["properties"]["tags"]
+    assert tags_param["type"] == "array"
+    assert "items" in tags_param, "items field should be preserved for array parameters"
+    assert tags_param["items"] == {"type": "string"}

From d87459790814f5cf87a4e2f33cb25e4dc73317d7 Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Mon, 29 Sep 2025 22:39:06 -0700
Subject: [PATCH 19/33] fix: mcp tool with array type should include items

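# What does this PR do?
Extracts the inline ToolDef to OpenAI chat tool conversion in the Responses
streaming orchestrator into a `convert_tooldef_to_chat_tool` helper, and forwards
each parameter's `items` schema so that array-typed MCP tool parameters keep
their element type information.

## Test Plan
Adds tests/unit/providers/inline/agents/meta_reference/responses/test_streaming.py,
which checks that `items` is preserved for an array parameter, together with
`__init__.py` files for the new test package directories.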
---
 .../meta_reference/responses/streaming.py     | 48 ++++++++++++-------
 tests/unit/providers/inline/__init__.py       |  0
 .../unit/providers/inline/agents/__init__.py  |  0
 .../inline/agents/meta_reference/__init__.py  |  0
 .../meta_reference/responses/__init__.py      |  5 ++
 .../responses/test_streaming.py               | 42 ++++++++++++++++
 6 files changed, 78 insertions(+), 17 deletions(-)
 create mode 100644 tests/unit/providers/inline/__init__.py
 create mode 100644 tests/unit/providers/inline/agents/__init__.py
 create mode 100644 tests/unit/providers/inline/agents/meta_reference/__init__.py
 create mode 100644 tests/unit/providers/inline/agents/meta_reference/responses/__init__.py
 create mode 100644 tests/unit/providers/inline/agents/meta_reference/responses/test_streaming.py

diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
index 3e69fa5cd..2f45ad2a3 100644
--- a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
+++ b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
@@ -50,6 +50,36 @@ from .utils import convert_chat_choice_to_response_message, is_function_tool_cal
 logger = get_logger(name=__name__, category="agents::meta_reference")
 
 
+def convert_tooldef_to_chat_tool(tool_def):
+    """Convert a ToolDef to OpenAI ChatCompletionToolParam format.
+
+    Args:
+        tool_def: ToolDef from the tools API
+
+    Returns:
+        ChatCompletionToolParam suitable for OpenAI chat completion
+    """
+
+    from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
+    from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool
+
+    internal_tool_def = ToolDefinition(
+        tool_name=tool_def.name,
+        description=tool_def.description,
+        parameters={
+            param.name: ToolParamDefinition(
+                param_type=param.parameter_type,
+                description=param.description,
+                required=param.required,
+                default=param.default,
+                items=param.items,
+            )
+            for param in tool_def.parameters
+        },
+    )
+    return convert_tooldef_to_openai_tool(internal_tool_def)
+
+
 class StreamingResponseOrchestrator:
     def __init__(
         self,
@@ -556,23 +586,7 @@ class StreamingResponseOrchestrator:
                     continue
                 if not always_allowed or t.name in always_allowed:
                     # Add to chat tools for inference
-                    from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
-                    from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool
-
-                    tool_def = ToolDefinition(
-                        tool_name=t.name,
-                        description=t.description,
-                        parameters={
-                            param.name: ToolParamDefinition(
-                                param_type=param.parameter_type,
-                                description=param.description,
-                                required=param.required,
-                                default=param.default,
-                            )
-                            for param in t.parameters
-                        },
-                    )
-                    openai_tool = convert_tooldef_to_openai_tool(tool_def)
+                    openai_tool = convert_tooldef_to_chat_tool(t)
                     if self.ctx.chat_tools is None:
                         self.ctx.chat_tools = []
                     self.ctx.chat_tools.append(openai_tool)
diff --git a/tests/unit/providers/inline/__init__.py b/tests/unit/providers/inline/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/unit/providers/inline/agents/__init__.py b/tests/unit/providers/inline/agents/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/unit/providers/inline/agents/meta_reference/__init__.py b/tests/unit/providers/inline/agents/meta_reference/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/unit/providers/inline/agents/meta_reference/responses/__init__.py b/tests/unit/providers/inline/agents/meta_reference/responses/__init__.py
new file mode 100644
index 000000000..756f351d8
--- /dev/null
+++ b/tests/unit/providers/inline/agents/meta_reference/responses/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
diff --git a/tests/unit/providers/inline/agents/meta_reference/responses/test_streaming.py b/tests/unit/providers/inline/agents/meta_reference/responses/test_streaming.py
new file mode 100644
index 000000000..6fda2b508
--- /dev/null
+++ b/tests/unit/providers/inline/agents/meta_reference/responses/test_streaming.py
@@ -0,0 +1,42 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.tools import ToolDef, ToolParameter
+from llama_stack.providers.inline.agents.meta_reference.responses.streaming import (
+    convert_tooldef_to_chat_tool,
+)
+
+
+def test_convert_tooldef_to_chat_tool_preserves_items_field():
+    """Test that array parameters preserve the items field during conversion.
+
+    This test ensures that when converting ToolDef with array-type parameters
+    to OpenAI ChatCompletionToolParam format, the 'items' field is preserved.
+    Without this fix, array parameters would be missing schema information about their items.
+    """
+    tool_def = ToolDef(
+        name="test_tool",
+        description="A test tool with array parameter",
+        parameters=[
+            ToolParameter(
+                name="tags",
+                parameter_type="array",
+                description="List of tags",
+                required=True,
+                items={"type": "string"},
+            )
+        ],
+    )
+
+    result = convert_tooldef_to_chat_tool(tool_def)
+
+    assert result["type"] == "function"
+    assert result["function"]["name"] == "test_tool"
+
+    tags_param = result["function"]["parameters"]["properties"]["tags"]
+    assert tags_param["type"] == "array"
+    assert "items" in tags_param, "items field should be preserved for array parameters"
+    assert tags_param["items"] == {"type": "string"}

From b2694a362055ea79afc9db35bb42208fd34b6e52 Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Mon, 29 Sep 2025 22:43:20 -0700
Subject: [PATCH 20/33] fix: mcp tool with array type should include items

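# What does this PR do?
Extracts the inline ToolDef to OpenAI chat tool conversion in the Responses
streaming orchestrator into a `convert_tooldef_to_chat_tool` helper, and forwards
each parameter's `items` schema so that array-typed MCP tool parameters keep
their element type information.

## Test Plan
Adds tests/unit/providers/inline/agents/meta_reference/responses/test_streaming.py,
which checks that `items` is preserved for an array parameter, together with
license-header `__init__.py` files for the new test package directories.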
---
 .../meta_reference/responses/streaming.py     | 48 ++++++++++++-------
 tests/unit/providers/inline/__init__.py       |  6 +++
 .../unit/providers/inline/agents/__init__.py  |  6 +++
 .../inline/agents/meta_reference/__init__.py  |  6 +++
 .../meta_reference/responses/__init__.py      |  5 ++
 .../responses/test_streaming.py               | 42 ++++++++++++++++
 6 files changed, 96 insertions(+), 17 deletions(-)
 create mode 100644 tests/unit/providers/inline/__init__.py
 create mode 100644 tests/unit/providers/inline/agents/__init__.py
 create mode 100644 tests/unit/providers/inline/agents/meta_reference/__init__.py
 create mode 100644 tests/unit/providers/inline/agents/meta_reference/responses/__init__.py
 create mode 100644 tests/unit/providers/inline/agents/meta_reference/responses/test_streaming.py

diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
index 3e69fa5cd..2f45ad2a3 100644
--- a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
+++ b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
@@ -50,6 +50,36 @@ from .utils import convert_chat_choice_to_response_message, is_function_tool_cal
 logger = get_logger(name=__name__, category="agents::meta_reference")
 
 
+def convert_tooldef_to_chat_tool(tool_def):
+    """Convert a ToolDef to OpenAI ChatCompletionToolParam format.
+
+    Args:
+        tool_def: ToolDef from the tools API
+
+    Returns:
+        ChatCompletionToolParam suitable for OpenAI chat completion
+    """
+
+    from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
+    from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool
+
+    internal_tool_def = ToolDefinition(
+        tool_name=tool_def.name,
+        description=tool_def.description,
+        parameters={
+            param.name: ToolParamDefinition(
+                param_type=param.parameter_type,
+                description=param.description,
+                required=param.required,
+                default=param.default,
+                items=param.items,
+            )
+            for param in tool_def.parameters
+        },
+    )
+    return convert_tooldef_to_openai_tool(internal_tool_def)
+
+
 class StreamingResponseOrchestrator:
     def __init__(
         self,
@@ -556,23 +586,7 @@ class StreamingResponseOrchestrator:
                     continue
                 if not always_allowed or t.name in always_allowed:
                     # Add to chat tools for inference
-                    from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
-                    from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool
-
-                    tool_def = ToolDefinition(
-                        tool_name=t.name,
-                        description=t.description,
-                        parameters={
-                            param.name: ToolParamDefinition(
-                                param_type=param.parameter_type,
-                                description=param.description,
-                                required=param.required,
-                                default=param.default,
-                            )
-                            for param in t.parameters
-                        },
-                    )
-                    openai_tool = convert_tooldef_to_openai_tool(tool_def)
+                    openai_tool = convert_tooldef_to_chat_tool(t)
                     if self.ctx.chat_tools is None:
                         self.ctx.chat_tools = []
                     self.ctx.chat_tools.append(openai_tool)
diff --git a/tests/unit/providers/inline/__init__.py b/tests/unit/providers/inline/__init__.py
new file mode 100644
index 000000000..d4a3e15c8
--- /dev/null
+++ b/tests/unit/providers/inline/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
diff --git a/tests/unit/providers/inline/agents/__init__.py b/tests/unit/providers/inline/agents/__init__.py
new file mode 100644
index 000000000..d4a3e15c8
--- /dev/null
+++ b/tests/unit/providers/inline/agents/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
diff --git a/tests/unit/providers/inline/agents/meta_reference/__init__.py b/tests/unit/providers/inline/agents/meta_reference/__init__.py
new file mode 100644
index 000000000..d4a3e15c8
--- /dev/null
+++ b/tests/unit/providers/inline/agents/meta_reference/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
diff --git a/tests/unit/providers/inline/agents/meta_reference/responses/__init__.py b/tests/unit/providers/inline/agents/meta_reference/responses/__init__.py
new file mode 100644
index 000000000..756f351d8
--- /dev/null
+++ b/tests/unit/providers/inline/agents/meta_reference/responses/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
diff --git a/tests/unit/providers/inline/agents/meta_reference/responses/test_streaming.py b/tests/unit/providers/inline/agents/meta_reference/responses/test_streaming.py
new file mode 100644
index 000000000..6fda2b508
--- /dev/null
+++ b/tests/unit/providers/inline/agents/meta_reference/responses/test_streaming.py
@@ -0,0 +1,42 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.tools import ToolDef, ToolParameter
+from llama_stack.providers.inline.agents.meta_reference.responses.streaming import (
+    convert_tooldef_to_chat_tool,
+)
+
+
+def test_convert_tooldef_to_chat_tool_preserves_items_field():
+    """Test that array parameters preserve the items field during conversion.
+
+    This test ensures that when converting ToolDef with array-type parameters
+    to OpenAI ChatCompletionToolParam format, the 'items' field is preserved.
+    Without this fix, array parameters would be missing schema information about their items.
+    """
+    tool_def = ToolDef(
+        name="test_tool",
+        description="A test tool with array parameter",
+        parameters=[
+            ToolParameter(
+                name="tags",
+                parameter_type="array",
+                description="List of tags",
+                required=True,
+                items={"type": "string"},
+            )
+        ],
+    )
+
+    result = convert_tooldef_to_chat_tool(tool_def)
+
+    assert result["type"] == "function"
+    assert result["function"]["name"] == "test_tool"
+
+    tags_param = result["function"]["parameters"]["properties"]["tags"]
+    assert tags_param["type"] == "array"
+    assert "items" in tags_param, "items field should be preserved for array parameters"
+    assert tags_param["items"] == {"type": "string"}

From 19ca0d0d9c1d850e95a15f54ef9713f7c812fcc4 Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Mon, 29 Sep 2025 22:58:30 -0700
Subject: [PATCH 21/33] fix: mcp tool with array type should include items

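# What does this PR do?
Extracts the inline ToolDef to OpenAI chat tool conversion in the Responses
streaming orchestrator into a `convert_tooldef_to_chat_tool` helper, and forwards
each parameter's `items` schema so that array-typed MCP tool parameters keep
their element type information.

## Test Plan
Adds tests/unit/providers/inline/agents/meta_reference/responses/test_streaming.py,
which checks that `items` is preserved for an array parameter, together with
license-header `__init__.py` files for the new test package directories.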
---
 .../meta_reference/responses/streaming.py     | 48 ++++++++++++-------
 tests/unit/providers/inline/__init__.py       |  5 ++
 .../unit/providers/inline/agents/__init__.py  |  5 ++
 .../inline/agents/meta_reference/__init__.py  |  5 ++
 .../meta_reference/responses/__init__.py      |  5 ++
 .../responses/test_streaming.py               | 42 ++++++++++++++++
 6 files changed, 93 insertions(+), 17 deletions(-)
 create mode 100644 tests/unit/providers/inline/__init__.py
 create mode 100644 tests/unit/providers/inline/agents/__init__.py
 create mode 100644 tests/unit/providers/inline/agents/meta_reference/__init__.py
 create mode 100644 tests/unit/providers/inline/agents/meta_reference/responses/__init__.py
 create mode 100644 tests/unit/providers/inline/agents/meta_reference/responses/test_streaming.py

diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
index 3e69fa5cd..2f45ad2a3 100644
--- a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
+++ b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
@@ -50,6 +50,36 @@ from .utils import convert_chat_choice_to_response_message, is_function_tool_cal
 logger = get_logger(name=__name__, category="agents::meta_reference")
 
 
+def convert_tooldef_to_chat_tool(tool_def):
+    """Convert a ToolDef to OpenAI ChatCompletionToolParam format.
+
+    Args:
+        tool_def: ToolDef from the tools API
+
+    Returns:
+        ChatCompletionToolParam suitable for OpenAI chat completion
+    """
+
+    from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
+    from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool
+
+    internal_tool_def = ToolDefinition(
+        tool_name=tool_def.name,
+        description=tool_def.description,
+        parameters={
+            param.name: ToolParamDefinition(
+                param_type=param.parameter_type,
+                description=param.description,
+                required=param.required,
+                default=param.default,
+                items=param.items,
+            )
+            for param in tool_def.parameters
+        },
+    )
+    return convert_tooldef_to_openai_tool(internal_tool_def)
+
+
 class StreamingResponseOrchestrator:
     def __init__(
         self,
@@ -556,23 +586,7 @@ class StreamingResponseOrchestrator:
                     continue
                 if not always_allowed or t.name in always_allowed:
                     # Add to chat tools for inference
-                    from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
-                    from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool
-
-                    tool_def = ToolDefinition(
-                        tool_name=t.name,
-                        description=t.description,
-                        parameters={
-                            param.name: ToolParamDefinition(
-                                param_type=param.parameter_type,
-                                description=param.description,
-                                required=param.required,
-                                default=param.default,
-                            )
-                            for param in t.parameters
-                        },
-                    )
-                    openai_tool = convert_tooldef_to_openai_tool(tool_def)
+                    openai_tool = convert_tooldef_to_chat_tool(t)
                     if self.ctx.chat_tools is None:
                         self.ctx.chat_tools = []
                     self.ctx.chat_tools.append(openai_tool)
diff --git a/tests/unit/providers/inline/__init__.py b/tests/unit/providers/inline/__init__.py
new file mode 100644
index 000000000..756f351d8
--- /dev/null
+++ b/tests/unit/providers/inline/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
diff --git a/tests/unit/providers/inline/agents/__init__.py b/tests/unit/providers/inline/agents/__init__.py
new file mode 100644
index 000000000..756f351d8
--- /dev/null
+++ b/tests/unit/providers/inline/agents/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
diff --git a/tests/unit/providers/inline/agents/meta_reference/__init__.py b/tests/unit/providers/inline/agents/meta_reference/__init__.py
new file mode 100644
index 000000000..756f351d8
--- /dev/null
+++ b/tests/unit/providers/inline/agents/meta_reference/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
diff --git a/tests/unit/providers/inline/agents/meta_reference/responses/__init__.py b/tests/unit/providers/inline/agents/meta_reference/responses/__init__.py
new file mode 100644
index 000000000..756f351d8
--- /dev/null
+++ b/tests/unit/providers/inline/agents/meta_reference/responses/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
diff --git a/tests/unit/providers/inline/agents/meta_reference/responses/test_streaming.py b/tests/unit/providers/inline/agents/meta_reference/responses/test_streaming.py
new file mode 100644
index 000000000..6fda2b508
--- /dev/null
+++ b/tests/unit/providers/inline/agents/meta_reference/responses/test_streaming.py
@@ -0,0 +1,42 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.tools import ToolDef, ToolParameter
+from llama_stack.providers.inline.agents.meta_reference.responses.streaming import (
+    convert_tooldef_to_chat_tool,
+)
+
+
+def test_convert_tooldef_to_chat_tool_preserves_items_field():
+    """Test that array parameters preserve the items field during conversion.
+
+    This test ensures that when converting ToolDef with array-type parameters
+    to OpenAI ChatCompletionToolParam format, the 'items' field is preserved.
+    Without this fix, array parameters would be missing schema information about their items.
+    """
+    tool_def = ToolDef(
+        name="test_tool",
+        description="A test tool with array parameter",
+        parameters=[
+            ToolParameter(
+                name="tags",
+                parameter_type="array",
+                description="List of tags",
+                required=True,
+                items={"type": "string"},
+            )
+        ],
+    )
+
+    result = convert_tooldef_to_chat_tool(tool_def)
+
+    assert result["type"] == "function"
+    assert result["function"]["name"] == "test_tool"
+
+    tags_param = result["function"]["parameters"]["properties"]["tags"]
+    assert tags_param["type"] == "array"
+    assert "items" in tags_param, "items field should be preserved for array parameters"
+    assert tags_param["items"] == {"type": "string"}

From 0cc072dcafb3426894eb77231b4d90b3edd06196 Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Tue, 30 Sep 2025 11:24:27 -0700
Subject: [PATCH 22/33] fix: don't pass default response format in Responses

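# What does this PR do?
Stops passing the default `text` response format from the Responses streaming
orchestrator to `openai_chat_completion`. When `ctx.response_format.type` is
"text", `response_format=None` is sent instead, because some providers don't
support a non-empty `response_format` when tools are present.

## Test Plan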
---
 .../inline/agents/meta_reference/responses/streaming.py      | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
index 2f45ad2a3..179f7f023 100644
--- a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
+++ b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
@@ -127,13 +127,16 @@ class StreamingResponseOrchestrator:
         messages = self.ctx.messages.copy()
 
         while True:
+            # Text is the default response format for chat completion, so we don't need to pass it
+            # (some providers don't support a non-empty response_format when tools are present)
+            response_format = None if self.ctx.response_format.type == "text" else self.ctx.response_format
             completion_result = await self.inference_api.openai_chat_completion(
                 model=self.ctx.model,
                 messages=messages,
                 tools=self.ctx.chat_tools,
                 stream=True,
                 temperature=self.ctx.temperature,
-                response_format=self.ctx.response_format,
+                response_format=response_format,
             )
 
             # Process streaming chunks and build complete response

From a03f0cabfd1c17c4b30af84fcd337dda668414ee Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Tue, 30 Sep 2025 11:28:31 -0700
Subject: [PATCH 23/33] fix: don't pass default response format in Responses

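# What does this PR do?
Stops passing the default `text` response format from the Responses streaming
orchestrator to `openai_chat_completion`. When `ctx.response_format.type` is
"text", `response_format=None` is sent instead, because some providers don't
support a non-empty `response_format` when tools are present.

## Test Plan
Updates tests/unit/providers/agents/meta_reference/test_openai_responses.py to
expect `response_format=None` when no text format is requested.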
---
 .../inline/agents/meta_reference/responses/streaming.py    | 5 ++++-
 .../agents/meta_reference/test_openai_responses.py         | 7 +++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
index 2f45ad2a3..179f7f023 100644
--- a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
+++ b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
@@ -127,13 +127,16 @@ class StreamingResponseOrchestrator:
         messages = self.ctx.messages.copy()
 
         while True:
+            # Text is the default response format for chat completion, so we don't need to pass it
+            # (some providers don't support a non-empty response_format when tools are present)
+            response_format = None if self.ctx.response_format.type == "text" else self.ctx.response_format
             completion_result = await self.inference_api.openai_chat_completion(
                 model=self.ctx.model,
                 messages=messages,
                 tools=self.ctx.chat_tools,
                 stream=True,
                 temperature=self.ctx.temperature,
-                response_format=self.ctx.response_format,
+                response_format=response_format,
             )
 
             # Process streaming chunks and build complete response
diff --git a/tests/unit/providers/agents/meta_reference/test_openai_responses.py b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
index 38ce365c1..eb77c2dbe 100644
--- a/tests/unit/providers/agents/meta_reference/test_openai_responses.py
+++ b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
@@ -148,7 +148,7 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m
     mock_inference_api.openai_chat_completion.assert_called_once_with(
         model=model,
         messages=[OpenAIUserMessageParam(role="user", content="What is the capital of Ireland?", name=None)],
-        response_format=OpenAIResponseFormatText(),
+        response_format=None,
         tools=None,
         stream=True,
         temperature=0.1,
@@ -831,8 +831,8 @@ async def test_store_response_uses_rehydrated_input_with_previous_response(
         (OpenAIResponseText(format=OpenAIResponseTextFormat(type="json_object")), OpenAIResponseFormatJSONObject()),
         # ensure text param with no format specified defaults to text
         (OpenAIResponseText(format=None), OpenAIResponseFormatText()),
-        # ensure text param of None defaults to text
-        (None, OpenAIResponseFormatText()),
+        # ensure text param of None defaults to None
+        (None, None),
     ],
 )
 async def test_create_openai_response_with_text_format(
@@ -855,7 +855,6 @@ async def test_create_openai_response_with_text_format(
     # Verify
     first_call = mock_inference_api.openai_chat_completion.call_args_list[0]
     assert first_call.kwargs["messages"][0].content == input_text
-    assert first_call.kwargs["response_format"] is not None
     assert first_call.kwargs["response_format"] == response_format
 
 
From 28cc185cbbb49472f3c4b50c579e30559e5f475e Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Tue, 30 Sep 2025 11:28:31 -0700
Subject: [PATCH 24/33] fix: don't pass default response format in Responses

# What does this PR do?


## Test Plan
---
 .../agents/meta_reference/responses/streaming.py    |  5 ++++-
 .../agents/meta_reference/test_openai_responses.py  | 13 ++++++-------
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
index 2f45ad2a3..179f7f023 100644
--- a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
+++ b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
@@ -127,13 +127,16 @@ class StreamingResponseOrchestrator:
         messages = self.ctx.messages.copy()
 
         while True:
+            # Text is the default response format for chat completion so don't need to pass it
+            # (some providers don't support non-empty response_format when tools are present)
+            response_format = None if self.ctx.response_format.type == "text" else self.ctx.response_format
             completion_result = await self.inference_api.openai_chat_completion(
                 model=self.ctx.model,
                 messages=messages,
                 tools=self.ctx.chat_tools,
                 stream=True,
                 temperature=self.ctx.temperature,
-                response_format=self.ctx.response_format,
+                response_format=response_format,
             )
 
             # Process streaming chunks and build complete response
diff --git a/tests/unit/providers/agents/meta_reference/test_openai_responses.py b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
index 38ce365c1..ed60c5bdc 100644
--- a/tests/unit/providers/agents/meta_reference/test_openai_responses.py
+++ b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
@@ -148,7 +148,7 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m
     mock_inference_api.openai_chat_completion.assert_called_once_with(
         model=model,
         messages=[OpenAIUserMessageParam(role="user", content="What is the capital of Ireland?", name=None)],
-        response_format=OpenAIResponseFormatText(),
+        response_format=None,
         tools=None,
         stream=True,
         temperature=0.1,
@@ -823,16 +823,16 @@ async def test_store_response_uses_rehydrated_input_with_previous_response(
 @pytest.mark.parametrize(
     "text_format, response_format",
     [
-        (OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")), OpenAIResponseFormatText()),
+        (OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")), None),
         (
             OpenAIResponseText(format=OpenAIResponseTextFormat(name="Test", schema={"foo": "bar"}, type="json_schema")),
             OpenAIResponseFormatJSONSchema(json_schema=OpenAIJSONSchema(name="Test", schema={"foo": "bar"})),
         ),
         (OpenAIResponseText(format=OpenAIResponseTextFormat(type="json_object")), OpenAIResponseFormatJSONObject()),
-        # ensure text param with no format specified defaults to text
-        (OpenAIResponseText(format=None), OpenAIResponseFormatText()),
-        # ensure text param of None defaults to text
-        (None, OpenAIResponseFormatText()),
+        # ensure text param with no format specified defaults to None
+        (OpenAIResponseText(format=None), None),
+        # ensure text param of None defaults to None
+        (None, None),
     ],
 )
 async def test_create_openai_response_with_text_format(
@@ -855,7 +855,6 @@ async def test_create_openai_response_with_text_format(
     # Verify
     first_call = mock_inference_api.openai_chat_completion.call_args_list[0]
     assert first_call.kwargs["messages"][0].content == input_text
-    assert first_call.kwargs["response_format"] is not None
     assert first_call.kwargs["response_format"] == response_format
 
 
From f387e4023f4e2be312fbde60f0c3855a58ad8640 Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Tue, 30 Sep 2025 11:33:57 -0700
Subject: [PATCH 25/33] fix: don't pass default response format in Responses

# What does this PR do?


## Test Plan
---
 .../agents/meta_reference/responses/streaming.py   |  5 ++++-
 .../agents/meta_reference/test_openai_responses.py | 14 ++++++--------
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
index 2f45ad2a3..179f7f023 100644
--- a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
+++ b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
@@ -127,13 +127,16 @@ class StreamingResponseOrchestrator:
         messages = self.ctx.messages.copy()
 
         while True:
+            # Text is the default response format for chat completion so don't need to pass it
+            # (some providers don't support non-empty response_format when tools are present)
+            response_format = None if self.ctx.response_format.type == "text" else self.ctx.response_format
             completion_result = await self.inference_api.openai_chat_completion(
                 model=self.ctx.model,
                 messages=messages,
                 tools=self.ctx.chat_tools,
                 stream=True,
                 temperature=self.ctx.temperature,
-                response_format=self.ctx.response_format,
+                response_format=response_format,
             )
 
             # Process streaming chunks and build complete response
diff --git a/tests/unit/providers/agents/meta_reference/test_openai_responses.py b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
index 38ce365c1..5e5914a03 100644
--- a/tests/unit/providers/agents/meta_reference/test_openai_responses.py
+++ b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
@@ -37,7 +37,6 @@ from llama_stack.apis.inference import (
     OpenAIJSONSchema,
     OpenAIResponseFormatJSONObject,
     OpenAIResponseFormatJSONSchema,
-    OpenAIResponseFormatText,
     OpenAIUserMessageParam,
 )
 from llama_stack.apis.tools.tools import Tool, ToolGroups, ToolInvocationResult, ToolParameter, ToolRuntime
@@ -148,7 +147,7 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m
     mock_inference_api.openai_chat_completion.assert_called_once_with(
         model=model,
         messages=[OpenAIUserMessageParam(role="user", content="What is the capital of Ireland?", name=None)],
-        response_format=OpenAIResponseFormatText(),
+        response_format=None,
         tools=None,
         stream=True,
         temperature=0.1,
@@ -823,16 +822,16 @@ async def test_store_response_uses_rehydrated_input_with_previous_response(
 @pytest.mark.parametrize(
     "text_format, response_format",
     [
-        (OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")), OpenAIResponseFormatText()),
+        (OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")), None),
         (
             OpenAIResponseText(format=OpenAIResponseTextFormat(name="Test", schema={"foo": "bar"}, type="json_schema")),
             OpenAIResponseFormatJSONSchema(json_schema=OpenAIJSONSchema(name="Test", schema={"foo": "bar"})),
         ),
         (OpenAIResponseText(format=OpenAIResponseTextFormat(type="json_object")), OpenAIResponseFormatJSONObject()),
-        # ensure text param with no format specified defaults to text
-        (OpenAIResponseText(format=None), OpenAIResponseFormatText()),
-        # ensure text param of None defaults to text
-        (None, OpenAIResponseFormatText()),
+        # ensure text param with no format specified defaults to None
+        (OpenAIResponseText(format=None), None),
+        # ensure text param of None defaults to None
+        (None, None),
     ],
 )
 async def test_create_openai_response_with_text_format(
@@ -855,7 +854,6 @@ async def test_create_openai_response_with_text_format(
     # Verify
     first_call = mock_inference_api.openai_chat_completion.call_args_list[0]
     assert first_call.kwargs["messages"][0].content == input_text
-    assert first_call.kwargs["response_format"] is not None
     assert first_call.kwargs["response_format"] == response_format
 
 
From f034004ae63480df0b672aabef323398c6f69841 Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Tue, 30 Sep 2025 12:04:50 -0700
Subject: [PATCH 26/33] fix: don't pass default response format in Responses

# What does this PR do?


## Test Plan
---
 .../meta_reference/responses/streaming.py     |   5 +-
 .../recordings/responses/cf776b1aa432.json    | 232 +++++
 .../recordings/responses/d0ac68cbde69.json    |  16 +-
 .../models-7d9446738fd7-d5d684a3.json         | 144 ++--
 .../models-bd032f995f2a-7467c0cf.json         |  69 ++
 .../models-bd032f995f2a-ebaa996d.json         | 798 ++++++++++++++++++
 .../meta_reference/test_openai_responses.py   |  14 +-
 7 files changed, 1189 insertions(+), 89 deletions(-)
 create mode 100644 tests/integration/recordings/responses/cf776b1aa432.json
 create mode 100644 tests/integration/recordings/responses/models-bd032f995f2a-7467c0cf.json
 create mode 100644 tests/integration/recordings/responses/models-bd032f995f2a-ebaa996d.json

diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
index 2f45ad2a3..179f7f023 100644
--- a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
+++ b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
@@ -127,13 +127,16 @@ class StreamingResponseOrchestrator:
         messages = self.ctx.messages.copy()
 
         while True:
+            # Text is the default response format for chat completion so don't need to pass it
+            # (some providers don't support non-empty response_format when tools are present)
+            response_format = None if self.ctx.response_format.type == "text" else self.ctx.response_format
             completion_result = await self.inference_api.openai_chat_completion(
                 model=self.ctx.model,
                 messages=messages,
                 tools=self.ctx.chat_tools,
                 stream=True,
                 temperature=self.ctx.temperature,
-                response_format=self.ctx.response_format,
+                response_format=response_format,
             )
 
             # Process streaming chunks and build complete response
diff --git a/tests/integration/recordings/responses/cf776b1aa432.json b/tests/integration/recordings/responses/cf776b1aa432.json
new file mode 100644
index 000000000..c7449427a
--- /dev/null
+++ b/tests/integration/recordings/responses/cf776b1aa432.json
@@ -0,0 +1,232 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "llama3.2:3b-instruct-fp16",
+      "messages": [
+        {
+          "role": "user",
+          "content": "What is the capital of France?"
+        }
+      ],
+      "stream": true
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "llama3.2:3b-instruct-fp16"
+  },
+  "response": {
+    "body": [
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-78",
+          "choices": [
+            {
+              "delta": {
+                "content": "The",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759259077,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-78",
+          "choices": [
+            {
+              "delta": {
+                "content": " capital",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759259077,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-78",
+          "choices": [
+            {
+              "delta": {
+                "content": " of",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759259077,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-78",
+          "choices": [
+            {
+              "delta": {
+                "content": " France",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759259077,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-78",
+          "choices": [
+            {
+              "delta": {
+                "content": " is",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759259077,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-78",
+          "choices": [
+            {
+              "delta": {
+                "content": " Paris",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759259077,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-78",
+          "choices": [
+            {
+              "delta": {
+                "content": ".",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759259077,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-78",
+          "choices": [
+            {
+              "delta": {
+                "content": "",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": "stop",
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759259077,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      }
+    ],
+    "is_streaming": true
+  }
+}
diff --git a/tests/integration/recordings/responses/d0ac68cbde69.json b/tests/integration/recordings/responses/d0ac68cbde69.json
index 78784e0ca..a8c46b76b 100644
--- a/tests/integration/recordings/responses/d0ac68cbde69.json
+++ b/tests/integration/recordings/responses/d0ac68cbde69.json
@@ -13,12 +13,12 @@
       "__data__": {
         "models": [
           {
-            "model": "llama3.2:3b",
-            "name": "llama3.2:3b",
-            "digest": "a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72",
-            "expires_at": "2025-09-27T11:54:56.718552-07:00",
-            "size": 3367856128,
-            "size_vram": 3367856128,
+            "model": "llama3.2:3b-instruct-fp16",
+            "name": "llama3.2:3b-instruct-fp16",
+            "digest": "195a8c01d91ec3cb1e0aad4624a51f2602c51fa7d96110f8ab5a20c84081804d",
+            "expires_at": "2025-09-30T12:07:39.189179-07:00",
+            "size": 8581748736,
+            "size_vram": 8581748736,
             "details": {
               "parent_model": "",
               "format": "gguf",
@@ -27,9 +27,9 @@
                 "llama"
               ],
               "parameter_size": "3.2B",
-              "quantization_level": "Q4_K_M"
+              "quantization_level": "F16"
             },
-            "context_length": 4096
+            "context_length": null
           }
         ]
       }
diff --git a/tests/integration/recordings/responses/models-7d9446738fd7-d5d684a3.json b/tests/integration/recordings/responses/models-7d9446738fd7-d5d684a3.json
index a76f0ba8f..d9917b2ec 100644
--- a/tests/integration/recordings/responses/models-7d9446738fd7-d5d684a3.json
+++ b/tests/integration/recordings/responses/models-7d9446738fd7-d5d684a3.json
@@ -22,19 +22,6 @@
           "supports_tools": false
         }
       },
-      {
-        "__type__": "openai.types.model.Model",
-        "__data__": {
-          "id": "accounts/tvergho-87e44d/models/debatecards-70b-ft-3epoch-dpo-v2",
-          "created": 1743381121,
-          "object": "model",
-          "owned_by": "tvergho-87e44d",
-          "kind": "HF_PEFT_ADDON",
-          "supports_chat": true,
-          "supports_image_input": false,
-          "supports_tools": false
-        }
-      },
       {
         "__type__": "openai.types.model.Model",
         "__data__": {
@@ -75,20 +62,6 @@
           "context_length": 131072
         }
       },
-      {
-        "__type__": "openai.types.model.Model",
-        "__data__": {
-          "id": "accounts/fireworks/models/deepseek-v3",
-          "created": 1735576668,
-          "object": "model",
-          "owned_by": "fireworks",
-          "kind": "HF_BASE_MODEL",
-          "supports_chat": true,
-          "supports_image_input": false,
-          "supports_tools": true,
-          "context_length": 131072
-        }
-      },
       {
         "__type__": "openai.types.model.Model",
         "__data__": {
@@ -259,17 +232,45 @@
       {
         "__type__": "openai.types.model.Model",
         "__data__": {
-          "id": "accounts/fireworks/models/qwen3-coder-30b-a3b-instruct",
-          "created": 1754063588,
+          "id": "accounts/fireworks/models/kimi-k2-instruct-0905",
+          "created": 1757018994,
           "object": "model",
           "owned_by": "fireworks",
           "kind": "HF_BASE_MODEL",
           "supports_chat": true,
           "supports_image_input": false,
-          "supports_tools": false,
+          "supports_tools": true,
           "context_length": 262144
         }
       },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "accounts/fireworks/models/glm-4p5",
+          "created": 1753809636,
+          "object": "model",
+          "owned_by": "fireworks",
+          "kind": "HF_BASE_MODEL",
+          "supports_chat": true,
+          "supports_image_input": false,
+          "supports_tools": true,
+          "context_length": 131072
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "accounts/fireworks/models/deepseek-v3",
+          "created": 1735576668,
+          "object": "model",
+          "owned_by": "fireworks",
+          "kind": "HF_BASE_MODEL",
+          "supports_chat": true,
+          "supports_image_input": false,
+          "supports_tools": true,
+          "context_length": 131072
+        }
+      },
       {
         "__type__": "openai.types.model.Model",
         "__data__": {
@@ -284,20 +285,6 @@
           "context_length": 131072
         }
       },
-      {
-        "__type__": "openai.types.model.Model",
-        "__data__": {
-          "id": "accounts/fireworks/models/qwen2p5-vl-32b-instruct",
-          "created": 1743392739,
-          "object": "model",
-          "owned_by": "fireworks",
-          "kind": "HF_BASE_MODEL",
-          "supports_chat": true,
-          "supports_image_input": true,
-          "supports_tools": false,
-          "context_length": 128000
-        }
-      },
       {
         "__type__": "openai.types.model.Model",
         "__data__": {
@@ -395,34 +382,6 @@
           "supports_tools": false
         }
       },
-      {
-        "__type__": "openai.types.model.Model",
-        "__data__": {
-          "id": "accounts/fireworks/models/glm-4p5",
-          "created": 1753809636,
-          "object": "model",
-          "owned_by": "fireworks",
-          "kind": "HF_BASE_MODEL",
-          "supports_chat": true,
-          "supports_image_input": false,
-          "supports_tools": true,
-          "context_length": 131072
-        }
-      },
-      {
-        "__type__": "openai.types.model.Model",
-        "__data__": {
-          "id": "accounts/fireworks/models/kimi-k2-instruct-0905",
-          "created": 1757018994,
-          "object": "model",
-          "owned_by": "fireworks",
-          "kind": "HF_BASE_MODEL",
-          "supports_chat": true,
-          "supports_image_input": false,
-          "supports_tools": true,
-          "context_length": 262144
-        }
-      },
       {
         "__type__": "openai.types.model.Model",
         "__data__": {
@@ -520,6 +479,47 @@
           "supports_tools": false,
           "context_length": 262144
         }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "accounts/fireworks/models/qwen2p5-vl-32b-instruct",
+          "created": 1743392739,
+          "object": "model",
+          "owned_by": "fireworks",
+          "kind": "HF_BASE_MODEL",
+          "supports_chat": true,
+          "supports_image_input": true,
+          "supports_tools": false,
+          "context_length": 128000
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "accounts/fireworks/models/qwen3-coder-30b-a3b-instruct",
+          "created": 1754063588,
+          "object": "model",
+          "owned_by": "fireworks",
+          "kind": "HF_BASE_MODEL",
+          "supports_chat": true,
+          "supports_image_input": false,
+          "supports_tools": false,
+          "context_length": 262144
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "accounts/tvergho-87e44d/models/debatecards-70b-ft-3epoch-dpo-v2",
+          "created": 1743381121,
+          "object": "model",
+          "owned_by": "tvergho-87e44d",
+          "kind": "HF_PEFT_ADDON",
+          "supports_chat": true,
+          "supports_image_input": false,
+          "supports_tools": false
+        }
       }
     ],
     "is_streaming": false
diff --git a/tests/integration/recordings/responses/models-bd032f995f2a-7467c0cf.json b/tests/integration/recordings/responses/models-bd032f995f2a-7467c0cf.json
new file mode 100644
index 000000000..00c447dcc
--- /dev/null
+++ b/tests/integration/recordings/responses/models-bd032f995f2a-7467c0cf.json
@@ -0,0 +1,69 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/models",
+    "headers": {},
+    "body": {},
+    "endpoint": "/v1/models",
+    "model": ""
+  },
+  "response": {
+    "body": [
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "nomic-embed-text:latest",
+          "created": 1754610899,
+          "object": "model",
+          "owned_by": "library"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "llama-guard3:1b",
+          "created": 1754088388,
+          "object": "model",
+          "owned_by": "library"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "all-minilm:l6-v2",
+          "created": 1753826826,
+          "object": "model",
+          "owned_by": "library"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "all-minilm:latest",
+          "created": 1749064003,
+          "object": "model",
+          "owned_by": "library"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "llama3.1:8b-instruct-fp16",
+          "created": 1739575404,
+          "object": "model",
+          "owned_by": "library"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "llama3.2:3b-instruct-fp16",
+          "created": 1737496003,
+          "object": "model",
+          "owned_by": "library"
+        }
+      }
+    ],
+    "is_streaming": false
+  }
+}
diff --git a/tests/integration/recordings/responses/models-bd032f995f2a-ebaa996d.json b/tests/integration/recordings/responses/models-bd032f995f2a-ebaa996d.json
new file mode 100644
index 000000000..c460d6977
--- /dev/null
+++ b/tests/integration/recordings/responses/models-bd032f995f2a-ebaa996d.json
@@ -0,0 +1,798 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "https://api.openai.com/v1/v1/models",
+    "headers": {},
+    "body": {},
+    "endpoint": "/v1/models",
+    "model": ""
+  },
+  "response": {
+    "body": [
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4-0613",
+          "created": 1686588896,
+          "object": "model",
+          "owned_by": "openai"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4",
+          "created": 1687882411,
+          "object": "model",
+          "owned_by": "openai"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-3.5-turbo",
+          "created": 1677610602,
+          "object": "model",
+          "owned_by": "openai"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-5-codex",
+          "created": 1757527818,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-audio-2025-08-28",
+          "created": 1756256146,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-realtime",
+          "created": 1756271701,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-realtime-2025-08-28",
+          "created": 1756271773,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-audio",
+          "created": 1756339249,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "davinci-002",
+          "created": 1692634301,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "babbage-002",
+          "created": 1692634615,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-3.5-turbo-instruct",
+          "created": 1692901427,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-3.5-turbo-instruct-0914",
+          "created": 1694122472,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "dall-e-3",
+          "created": 1698785189,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "dall-e-2",
+          "created": 1698798177,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4-1106-preview",
+          "created": 1698957206,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-3.5-turbo-1106",
+          "created": 1698959748,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "tts-1-hd",
+          "created": 1699046015,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "tts-1-1106",
+          "created": 1699053241,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "tts-1-hd-1106",
+          "created": 1699053533,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "text-embedding-3-small",
+          "created": 1705948997,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "text-embedding-3-large",
+          "created": 1705953180,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4-0125-preview",
+          "created": 1706037612,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4-turbo-preview",
+          "created": 1706037777,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-3.5-turbo-0125",
+          "created": 1706048358,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4-turbo",
+          "created": 1712361441,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4-turbo-2024-04-09",
+          "created": 1712601677,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o",
+          "created": 1715367049,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-2024-05-13",
+          "created": 1715368132,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini-2024-07-18",
+          "created": 1721172717,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini",
+          "created": 1721172741,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-2024-08-06",
+          "created": 1722814719,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "chatgpt-4o-latest",
+          "created": 1723515131,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o1-mini-2024-09-12",
+          "created": 1725648979,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o1-mini",
+          "created": 1725649008,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-realtime-preview-2024-10-01",
+          "created": 1727131766,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-audio-preview-2024-10-01",
+          "created": 1727389042,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-audio-preview",
+          "created": 1727460443,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-realtime-preview",
+          "created": 1727659998,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "omni-moderation-latest",
+          "created": 1731689265,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "omni-moderation-2024-09-26",
+          "created": 1732734466,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-realtime-preview-2024-12-17",
+          "created": 1733945430,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-audio-preview-2024-12-17",
+          "created": 1734034239,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini-realtime-preview-2024-12-17",
+          "created": 1734112601,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini-audio-preview-2024-12-17",
+          "created": 1734115920,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o1-2024-12-17",
+          "created": 1734326976,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o1",
+          "created": 1734375816,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini-realtime-preview",
+          "created": 1734387380,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini-audio-preview",
+          "created": 1734387424,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o3-mini",
+          "created": 1737146383,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o3-mini-2025-01-31",
+          "created": 1738010200,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-2024-11-20",
+          "created": 1739331543,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-search-preview-2025-03-11",
+          "created": 1741388170,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-search-preview",
+          "created": 1741388720,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini-search-preview-2025-03-11",
+          "created": 1741390858,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini-search-preview",
+          "created": 1741391161,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-transcribe",
+          "created": 1742068463,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini-transcribe",
+          "created": 1742068596,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o1-pro-2025-03-19",
+          "created": 1742251504,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o1-pro",
+          "created": 1742251791,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini-tts",
+          "created": 1742403959,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o3-2025-04-16",
+          "created": 1744133301,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o4-mini-2025-04-16",
+          "created": 1744133506,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o3",
+          "created": 1744225308,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o4-mini",
+          "created": 1744225351,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4.1-2025-04-14",
+          "created": 1744315746,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4.1",
+          "created": 1744316542,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4.1-mini-2025-04-14",
+          "created": 1744317547,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4.1-mini",
+          "created": 1744318173,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4.1-nano-2025-04-14",
+          "created": 1744321025,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4.1-nano",
+          "created": 1744321707,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-image-1",
+          "created": 1745517030,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "codex-mini-latest",
+          "created": 1746673257,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-realtime-preview-2025-06-03",
+          "created": 1748907838,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-audio-preview-2025-06-03",
+          "created": 1748908498,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o4-mini-deep-research",
+          "created": 1749685485,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o4-mini-deep-research-2025-06-26",
+          "created": 1750866121,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-5-chat-latest",
+          "created": 1754073306,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-5-2025-08-07",
+          "created": 1754075360,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-5",
+          "created": 1754425777,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-5-mini-2025-08-07",
+          "created": 1754425867,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-5-mini",
+          "created": 1754425928,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-5-nano-2025-08-07",
+          "created": 1754426303,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-5-nano",
+          "created": 1754426384,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-3.5-turbo-16k",
+          "created": 1683758102,
+          "object": "model",
+          "owned_by": "openai-internal"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "tts-1",
+          "created": 1681940951,
+          "object": "model",
+          "owned_by": "openai-internal"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "whisper-1",
+          "created": 1677532384,
+          "object": "model",
+          "owned_by": "openai-internal"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "text-embedding-ada-002",
+          "created": 1671217299,
+          "object": "model",
+          "owned_by": "openai-internal"
+        }
+      }
+    ],
+    "is_streaming": false
+  }
+}
diff --git a/tests/unit/providers/agents/meta_reference/test_openai_responses.py b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
index 38ce365c1..5e5914a03 100644
--- a/tests/unit/providers/agents/meta_reference/test_openai_responses.py
+++ b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
@@ -37,7 +37,6 @@ from llama_stack.apis.inference import (
     OpenAIJSONSchema,
     OpenAIResponseFormatJSONObject,
     OpenAIResponseFormatJSONSchema,
-    OpenAIResponseFormatText,
     OpenAIUserMessageParam,
 )
 from llama_stack.apis.tools.tools import Tool, ToolGroups, ToolInvocationResult, ToolParameter, ToolRuntime
@@ -148,7 +147,7 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m
     mock_inference_api.openai_chat_completion.assert_called_once_with(
         model=model,
         messages=[OpenAIUserMessageParam(role="user", content="What is the capital of Ireland?", name=None)],
-        response_format=OpenAIResponseFormatText(),
+        response_format=None,
         tools=None,
         stream=True,
         temperature=0.1,
@@ -823,16 +822,16 @@ async def test_store_response_uses_rehydrated_input_with_previous_response(
 @pytest.mark.parametrize(
     "text_format, response_format",
     [
-        (OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")), OpenAIResponseFormatText()),
+        (OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")), None),
         (
             OpenAIResponseText(format=OpenAIResponseTextFormat(name="Test", schema={"foo": "bar"}, type="json_schema")),
             OpenAIResponseFormatJSONSchema(json_schema=OpenAIJSONSchema(name="Test", schema={"foo": "bar"})),
         ),
         (OpenAIResponseText(format=OpenAIResponseTextFormat(type="json_object")), OpenAIResponseFormatJSONObject()),
-        # ensure text param with no format specified defaults to text
-        (OpenAIResponseText(format=None), OpenAIResponseFormatText()),
-        # ensure text param of None defaults to text
-        (None, OpenAIResponseFormatText()),
+        # ensure text param with no format specified defaults to None
+        (OpenAIResponseText(format=None), None),
+        # ensure text param of None defaults to None
+        (None, None),
     ],
 )
 async def test_create_openai_response_with_text_format(
@@ -855,7 +854,6 @@ async def test_create_openai_response_with_text_format(
     # Verify
     first_call = mock_inference_api.openai_chat_completion.call_args_list[0]
     assert first_call.kwargs["messages"][0].content == input_text
-    assert first_call.kwargs["response_format"] is not None
     assert first_call.kwargs["response_format"] == response_format
 
 
From 4f7c177c6231b5ce0de2d897b432224a67ce4967 Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Tue, 30 Sep 2025 12:04:50 -0700
Subject: [PATCH 27/33] fix: don't pass default response format in Responses

# What does this PR do?


## Test Plan
---
 .../meta_reference/responses/streaming.py     |    5 +-
 .../recordings/responses/4ebf08272d17.json    | 6030 +++++++++++++++++
 .../recordings/responses/73e97be515d9.json    |  106 +
 .../recordings/responses/8aba89449cdc.json    |  248 +
 .../recordings/responses/cf776b1aa432.json    |  232 +
 .../recordings/responses/d0ac68cbde69.json    |   16 +-
 .../models-7d9446738fd7-d5d684a3.json         |  144 +-
 .../models-bd032f995f2a-7467c0cf.json         |   69 +
 .../models-bd032f995f2a-ebaa996d.json         |  798 +++
 .../meta_reference/test_openai_responses.py   |   14 +-
 10 files changed, 7573 insertions(+), 89 deletions(-)
 create mode 100644 tests/integration/recordings/responses/4ebf08272d17.json
 create mode 100644 tests/integration/recordings/responses/73e97be515d9.json
 create mode 100644 tests/integration/recordings/responses/8aba89449cdc.json
 create mode 100644 tests/integration/recordings/responses/cf776b1aa432.json
 create mode 100644 tests/integration/recordings/responses/models-bd032f995f2a-7467c0cf.json
 create mode 100644 tests/integration/recordings/responses/models-bd032f995f2a-ebaa996d.json

diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
index 2f45ad2a3..179f7f023 100644
--- a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
+++ b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
@@ -127,13 +127,16 @@ class StreamingResponseOrchestrator:
         messages = self.ctx.messages.copy()
 
         while True:
+            # Text is the default response format for chat completion so don't need to pass it
+            # (some providers don't support non-empty response_format when tools are present)
+            response_format = None if self.ctx.response_format.type == "text" else self.ctx.response_format
             completion_result = await self.inference_api.openai_chat_completion(
                 model=self.ctx.model,
                 messages=messages,
                 tools=self.ctx.chat_tools,
                 stream=True,
                 temperature=self.ctx.temperature,
-                response_format=self.ctx.response_format,
+                response_format=response_format,
             )
 
             # Process streaming chunks and build complete response
diff --git a/tests/integration/recordings/responses/4ebf08272d17.json b/tests/integration/recordings/responses/4ebf08272d17.json
new file mode 100644
index 000000000..958d3ad9c
--- /dev/null
+++ b/tests/integration/recordings/responses/4ebf08272d17.json
@@ -0,0 +1,6030 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "llama3.2:3b-instruct-fp16",
+      "messages": [
+        {
+          "role": "user",
+          "content": "What's the weather in Tokyo?"
+        }
+      ],
+      "stream": true
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "llama3.2:3b-instruct-fp16"
+  },
+  "response": {
+    "body": [
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "I",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267476,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "'m",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267476,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " not",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267476,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " able",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267476,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " to",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267476,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " provide",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267476,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " real",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267476,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "-time",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267476,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " weather",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267476,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " information",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267477,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ".",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267477,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " However",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267477,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ",",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267477,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " I",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267477,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " can",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267477,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " suggest",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267477,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " some",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267477,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " ways",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267477,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " for",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267477,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " you",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267477,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " to",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267477,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " find",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267477,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " out",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267477,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " the",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267477,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " current",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267477,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " weather",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267477,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " in",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267478,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " Tokyo",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267478,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ":\n\n",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267478,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "1",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267478,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ".",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267478,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " Check",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267478,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " online",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267478,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " weather",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267478,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " websites",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267478,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ":",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267478,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " You",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267478,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " can",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267478,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " check",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267478,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " websites",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267478,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " like",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267478,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " Acc",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267478,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "u",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267478,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "Weather",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267479,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ",",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267479,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " Weather",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267479,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ".com",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267479,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ",",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267479,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " or",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267479,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " Japan",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267479,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " Meteor",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267479,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "ological",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267479,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " Agency",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267479,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " (",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267479,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "J",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267479,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "MA",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267479,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ")",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267479,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " for",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267479,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " the",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267479,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " current",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267480,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " weather",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267480,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " conditions",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267480,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " and",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267480,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " forecast",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267480,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " in",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267480,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " Tokyo",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267480,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ".\n",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267480,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "2",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267480,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ".",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267480,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " Use",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267480,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " a",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267480,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " mobile",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267480,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " app",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267480,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ":",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267480,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " There",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267480,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " are",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267480,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " many",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267481,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " mobile",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267481,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " apps",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267481,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " available",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267481,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " that",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267481,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " provide",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267481,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " real",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267481,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "-time",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267481,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " weather",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267481,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " information",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267481,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ",",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267481,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " such",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267481,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " as",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267481,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " Dark",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267481,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " Sky",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267481,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ",",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267481,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " Weather",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267482,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " Underground",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267482,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ",",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267482,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " or",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267482,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " Japan",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267482,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "-based",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267482,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " apps",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267482,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " like",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267482,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " Japan",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267482,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " Meteor",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267482,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "ological",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267482,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " Corporation",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267482,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "'s",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267482,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " (",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267482,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "JM",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267482,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "Cor",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267482,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "ps",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267482,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ")",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267483,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " Weather",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267483,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " App",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267483,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ".\n",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267483,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "3",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267483,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ".",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267483,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " Check",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267483,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " social",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267483,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " media",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267483,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ":",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267483,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " Many",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267483,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " airlines",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267483,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ",",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267483,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " airports",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267483,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ",",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267483,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " and",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267483,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " tourist",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267483,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " attractions",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267484,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " also",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267484,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " share",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267484,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " the",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267484,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " current",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267484,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " weather",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267484,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " conditions",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267484,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " on",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267484,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " their",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267484,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " social",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267484,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " media",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267484,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " accounts",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267484,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ".\n\n",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267484,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "Please",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267484,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " note",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267484,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " that",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267484,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " Tokyo",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267484,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "'s",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267485,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " climate",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267485,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " is",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267485,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " humid",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267485,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " subt",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267485,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "ropical",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267485,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " with",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267485,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " four",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267485,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " distinct",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267485,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " seasons",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267485,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ":\n\n",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267485,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "-",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267485,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " Winter",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267485,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " (",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267485,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "December",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267485,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " to",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267485,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " February",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267485,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "):",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267486,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " Mild",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267486,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " temperatures",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267486,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ",",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267486,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " with",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267486,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " average",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267486,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " highs",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267486,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " around",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267486,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " ",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267486,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "9",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267486,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "\u00b0C",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267486,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " (",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267486,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "48",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267486,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "\u00b0F",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267486,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ")",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267486,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " and",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267486,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " lows",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267486,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " around",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267487,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " -",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267487,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "2",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267487,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "\u00b0C",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267487,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " (",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267487,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "28",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267487,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "\u00b0F",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267487,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ").\n",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267487,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "-",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267487,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " Spring",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267487,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " (",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267487,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "March",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267487,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " to",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267487,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " May",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267487,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "):",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267487,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " Cool",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267487,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " temperature",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267487,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ",",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267488,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " with",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267488,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " average",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267488,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " highs",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267488,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " around",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267488,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " ",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267488,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "18",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267488,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "\u00b0C",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267488,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " (",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267488,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "64",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267488,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "\u00b0F",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267488,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ")",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267488,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " and",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267488,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " lows",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267488,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " around",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267488,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " ",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267488,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "8",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267488,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "\u00b0C",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267489,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " (",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267489,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "46",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267489,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "\u00b0F",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267489,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ").\n",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267489,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "-",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267489,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " Summer",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267489,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " (",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267489,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "June",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267489,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " to",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267489,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " August",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267489,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "):",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267489,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " Hot",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267489,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " and",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267489,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " humid",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267489,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": ",",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267489,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " with",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267489,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " average",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267490,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": " highs",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267490,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-359",
+          "choices": [
+            {
+              "delta": {
+                "content": "",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": "stop",
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267490,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      }
+    ],
+    "is_streaming": true
+  }
+}
diff --git a/tests/integration/recordings/responses/73e97be515d9.json b/tests/integration/recordings/responses/73e97be515d9.json
new file mode 100644
index 000000000..6df3dd956
--- /dev/null
+++ b/tests/integration/recordings/responses/73e97be515d9.json
@@ -0,0 +1,106 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "llama3.2:3b-instruct-fp16",
+      "messages": [
+        {
+          "role": "user",
+          "content": "What's the weather in Tokyo? YOU MUST USE THE get_weather function to get the weather."
+        }
+      ],
+      "stream": true,
+      "tools": [
+        {
+          "type": "function",
+          "function": {
+            "type": "function",
+            "name": "get_weather",
+            "description": "Get the weather in a given city",
+            "parameters": {
+              "type": "object",
+              "properties": {
+                "city": {
+                  "type": "string",
+                  "description": "The city to get the weather for"
+                }
+              }
+            },
+            "strict": null
+          }
+        }
+      ]
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "llama3.2:3b-instruct-fp16"
+  },
+  "response": {
+    "body": [
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-116",
+          "choices": [
+            {
+              "delta": {
+                "content": "",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": [
+                  {
+                    "index": 0,
+                    "id": "call_0c2qffvv",
+                    "function": {
+                      "arguments": "{\"city\":\"Tokyo\"}",
+                      "name": "get_weather"
+                    },
+                    "type": "function"
+                  }
+                ]
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267492,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-116",
+          "choices": [
+            {
+              "delta": {
+                "content": "",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": "stop",
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267492,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      }
+    ],
+    "is_streaming": true
+  }
+}
diff --git a/tests/integration/recordings/responses/8aba89449cdc.json b/tests/integration/recordings/responses/8aba89449cdc.json
new file mode 100644
index 000000000..6aa6cd2c5
--- /dev/null
+++ b/tests/integration/recordings/responses/8aba89449cdc.json
@@ -0,0 +1,248 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "llama3.2:3b-instruct-fp16",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Message A: What is the capital of France?"
+        },
+        {
+          "role": "assistant",
+          "content": "The capital of France is Paris."
+        },
+        {
+          "role": "user",
+          "content": "Message B: What about Spain?"
+        },
+        {
+          "role": "assistant",
+          "content": "The capital of Spain is Madrid."
+        },
+        {
+          "role": "user",
+          "content": "Message C: And Italy?"
+        }
+      ],
+      "stream": true
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "llama3.2:3b-instruct-fp16"
+  },
+  "response": {
+    "body": [
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-676",
+          "choices": [
+            {
+              "delta": {
+                "content": "The",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267544,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-676",
+          "choices": [
+            {
+              "delta": {
+                "content": " capital",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267544,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-676",
+          "choices": [
+            {
+              "delta": {
+                "content": " of",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267544,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-676",
+          "choices": [
+            {
+              "delta": {
+                "content": " Italy",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267544,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-676",
+          "choices": [
+            {
+              "delta": {
+                "content": " is",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267544,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-676",
+          "choices": [
+            {
+              "delta": {
+                "content": " Rome",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267544,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-676",
+          "choices": [
+            {
+              "delta": {
+                "content": ".",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267544,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-676",
+          "choices": [
+            {
+              "delta": {
+                "content": "",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": "stop",
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759267544,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      }
+    ],
+    "is_streaming": true
+  }
+}
diff --git a/tests/integration/recordings/responses/cf776b1aa432.json b/tests/integration/recordings/responses/cf776b1aa432.json
new file mode 100644
index 000000000..c7449427a
--- /dev/null
+++ b/tests/integration/recordings/responses/cf776b1aa432.json
@@ -0,0 +1,232 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "llama3.2:3b-instruct-fp16",
+      "messages": [
+        {
+          "role": "user",
+          "content": "What is the capital of France?"
+        }
+      ],
+      "stream": true
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "llama3.2:3b-instruct-fp16"
+  },
+  "response": {
+    "body": [
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-78",
+          "choices": [
+            {
+              "delta": {
+                "content": "The",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759259077,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-78",
+          "choices": [
+            {
+              "delta": {
+                "content": " capital",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759259077,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-78",
+          "choices": [
+            {
+              "delta": {
+                "content": " of",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759259077,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-78",
+          "choices": [
+            {
+              "delta": {
+                "content": " France",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759259077,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-78",
+          "choices": [
+            {
+              "delta": {
+                "content": " is",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759259077,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-78",
+          "choices": [
+            {
+              "delta": {
+                "content": " Paris",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759259077,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-78",
+          "choices": [
+            {
+              "delta": {
+                "content": ".",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759259077,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-78",
+          "choices": [
+            {
+              "delta": {
+                "content": "",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": "stop",
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1759259077,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      }
+    ],
+    "is_streaming": true
+  }
+}
diff --git a/tests/integration/recordings/responses/d0ac68cbde69.json b/tests/integration/recordings/responses/d0ac68cbde69.json
index 78784e0ca..4dcc6a69b 100644
--- a/tests/integration/recordings/responses/d0ac68cbde69.json
+++ b/tests/integration/recordings/responses/d0ac68cbde69.json
@@ -13,12 +13,12 @@
       "__data__": {
         "models": [
           {
-            "model": "llama3.2:3b",
-            "name": "llama3.2:3b",
-            "digest": "a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72",
-            "expires_at": "2025-09-27T11:54:56.718552-07:00",
-            "size": 3367856128,
-            "size_vram": 3367856128,
+            "model": "llama3.2:3b-instruct-fp16",
+            "name": "llama3.2:3b-instruct-fp16",
+            "digest": "195a8c01d91ec3cb1e0aad4624a51f2602c51fa7d96110f8ab5a20c84081804d",
+            "expires_at": "2025-09-30T14:29:52.682809-07:00",
+            "size": 8581748736,
+            "size_vram": 8581748736,
             "details": {
               "parent_model": "",
               "format": "gguf",
@@ -27,9 +27,9 @@
                 "llama"
               ],
               "parameter_size": "3.2B",
-              "quantization_level": "Q4_K_M"
+              "quantization_level": "F16"
             },
-            "context_length": 4096
+            "context_length": null
           }
         ]
       }
diff --git a/tests/integration/recordings/responses/models-7d9446738fd7-d5d684a3.json b/tests/integration/recordings/responses/models-7d9446738fd7-d5d684a3.json
index a76f0ba8f..d9917b2ec 100644
--- a/tests/integration/recordings/responses/models-7d9446738fd7-d5d684a3.json
+++ b/tests/integration/recordings/responses/models-7d9446738fd7-d5d684a3.json
@@ -22,19 +22,6 @@
           "supports_tools": false
         }
       },
-      {
-        "__type__": "openai.types.model.Model",
-        "__data__": {
-          "id": "accounts/tvergho-87e44d/models/debatecards-70b-ft-3epoch-dpo-v2",
-          "created": 1743381121,
-          "object": "model",
-          "owned_by": "tvergho-87e44d",
-          "kind": "HF_PEFT_ADDON",
-          "supports_chat": true,
-          "supports_image_input": false,
-          "supports_tools": false
-        }
-      },
       {
         "__type__": "openai.types.model.Model",
         "__data__": {
@@ -75,20 +62,6 @@
           "context_length": 131072
         }
       },
-      {
-        "__type__": "openai.types.model.Model",
-        "__data__": {
-          "id": "accounts/fireworks/models/deepseek-v3",
-          "created": 1735576668,
-          "object": "model",
-          "owned_by": "fireworks",
-          "kind": "HF_BASE_MODEL",
-          "supports_chat": true,
-          "supports_image_input": false,
-          "supports_tools": true,
-          "context_length": 131072
-        }
-      },
       {
         "__type__": "openai.types.model.Model",
         "__data__": {
@@ -259,17 +232,45 @@
       {
         "__type__": "openai.types.model.Model",
         "__data__": {
-          "id": "accounts/fireworks/models/qwen3-coder-30b-a3b-instruct",
-          "created": 1754063588,
+          "id": "accounts/fireworks/models/kimi-k2-instruct-0905",
+          "created": 1757018994,
           "object": "model",
           "owned_by": "fireworks",
           "kind": "HF_BASE_MODEL",
           "supports_chat": true,
           "supports_image_input": false,
-          "supports_tools": false,
+          "supports_tools": true,
           "context_length": 262144
         }
       },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "accounts/fireworks/models/glm-4p5",
+          "created": 1753809636,
+          "object": "model",
+          "owned_by": "fireworks",
+          "kind": "HF_BASE_MODEL",
+          "supports_chat": true,
+          "supports_image_input": false,
+          "supports_tools": true,
+          "context_length": 131072
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "accounts/fireworks/models/deepseek-v3",
+          "created": 1735576668,
+          "object": "model",
+          "owned_by": "fireworks",
+          "kind": "HF_BASE_MODEL",
+          "supports_chat": true,
+          "supports_image_input": false,
+          "supports_tools": true,
+          "context_length": 131072
+        }
+      },
       {
         "__type__": "openai.types.model.Model",
         "__data__": {
@@ -284,20 +285,6 @@
           "context_length": 131072
         }
       },
-      {
-        "__type__": "openai.types.model.Model",
-        "__data__": {
-          "id": "accounts/fireworks/models/qwen2p5-vl-32b-instruct",
-          "created": 1743392739,
-          "object": "model",
-          "owned_by": "fireworks",
-          "kind": "HF_BASE_MODEL",
-          "supports_chat": true,
-          "supports_image_input": true,
-          "supports_tools": false,
-          "context_length": 128000
-        }
-      },
       {
         "__type__": "openai.types.model.Model",
         "__data__": {
@@ -395,34 +382,6 @@
           "supports_tools": false
         }
       },
-      {
-        "__type__": "openai.types.model.Model",
-        "__data__": {
-          "id": "accounts/fireworks/models/glm-4p5",
-          "created": 1753809636,
-          "object": "model",
-          "owned_by": "fireworks",
-          "kind": "HF_BASE_MODEL",
-          "supports_chat": true,
-          "supports_image_input": false,
-          "supports_tools": true,
-          "context_length": 131072
-        }
-      },
-      {
-        "__type__": "openai.types.model.Model",
-        "__data__": {
-          "id": "accounts/fireworks/models/kimi-k2-instruct-0905",
-          "created": 1757018994,
-          "object": "model",
-          "owned_by": "fireworks",
-          "kind": "HF_BASE_MODEL",
-          "supports_chat": true,
-          "supports_image_input": false,
-          "supports_tools": true,
-          "context_length": 262144
-        }
-      },
       {
         "__type__": "openai.types.model.Model",
         "__data__": {
@@ -520,6 +479,47 @@
           "supports_tools": false,
           "context_length": 262144
         }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "accounts/fireworks/models/qwen2p5-vl-32b-instruct",
+          "created": 1743392739,
+          "object": "model",
+          "owned_by": "fireworks",
+          "kind": "HF_BASE_MODEL",
+          "supports_chat": true,
+          "supports_image_input": true,
+          "supports_tools": false,
+          "context_length": 128000
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "accounts/fireworks/models/qwen3-coder-30b-a3b-instruct",
+          "created": 1754063588,
+          "object": "model",
+          "owned_by": "fireworks",
+          "kind": "HF_BASE_MODEL",
+          "supports_chat": true,
+          "supports_image_input": false,
+          "supports_tools": false,
+          "context_length": 262144
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "accounts/tvergho-87e44d/models/debatecards-70b-ft-3epoch-dpo-v2",
+          "created": 1743381121,
+          "object": "model",
+          "owned_by": "tvergho-87e44d",
+          "kind": "HF_PEFT_ADDON",
+          "supports_chat": true,
+          "supports_image_input": false,
+          "supports_tools": false
+        }
       }
     ],
     "is_streaming": false
diff --git a/tests/integration/recordings/responses/models-bd032f995f2a-7467c0cf.json b/tests/integration/recordings/responses/models-bd032f995f2a-7467c0cf.json
new file mode 100644
index 000000000..00c447dcc
--- /dev/null
+++ b/tests/integration/recordings/responses/models-bd032f995f2a-7467c0cf.json
@@ -0,0 +1,69 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/models",
+    "headers": {},
+    "body": {},
+    "endpoint": "/v1/models",
+    "model": ""
+  },
+  "response": {
+    "body": [
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "nomic-embed-text:latest",
+          "created": 1754610899,
+          "object": "model",
+          "owned_by": "library"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "llama-guard3:1b",
+          "created": 1754088388,
+          "object": "model",
+          "owned_by": "library"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "all-minilm:l6-v2",
+          "created": 1753826826,
+          "object": "model",
+          "owned_by": "library"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "all-minilm:latest",
+          "created": 1749064003,
+          "object": "model",
+          "owned_by": "library"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "llama3.1:8b-instruct-fp16",
+          "created": 1739575404,
+          "object": "model",
+          "owned_by": "library"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "llama3.2:3b-instruct-fp16",
+          "created": 1737496003,
+          "object": "model",
+          "owned_by": "library"
+        }
+      }
+    ],
+    "is_streaming": false
+  }
+}
diff --git a/tests/integration/recordings/responses/models-bd032f995f2a-ebaa996d.json b/tests/integration/recordings/responses/models-bd032f995f2a-ebaa996d.json
new file mode 100644
index 000000000..c460d6977
--- /dev/null
+++ b/tests/integration/recordings/responses/models-bd032f995f2a-ebaa996d.json
@@ -0,0 +1,798 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "https://api.openai.com/v1/v1/models",
+    "headers": {},
+    "body": {},
+    "endpoint": "/v1/models",
+    "model": ""
+  },
+  "response": {
+    "body": [
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4-0613",
+          "created": 1686588896,
+          "object": "model",
+          "owned_by": "openai"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4",
+          "created": 1687882411,
+          "object": "model",
+          "owned_by": "openai"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-3.5-turbo",
+          "created": 1677610602,
+          "object": "model",
+          "owned_by": "openai"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-5-codex",
+          "created": 1757527818,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-audio-2025-08-28",
+          "created": 1756256146,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-realtime",
+          "created": 1756271701,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-realtime-2025-08-28",
+          "created": 1756271773,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-audio",
+          "created": 1756339249,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "davinci-002",
+          "created": 1692634301,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "babbage-002",
+          "created": 1692634615,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-3.5-turbo-instruct",
+          "created": 1692901427,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-3.5-turbo-instruct-0914",
+          "created": 1694122472,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "dall-e-3",
+          "created": 1698785189,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "dall-e-2",
+          "created": 1698798177,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4-1106-preview",
+          "created": 1698957206,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-3.5-turbo-1106",
+          "created": 1698959748,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "tts-1-hd",
+          "created": 1699046015,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "tts-1-1106",
+          "created": 1699053241,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "tts-1-hd-1106",
+          "created": 1699053533,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "text-embedding-3-small",
+          "created": 1705948997,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "text-embedding-3-large",
+          "created": 1705953180,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4-0125-preview",
+          "created": 1706037612,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4-turbo-preview",
+          "created": 1706037777,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-3.5-turbo-0125",
+          "created": 1706048358,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4-turbo",
+          "created": 1712361441,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4-turbo-2024-04-09",
+          "created": 1712601677,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o",
+          "created": 1715367049,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-2024-05-13",
+          "created": 1715368132,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini-2024-07-18",
+          "created": 1721172717,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini",
+          "created": 1721172741,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-2024-08-06",
+          "created": 1722814719,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "chatgpt-4o-latest",
+          "created": 1723515131,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o1-mini-2024-09-12",
+          "created": 1725648979,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o1-mini",
+          "created": 1725649008,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-realtime-preview-2024-10-01",
+          "created": 1727131766,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-audio-preview-2024-10-01",
+          "created": 1727389042,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-audio-preview",
+          "created": 1727460443,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-realtime-preview",
+          "created": 1727659998,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "omni-moderation-latest",
+          "created": 1731689265,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "omni-moderation-2024-09-26",
+          "created": 1732734466,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-realtime-preview-2024-12-17",
+          "created": 1733945430,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-audio-preview-2024-12-17",
+          "created": 1734034239,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini-realtime-preview-2024-12-17",
+          "created": 1734112601,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini-audio-preview-2024-12-17",
+          "created": 1734115920,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o1-2024-12-17",
+          "created": 1734326976,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o1",
+          "created": 1734375816,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini-realtime-preview",
+          "created": 1734387380,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini-audio-preview",
+          "created": 1734387424,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o3-mini",
+          "created": 1737146383,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o3-mini-2025-01-31",
+          "created": 1738010200,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-2024-11-20",
+          "created": 1739331543,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-search-preview-2025-03-11",
+          "created": 1741388170,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-search-preview",
+          "created": 1741388720,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini-search-preview-2025-03-11",
+          "created": 1741390858,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini-search-preview",
+          "created": 1741391161,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-transcribe",
+          "created": 1742068463,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini-transcribe",
+          "created": 1742068596,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o1-pro-2025-03-19",
+          "created": 1742251504,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o1-pro",
+          "created": 1742251791,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-mini-tts",
+          "created": 1742403959,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o3-2025-04-16",
+          "created": 1744133301,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o4-mini-2025-04-16",
+          "created": 1744133506,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o3",
+          "created": 1744225308,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o4-mini",
+          "created": 1744225351,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4.1-2025-04-14",
+          "created": 1744315746,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4.1",
+          "created": 1744316542,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4.1-mini-2025-04-14",
+          "created": 1744317547,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4.1-mini",
+          "created": 1744318173,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4.1-nano-2025-04-14",
+          "created": 1744321025,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4.1-nano",
+          "created": 1744321707,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-image-1",
+          "created": 1745517030,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "codex-mini-latest",
+          "created": 1746673257,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-realtime-preview-2025-06-03",
+          "created": 1748907838,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-4o-audio-preview-2025-06-03",
+          "created": 1748908498,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o4-mini-deep-research",
+          "created": 1749685485,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "o4-mini-deep-research-2025-06-26",
+          "created": 1750866121,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-5-chat-latest",
+          "created": 1754073306,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-5-2025-08-07",
+          "created": 1754075360,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-5",
+          "created": 1754425777,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-5-mini-2025-08-07",
+          "created": 1754425867,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-5-mini",
+          "created": 1754425928,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-5-nano-2025-08-07",
+          "created": 1754426303,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-5-nano",
+          "created": 1754426384,
+          "object": "model",
+          "owned_by": "system"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-3.5-turbo-16k",
+          "created": 1683758102,
+          "object": "model",
+          "owned_by": "openai-internal"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "tts-1",
+          "created": 1681940951,
+          "object": "model",
+          "owned_by": "openai-internal"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "whisper-1",
+          "created": 1677532384,
+          "object": "model",
+          "owned_by": "openai-internal"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "text-embedding-ada-002",
+          "created": 1671217299,
+          "object": "model",
+          "owned_by": "openai-internal"
+        }
+      }
+    ],
+    "is_streaming": false
+  }
+}
diff --git a/tests/unit/providers/agents/meta_reference/test_openai_responses.py b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
index 38ce365c1..5e5914a03 100644
--- a/tests/unit/providers/agents/meta_reference/test_openai_responses.py
+++ b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
@@ -37,7 +37,6 @@ from llama_stack.apis.inference import (
     OpenAIJSONSchema,
     OpenAIResponseFormatJSONObject,
     OpenAIResponseFormatJSONSchema,
-    OpenAIResponseFormatText,
     OpenAIUserMessageParam,
 )
 from llama_stack.apis.tools.tools import Tool, ToolGroups, ToolInvocationResult, ToolParameter, ToolRuntime
@@ -148,7 +147,7 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m
     mock_inference_api.openai_chat_completion.assert_called_once_with(
         model=model,
         messages=[OpenAIUserMessageParam(role="user", content="What is the capital of Ireland?", name=None)],
-        response_format=OpenAIResponseFormatText(),
+        response_format=None,
         tools=None,
         stream=True,
         temperature=0.1,
@@ -823,16 +822,16 @@ async def test_store_response_uses_rehydrated_input_with_previous_response(
 @pytest.mark.parametrize(
     "text_format, response_format",
     [
-        (OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")), OpenAIResponseFormatText()),
+        (OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")), None),
         (
             OpenAIResponseText(format=OpenAIResponseTextFormat(name="Test", schema={"foo": "bar"}, type="json_schema")),
             OpenAIResponseFormatJSONSchema(json_schema=OpenAIJSONSchema(name="Test", schema={"foo": "bar"})),
         ),
         (OpenAIResponseText(format=OpenAIResponseTextFormat(type="json_object")), OpenAIResponseFormatJSONObject()),
-        # ensure text param with no format specified defaults to text
-        (OpenAIResponseText(format=None), OpenAIResponseFormatText()),
-        # ensure text param of None defaults to text
-        (None, OpenAIResponseFormatText()),
+        # ensure text param with no format specified defaults to None
+        (OpenAIResponseText(format=None), None),
+        # ensure text param of None defaults to None
+        (None, None),
     ],
 )
 async def test_create_openai_response_with_text_format(
@@ -855,7 +854,6 @@ async def test_create_openai_response_with_text_format(
     # Verify
     first_call = mock_inference_api.openai_chat_completion.call_args_list[0]
     assert first_call.kwargs["messages"][0].content == input_text
-    assert first_call.kwargs["response_format"] is not None
     assert first_call.kwargs["response_format"] == response_format
 
 
From 168b42cd990add51575a5f83ca1c9afa33d5f91e Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Wed, 1 Oct 2025 09:28:58 -0700
Subject: [PATCH 28/33] fix: log level

# What does this PR do?
- categories like "core::server" are not recognized, so their level is not set by 'all=debug'
- removed spammy telemetry debug logging

## Test Plan
test server launched with LLAMA_STACK_LOGGING='all=debug'
---
 llama_stack/log.py                               | 11 ++++++++++-
 llama_stack/providers/utils/telemetry/tracing.py |  3 ---
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/llama_stack/log.py b/llama_stack/log.py
index cc4c9d4cf..2a11516fa 100644
--- a/llama_stack/log.py
+++ b/llama_stack/log.py
@@ -247,7 +247,16 @@ def get_logger(
         _category_levels.update(parse_yaml_config(config))
 
     logger = logging.getLogger(name)
-    logger.setLevel(_category_levels.get(category, DEFAULT_LOG_LEVEL))
+    if category in _category_levels:
+        log_level = _category_levels[category]
+    else:
+        root_category = category.split("::")[0]
+        if root_category in _category_levels:
+            log_level = _category_levels[root_category]
+        else:
+            log_level = _category_levels.get("root", DEFAULT_LOG_LEVEL)
+            logging.warning(f"Unknown logging category: {category}. Falling back to default 'root' level: {log_level}")
+    logger.setLevel(log_level)
     return logging.LoggerAdapter(logger, {"category": category})
 
 
diff --git a/llama_stack/providers/utils/telemetry/tracing.py b/llama_stack/providers/utils/telemetry/tracing.py
index 62cceb13e..58bf8603a 100644
--- a/llama_stack/providers/utils/telemetry/tracing.py
+++ b/llama_stack/providers/utils/telemetry/tracing.py
@@ -317,7 +317,6 @@ class SpanContextManager:
         global CURRENT_TRACE_CONTEXT
         context = CURRENT_TRACE_CONTEXT.get()
         if not context:
-            logger.debug("No trace context to pop span")
             return
 
         context.pop_span()
@@ -332,7 +331,6 @@ class SpanContextManager:
         global CURRENT_TRACE_CONTEXT
         context = CURRENT_TRACE_CONTEXT.get()
         if not context:
-            logger.debug("No trace context to push span")
             return self
 
         self.span = context.push_span(self.name, self.attributes)
@@ -342,7 +340,6 @@ class SpanContextManager:
         global CURRENT_TRACE_CONTEXT
         context = CURRENT_TRACE_CONTEXT.get()
         if not context:
-            logger.debug("No trace context to pop span")
             return
 
         context.pop_span()

From f675b09b69233665a1a08f2722d7e56623f6014a Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Wed, 1 Oct 2025 09:39:50 -0700
Subject: [PATCH 29/33] fix: log level

# What does this PR do?
- categories like "core::server" are not recognized, so their level is not set by 'all=debug'
- removed spammy telemetry debug logging

## Test Plan
test server launched with LLAMA_STACK_LOGGING='all=debug'
---
 llama_stack/log.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/llama_stack/log.py b/llama_stack/log.py
index cc4c9d4cf..2a11516fa 100644
--- a/llama_stack/log.py
+++ b/llama_stack/log.py
@@ -247,7 +247,16 @@ def get_logger(
         _category_levels.update(parse_yaml_config(config))
 
     logger = logging.getLogger(name)
-    logger.setLevel(_category_levels.get(category, DEFAULT_LOG_LEVEL))
+    if category in _category_levels:
+        log_level = _category_levels[category]
+    else:
+        root_category = category.split("::")[0]
+        if root_category in _category_levels:
+            log_level = _category_levels[root_category]
+        else:
+            log_level = _category_levels.get("root", DEFAULT_LOG_LEVEL)
+            logging.warning(f"Unknown logging category: {category}. Falling back to default 'root' level: {log_level}")
+    logger.setLevel(log_level)
     return logging.LoggerAdapter(logger, {"category": category})
 
 
From ccaa9208f77926a3b562c13368b817277b733d1c Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Thu, 2 Oct 2025 11:01:57 -0700
Subject: [PATCH 30/33] chore: fix/add logging categories

# What does this PR do?


## Test Plan
---
 llama_stack/log.py                                 | 14 ++++++++++++--
 .../meta_reference/responses/openai_responses.py   |  2 +-
 .../providers/utils/inference/inference_store.py   |  2 +-
 .../providers/utils/responses/responses_store.py   |  2 +-
 4 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/llama_stack/log.py b/llama_stack/log.py
index 2a11516fa..729b2b8c5 100644
--- a/llama_stack/log.py
+++ b/llama_stack/log.py
@@ -31,7 +31,14 @@ CATEGORIES = [
     "client",
     "telemetry",
     "openai_responses",
+    "testing",
+    "providers",
+    "models",
+    "files",
+    "vector_io",
+    "tool_runtime",
 ]
+UNCATEGORIZED = "uncategorized"
 
 # Initialize category levels with default level
 _category_levels: dict[str, int] = dict.fromkeys(CATEGORIES, DEFAULT_LOG_LEVEL)
@@ -165,7 +172,7 @@ def setup_logging(category_levels: dict[str, int], log_file: str | None) -> None
 
         def filter(self, record):
             if not hasattr(record, "category"):
-                record.category = "uncategorized"  # Default to 'uncategorized' if no category found
+                record.category = UNCATEGORIZED  # Default to 'uncategorized' if no category found
             return True
 
     # Determine the root logger's level (default to WARNING if not specified)
@@ -255,7 +262,10 @@ def get_logger(
             log_level = _category_levels[root_category]
         else:
             log_level = _category_levels.get("root", DEFAULT_LOG_LEVEL)
-            logging.warning(f"Unknown logging category: {category}. Falling back to default 'root' level: {log_level}")
+            if category != UNCATEGORIZED:
+                logging.warning(
+                    f"Unknown logging category: {category}. Falling back to default 'root' level: {log_level}"
+                )
     logger.setLevel(log_level)
     return logging.LoggerAdapter(logger, {"category": category})
 
diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py b/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
index c27dc8467..1a6d75710 100644
--- a/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
+++ b/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
@@ -41,7 +41,7 @@ from .utils import (
     convert_response_text_to_chat_response_format,
 )
 
-logger = get_logger(name=__name__, category="openai::responses")
+logger = get_logger(name=__name__, category="openai_responses")
 
 
 class OpenAIResponsePreviousResponseWithInputItems(BaseModel):
diff --git a/llama_stack/providers/utils/inference/inference_store.py b/llama_stack/providers/utils/inference/inference_store.py
index ffc9f3e11..901f77c67 100644
--- a/llama_stack/providers/utils/inference/inference_store.py
+++ b/llama_stack/providers/utils/inference/inference_store.py
@@ -22,7 +22,7 @@ from ..sqlstore.api import ColumnDefinition, ColumnType
 from ..sqlstore.authorized_sqlstore import AuthorizedSqlStore
 from ..sqlstore.sqlstore import SqlStoreConfig, SqlStoreType, sqlstore_impl
 
-logger = get_logger(name=__name__, category="inference_store")
+logger = get_logger(name=__name__, category="inference")
 
 
 class InferenceStore:
diff --git a/llama_stack/providers/utils/responses/responses_store.py b/llama_stack/providers/utils/responses/responses_store.py
index b9fceb1ab..cb665b88e 100644
--- a/llama_stack/providers/utils/responses/responses_store.py
+++ b/llama_stack/providers/utils/responses/responses_store.py
@@ -25,7 +25,7 @@ from ..sqlstore.api import ColumnDefinition, ColumnType
 from ..sqlstore.authorized_sqlstore import AuthorizedSqlStore
 from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, SqlStoreType, sqlstore_impl
 
-logger = get_logger(name=__name__, category="responses_store")
+logger = get_logger(name=__name__, category="openai_responses")
 
 
 class ResponsesStore:

From 631e9d7762966089abc133337e6a099f9c568f23 Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Thu, 2 Oct 2025 14:50:38 -0700
Subject: [PATCH 31/33] chore: fix precommit

# What does this PR do?


## Test Plan
---
 docs/docs/providers/agents/index.mdx    |  4 ++--
 docs/docs/providers/batches/index.mdx   | 24 ++++++++++++------------
 docs/docs/providers/inference/index.mdx | 12 ++++++------
 3 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/docs/docs/providers/agents/index.mdx b/docs/docs/providers/agents/index.mdx
index 52b92734e..06eb104af 100644
--- a/docs/docs/providers/agents/index.mdx
+++ b/docs/docs/providers/agents/index.mdx
@@ -1,7 +1,7 @@
 ---
 description: "Agents
 
-APIs for creating and interacting with agentic systems."
+    APIs for creating and interacting with agentic systems."
 sidebar_label: Agents
 title: Agents
 ---
@@ -12,6 +12,6 @@ title: Agents
 
 Agents
 
-APIs for creating and interacting with agentic systems.
+    APIs for creating and interacting with agentic systems.
 
 This section contains documentation for all available providers for the **agents** API.
diff --git a/docs/docs/providers/batches/index.mdx b/docs/docs/providers/batches/index.mdx
index 18e5e314d..2c64b277f 100644
--- a/docs/docs/providers/batches/index.mdx
+++ b/docs/docs/providers/batches/index.mdx
@@ -1,14 +1,14 @@
 ---
 description: "The Batches API enables efficient processing of multiple requests in a single operation,
-particularly useful for processing large datasets, batch evaluation workflows, and
-cost-effective inference at scale.
+    particularly useful for processing large datasets, batch evaluation workflows, and
+    cost-effective inference at scale.
 
-The API is designed to allow use of openai client libraries for seamless integration.
+    The API is designed to allow use of openai client libraries for seamless integration.
 
-This API provides the following extensions:
- - idempotent batch creation
+    This API provides the following extensions:
+     - idempotent batch creation
 
-Note: This API is currently under active development and may undergo changes."
+    Note: This API is currently under active development and may undergo changes."
 sidebar_label: Batches
 title: Batches
 ---
@@ -18,14 +18,14 @@ title: Batches
 ## Overview
 
 The Batches API enables efficient processing of multiple requests in a single operation,
-particularly useful for processing large datasets, batch evaluation workflows, and
-cost-effective inference at scale.
+    particularly useful for processing large datasets, batch evaluation workflows, and
+    cost-effective inference at scale.
 
-The API is designed to allow use of openai client libraries for seamless integration.
+    The API is designed to allow use of openai client libraries for seamless integration.
 
-This API provides the following extensions:
- - idempotent batch creation
+    This API provides the following extensions:
+     - idempotent batch creation
 
-Note: This API is currently under active development and may undergo changes.
+    Note: This API is currently under active development and may undergo changes.
 
 This section contains documentation for all available providers for the **batches** API.
diff --git a/docs/docs/providers/inference/index.mdx b/docs/docs/providers/inference/index.mdx
index 1dc479675..ebbaf1be1 100644
--- a/docs/docs/providers/inference/index.mdx
+++ b/docs/docs/providers/inference/index.mdx
@@ -1,9 +1,9 @@
 ---
 description: "Llama Stack Inference API for generating completions, chat completions, and embeddings.
 
-This API provides the raw interface to the underlying models. Two kinds of models are supported:
-- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.
-- Embedding models: these models generate embeddings to be used for semantic search."
+    This API provides the raw interface to the underlying models. Two kinds of models are supported:
+    - LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.
+    - Embedding models: these models generate embeddings to be used for semantic search."
 sidebar_label: Inference
 title: Inference
 ---
@@ -14,8 +14,8 @@ title: Inference
 
 Llama Stack Inference API for generating completions, chat completions, and embeddings.
 
-This API provides the raw interface to the underlying models. Two kinds of models are supported:
-- LLM models: these models generate "raw" and "chat" (conversational) completions.
-- Embedding models: these models generate embeddings to be used for semantic search.
+    This API provides the raw interface to the underlying models. Two kinds of models are supported:
+    - LLM models: these models generate "raw" and "chat" (conversational) completions.
+    - Embedding models: these models generate embeddings to be used for semantic search.
 
 This section contains documentation for all available providers for the **inference** API.

From fdafbd6ec2b2f070b531f5fbc71a3d566c23dc2f Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Mon, 6 Oct 2025 14:30:20 -0700
Subject: [PATCH 32/33] chore: remove --env from `llama stack run`

# What does this PR do?


## Test Plan
---
 docs/docs/building_applications/tools.mdx     |  9 ++--
 docs/docs/contributing/new_api_provider.mdx   |  2 +-
 docs/docs/distributions/building_distro.mdx   | 17 ++++---
 docs/docs/distributions/configuration.mdx     |  9 ++--
 .../remote_hosted_distro/watsonx.md           |  8 ++--
 .../distributions/self_hosted_distro/dell.md  | 44 +++++++++----------
 .../self_hosted_distro/meta-reference-gpu.md  | 20 ++++-----
 .../self_hosted_distro/nvidia.md              | 10 ++---
 .../getting_started/detailed_tutorial.mdx     |  8 ++--
 docs/getting_started_llama4.ipynb             |  2 +-
 docs/zero_to_hero_guide/README.md             |  8 ++--
 llama_stack/cli/stack/run.py                  | 30 +------------
 llama_stack/core/stack.py                     | 16 -------
 llama_stack/core/start_stack.sh               | 13 +-----
 .../distributions/dell/doc_template.md        | 42 +++++++++---------
 .../meta-reference-gpu/doc_template.md        | 20 ++++-----
 .../distributions/nvidia/doc_template.md      | 10 ++---
 scripts/install.sh                            |  4 +-
 18 files changed, 105 insertions(+), 167 deletions(-)

diff --git a/docs/docs/building_applications/tools.mdx b/docs/docs/building_applications/tools.mdx
index e5d9c46f9..3b78ec57b 100644
--- a/docs/docs/building_applications/tools.mdx
+++ b/docs/docs/building_applications/tools.mdx
@@ -219,13 +219,10 @@ group_tools = client.tools.list_tools(toolgroup_id="search_tools")
 <TabItem value="setup" label="Setup & Configuration">
 
 1. Start by registering a Tavily API key at [Tavily](https://tavily.com/).
-2. [Optional] Provide the API key directly to the Llama Stack server
+2. [Optional] Set the API key in your environment before starting the Llama Stack server
 ```bash
 export TAVILY_SEARCH_API_KEY="your key"
 ```
-```bash
---env TAVILY_SEARCH_API_KEY=${TAVILY_SEARCH_API_KEY}
-```
 
 </TabItem>
 <TabItem value="implementation" label="Implementation">
@@ -273,9 +270,9 @@ for log in EventLogger().log(response):
 <TabItem value="setup" label="Setup & Configuration">
 
 1. Start by registering for a WolframAlpha API key at [WolframAlpha Developer Portal](https://developer.wolframalpha.com/access).
-2. Provide the API key either when starting the Llama Stack server:
+2. Provide the API key either by setting it in your environment before starting the Llama Stack server:
     ```bash
-    --env WOLFRAM_ALPHA_API_KEY=${WOLFRAM_ALPHA_API_KEY}
+    export WOLFRAM_ALPHA_API_KEY="your key"
     ```
     or from the client side:
     ```python
diff --git a/docs/docs/contributing/new_api_provider.mdx b/docs/docs/contributing/new_api_provider.mdx
index 4ae6d5e72..6f9744771 100644
--- a/docs/docs/contributing/new_api_provider.mdx
+++ b/docs/docs/contributing/new_api_provider.mdx
@@ -76,7 +76,7 @@ Integration tests are located in [tests/integration](https://github.com/meta-lla
 Consult [tests/integration/README.md](https://github.com/meta-llama/llama-stack/blob/main/tests/integration/README.md) for more details on how to run the tests.
 
 Note that each provider's `sample_run_config()` method (in the configuration class for that provider)
- typically references some environment variables for specifying API keys and the like. You can set these in the environment or pass these via the `--env` flag to the test command.
+ typically references some environment variables for specifying API keys and the like. You can set these in the environment before running the test command.
 
 
 ### 2. Unit Testing
diff --git a/docs/docs/distributions/building_distro.mdx b/docs/docs/distributions/building_distro.mdx
index 5b65b7f16..5ffb623b5 100644
--- a/docs/docs/distributions/building_distro.mdx
+++ b/docs/docs/distributions/building_distro.mdx
@@ -289,10 +289,10 @@ After this step is successful, you should be able to find the built container im
 docker run -d \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ~/.llama:/root/.llama \
+  -e INFERENCE_MODEL=$INFERENCE_MODEL \
+  -e OLLAMA_URL=http://host.docker.internal:11434 \
   localhost/distribution-ollama:dev \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env OLLAMA_URL=http://host.docker.internal:11434
+  --port $LLAMA_STACK_PORT
 ```
 
 Here are the docker flags and their uses:
@@ -305,12 +305,12 @@ Here are the docker flags and their uses:
 
 * `localhost/distribution-ollama:dev`: The name and tag of the container image to run
 
+* `-e INFERENCE_MODEL=$INFERENCE_MODEL`: Sets the INFERENCE_MODEL environment variable in the container
+
+* `-e OLLAMA_URL=http://host.docker.internal:11434`: Sets the OLLAMA_URL environment variable in the container
+
 * `--port $LLAMA_STACK_PORT`: Port number for the server to listen on
 
-* `--env INFERENCE_MODEL=$INFERENCE_MODEL`: Sets the model to use for inference
-
-* `--env OLLAMA_URL=http://host.docker.internal:11434`: Configures the URL for the Ollama service
-
 </TabItem>
 </Tabs>
 
@@ -320,7 +320,7 @@ Now, let's start the Llama Stack Distribution Server. You will need the YAML con
 
 ```
 llama stack run -h
-usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--env KEY=VALUE]
+usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME]
                        [--image-type {venv}] [--enable-ui]
                        [config | template]
 
@@ -334,7 +334,6 @@ options:
   --port PORT           Port to run the server on. It can also be passed via the env var LLAMA_STACK_PORT. (default: 8321)
   --image-name IMAGE_NAME
                         Name of the image to run. Defaults to the current environment (default: None)
-  --env KEY=VALUE       Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times. (default: None)
   --image-type {venv}
                         Image Type used during the build. This should be venv. (default: None)
   --enable-ui           Start the UI server (default: False)
diff --git a/docs/docs/distributions/configuration.mdx b/docs/docs/distributions/configuration.mdx
index dbf879024..81243c97b 100644
--- a/docs/docs/distributions/configuration.mdx
+++ b/docs/docs/distributions/configuration.mdx
@@ -101,7 +101,7 @@ A few things to note:
 - The id is a string you can choose freely.
 - You can instantiate any number of provider instances of the same type.
 - The configuration dictionary is provider-specific.
-- Notice that configuration can reference environment variables (with default values), which are expanded at runtime. When you run a stack server (via docker or via `llama stack run`), you can specify `--env OLLAMA_URL=http://my-server:11434` to override the default value.
+- Notice that configuration can reference environment variables (with default values), which are expanded at runtime. When you run a stack server, you can set environment variables in your shell before running `llama stack run` to override the default values.
 
 ### Environment Variable Substitution
 
@@ -173,13 +173,10 @@ optional_token: ${env.OPTIONAL_TOKEN:+}
 
 #### Runtime Override
 
-You can override environment variables at runtime when starting the server:
+You can override environment variables at runtime by setting them in your shell before starting the server:
 
 ```bash
-# Override specific environment variables
-llama stack run --config run.yaml --env API_KEY=sk-123 --env BASE_URL=https://custom-api.com
-
-# Or set them in your shell
+# Set environment variables in your shell
 export API_KEY=sk-123
 export BASE_URL=https://custom-api.com
 llama stack run --config run.yaml
diff --git a/docs/docs/distributions/remote_hosted_distro/watsonx.md b/docs/docs/distributions/remote_hosted_distro/watsonx.md
index 977af90dd..5add678f3 100644
--- a/docs/docs/distributions/remote_hosted_distro/watsonx.md
+++ b/docs/docs/distributions/remote_hosted_distro/watsonx.md
@@ -69,10 +69,10 @@ docker run \
   -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ./run.yaml:/root/my-run.yaml \
+  -e WATSONX_API_KEY=$WATSONX_API_KEY \
+  -e WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID \
+  -e WATSONX_BASE_URL=$WATSONX_BASE_URL \
   llamastack/distribution-watsonx \
   --config /root/my-run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env WATSONX_API_KEY=$WATSONX_API_KEY \
-  --env WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID \
-  --env WATSONX_BASE_URL=$WATSONX_BASE_URL
+  --port $LLAMA_STACK_PORT
 ```
diff --git a/docs/docs/distributions/self_hosted_distro/dell.md b/docs/docs/distributions/self_hosted_distro/dell.md
index 52d40cf9d..851eac3bf 100644
--- a/docs/docs/distributions/self_hosted_distro/dell.md
+++ b/docs/docs/distributions/self_hosted_distro/dell.md
@@ -129,11 +129,11 @@ docker run -it \
   # NOTE: mount the llama-stack / llama-model directories if testing local changes else not needed
   -v $HOME/git/llama-stack:/app/llama-stack-source -v $HOME/git/llama-models:/app/llama-models-source \
   # localhost/distribution-dell:dev if building / testing locally
-  llamastack/distribution-dell\
-  --port $LLAMA_STACK_PORT  \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env DEH_URL=$DEH_URL \
-  --env CHROMA_URL=$CHROMA_URL
+  -e INFERENCE_MODEL=$INFERENCE_MODEL \
+  -e DEH_URL=$DEH_URL \
+  -e CHROMA_URL=$CHROMA_URL \
+  llamastack/distribution-dell \
+  --port $LLAMA_STACK_PORT
 
 ```
 
@@ -154,14 +154,14 @@ docker run \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v $HOME/.llama:/root/.llama \
   -v ./llama_stack/distributions/tgi/run-with-safety.yaml:/root/my-run.yaml \
+  -e INFERENCE_MODEL=$INFERENCE_MODEL \
+  -e DEH_URL=$DEH_URL \
+  -e SAFETY_MODEL=$SAFETY_MODEL \
+  -e DEH_SAFETY_URL=$DEH_SAFETY_URL \
+  -e CHROMA_URL=$CHROMA_URL \
   llamastack/distribution-dell \
   --config /root/my-run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env DEH_URL=$DEH_URL \
-  --env SAFETY_MODEL=$SAFETY_MODEL \
-  --env DEH_SAFETY_URL=$DEH_SAFETY_URL \
-  --env CHROMA_URL=$CHROMA_URL
+  --port $LLAMA_STACK_PORT
 ```
 
 ### Via venv
@@ -170,21 +170,21 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
 
 ```bash
 llama stack build --distro dell --image-type venv
-llama stack run dell
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env DEH_URL=$DEH_URL \
-  --env CHROMA_URL=$CHROMA_URL
+INFERENCE_MODEL=$INFERENCE_MODEL \
+DEH_URL=$DEH_URL \
+CHROMA_URL=$CHROMA_URL \
+llama stack run dell \
+  --port $LLAMA_STACK_PORT
 ```
 
 If you are using Llama Stack Safety / Shield APIs, use:
 
 ```bash
+INFERENCE_MODEL=$INFERENCE_MODEL \
+DEH_URL=$DEH_URL \
+SAFETY_MODEL=$SAFETY_MODEL \
+DEH_SAFETY_URL=$DEH_SAFETY_URL \
+CHROMA_URL=$CHROMA_URL \
 llama stack run ./run-with-safety.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env DEH_URL=$DEH_URL \
-  --env SAFETY_MODEL=$SAFETY_MODEL \
-  --env DEH_SAFETY_URL=$DEH_SAFETY_URL \
-  --env CHROMA_URL=$CHROMA_URL
+  --port $LLAMA_STACK_PORT
 ```
diff --git a/docs/docs/distributions/self_hosted_distro/meta-reference-gpu.md b/docs/docs/distributions/self_hosted_distro/meta-reference-gpu.md
index 84b85b91c..1c0ef5f6e 100644
--- a/docs/docs/distributions/self_hosted_distro/meta-reference-gpu.md
+++ b/docs/docs/distributions/self_hosted_distro/meta-reference-gpu.md
@@ -84,9 +84,9 @@ docker run \
   --gpu all \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ~/.llama:/root/.llama \
+  -e INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
   llamastack/distribution-meta-reference-gpu \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+  --port $LLAMA_STACK_PORT
 ```
 
 If you are using Llama Stack Safety / Shield APIs, use:
@@ -98,10 +98,10 @@ docker run \
   --gpu all \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ~/.llama:/root/.llama \
+  -e INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
+  -e SAFETY_MODEL=meta-llama/Llama-Guard-3-1B \
   llamastack/distribution-meta-reference-gpu \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+  --port $LLAMA_STACK_PORT
 ```
 
 ### Via venv
@@ -110,16 +110,16 @@ Make sure you have done `uv pip install llama-stack` and have the Llama Stack CL
 
 ```bash
 llama stack build --distro meta-reference-gpu --image-type venv
+INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
 llama stack run distributions/meta-reference-gpu/run.yaml \
-  --port 8321 \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+  --port 8321
 ```
 
 If you are using Llama Stack Safety / Shield APIs, use:
 
 ```bash
+INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
+SAFETY_MODEL=meta-llama/Llama-Guard-3-1B \
 llama stack run distributions/meta-reference-gpu/run-with-safety.yaml \
-  --port 8321 \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+  --port 8321
 ```
diff --git a/docs/docs/distributions/self_hosted_distro/nvidia.md b/docs/docs/distributions/self_hosted_distro/nvidia.md
index 1e52797db..a6e185442 100644
--- a/docs/docs/distributions/self_hosted_distro/nvidia.md
+++ b/docs/docs/distributions/self_hosted_distro/nvidia.md
@@ -129,10 +129,10 @@ docker run \
   --pull always \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ./run.yaml:/root/my-run.yaml \
+  -e NVIDIA_API_KEY=$NVIDIA_API_KEY \
   llamastack/distribution-nvidia \
   --config /root/my-run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
+  --port $LLAMA_STACK_PORT
 ```
 
 ### Via venv
@@ -142,10 +142,10 @@ If you've set up your local development environment, you can also build the imag
 ```bash
 INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
 llama stack build --distro nvidia --image-type venv
+NVIDIA_API_KEY=$NVIDIA_API_KEY \
+INFERENCE_MODEL=$INFERENCE_MODEL \
 llama stack run ./run.yaml \
-  --port 8321 \
-  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL
+  --port 8321
 ```
 
 ## Example Notebooks
diff --git a/docs/docs/getting_started/detailed_tutorial.mdx b/docs/docs/getting_started/detailed_tutorial.mdx
index 33786ac0e..e6c22224d 100644
--- a/docs/docs/getting_started/detailed_tutorial.mdx
+++ b/docs/docs/getting_started/detailed_tutorial.mdx
@@ -86,9 +86,9 @@ docker run -it \
   --pull always \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ~/.llama:/root/.llama \
+  -e OLLAMA_URL=http://host.docker.internal:11434 \
   llamastack/distribution-starter \
-  --port $LLAMA_STACK_PORT \
-  --env OLLAMA_URL=http://host.docker.internal:11434
+  --port $LLAMA_STACK_PORT
 ```
 Note to start the container with Podman, you can do the same but replace `docker` at the start of the command with
 `podman`. If you are using `podman` older than `4.7.0`, please also replace `host.docker.internal` in the `OLLAMA_URL`
@@ -106,9 +106,9 @@ docker run -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ~/.llama:/root/.llama \
   --network=host \
+  -e OLLAMA_URL=http://localhost:11434 \
   llamastack/distribution-starter \
-  --port $LLAMA_STACK_PORT \
-  --env OLLAMA_URL=http://localhost:11434
+  --port $LLAMA_STACK_PORT
 ```
 :::
 You will see output like below:
diff --git a/docs/getting_started_llama4.ipynb b/docs/getting_started_llama4.ipynb
index cd5f83517..b840117f1 100644
--- a/docs/getting_started_llama4.ipynb
+++ b/docs/getting_started_llama4.ipynb
@@ -238,7 +238,7 @@
         "def run_llama_stack_server_background():\n",
         "    log_file = open(\"llama_stack_server.log\", \"w\")\n",
         "    process = subprocess.Popen(\n",
-        "        f\"uv run --with llama-stack llama stack run meta-reference-gpu --image-type venv --env INFERENCE_MODEL={model_id}\",\n",
+        "        f\"INFERENCE_MODEL={model_id} uv run --with llama-stack llama stack run meta-reference-gpu --image-type venv\",\n",
         "        shell=True,\n",
         "        stdout=log_file,\n",
         "        stderr=log_file,\n",
diff --git a/docs/zero_to_hero_guide/README.md b/docs/zero_to_hero_guide/README.md
index 183038a88..a899d3ebe 100644
--- a/docs/zero_to_hero_guide/README.md
+++ b/docs/zero_to_hero_guide/README.md
@@ -102,12 +102,12 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
 3. **Run the Llama Stack**:
    Run the stack using uv:
    ```bash
+   INFERENCE_MODEL=$INFERENCE_MODEL \
+   SAFETY_MODEL=$SAFETY_MODEL \
+   OLLAMA_URL=$OLLAMA_URL \
    uv run --with llama-stack llama stack run starter \
       --image-type venv \
-      --port $LLAMA_STACK_PORT \
-      --env INFERENCE_MODEL=$INFERENCE_MODEL \
-      --env SAFETY_MODEL=$SAFETY_MODEL \
-      --env OLLAMA_URL=$OLLAMA_URL
+      --port $LLAMA_STACK_PORT
    ```
    Note: Every time you run a new model with `ollama run`, you will need to restart the llama stack. Otherwise it won't see the new model.
 
diff --git a/llama_stack/cli/stack/run.py b/llama_stack/cli/stack/run.py
index cec101083..677f5e5fa 100644
--- a/llama_stack/cli/stack/run.py
+++ b/llama_stack/cli/stack/run.py
@@ -16,7 +16,7 @@ import yaml
 from llama_stack.cli.stack.utils import ImageType
 from llama_stack.cli.subcommand import Subcommand
 from llama_stack.core.datatypes import LoggingConfig, StackRunConfig
-from llama_stack.core.stack import cast_image_name_to_string, replace_env_vars, validate_env_pair
+from llama_stack.core.stack import cast_image_name_to_string, replace_env_vars
 from llama_stack.core.utils.config_resolution import Mode, resolve_config_or_distro
 from llama_stack.log import get_logger
 
@@ -57,12 +57,6 @@ class StackRun(Subcommand):
             default=None,
             help="Name of the image to run. Defaults to the current environment",
         )
-        self.parser.add_argument(
-            "--env",
-            action="append",
-            help="Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times.",
-            metavar="KEY=VALUE",
-        )
         self.parser.add_argument(
             "--image-type",
             type=str,
@@ -162,34 +156,12 @@ class StackRun(Subcommand):
             if config_file:
                 run_args.extend(["--config", str(config_file)])
 
-            if args.env:
-                for env_var in args.env:
-                    if "=" not in env_var:
-                        self.parser.error(f"Environment variable '{env_var}' must be in KEY=VALUE format")
-                        return
-                    key, value = env_var.split("=", 1)  # split on first = only
-                    if not key:
-                        self.parser.error(f"Environment variable '{env_var}' has empty key")
-                        return
-                    run_args.extend(["--env", f"{key}={value}"])
-
             run_command(run_args)
 
     def _uvicorn_run(self, config_file: Path | None, args: argparse.Namespace) -> None:
         if not config_file:
             self.parser.error("Config file is required")
 
-        # Set environment variables if provided
-        if args.env:
-            for env_pair in args.env:
-                try:
-                    key, value = validate_env_pair(env_pair)
-                    logger.info(f"Setting environment variable {key} => {value}")
-                    os.environ[key] = value
-                except ValueError as e:
-                    logger.error(f"Error: {str(e)}")
-                    self.parser.error(f"Invalid environment variable format: {env_pair}")
-
         config_file = resolve_config_or_distro(str(config_file), Mode.RUN)
         with open(config_file) as fp:
             config_contents = yaml.safe_load(fp)
diff --git a/llama_stack/core/stack.py b/llama_stack/core/stack.py
index d5d55319a..acc02eeff 100644
--- a/llama_stack/core/stack.py
+++ b/llama_stack/core/stack.py
@@ -274,22 +274,6 @@ def cast_image_name_to_string(config_dict: dict[str, Any]) -> dict[str, Any]:
     return config_dict
 
 
-def validate_env_pair(env_pair: str) -> tuple[str, str]:
-    """Validate and split an environment variable key-value pair."""
-    try:
-        key, value = env_pair.split("=", 1)
-        key = key.strip()
-        if not key:
-            raise ValueError(f"Empty key in environment variable pair: {env_pair}")
-        if not all(c.isalnum() or c == "_" for c in key):
-            raise ValueError(f"Key must contain only alphanumeric characters and underscores: {key}")
-        return key, value
-    except ValueError as e:
-        raise ValueError(
-            f"Invalid environment variable format '{env_pair}': {str(e)}. Expected format: KEY=value"
-        ) from e
-
-
 def add_internal_implementations(impls: dict[Api, Any], run_config: StackRunConfig) -> None:
     """Add internal implementations (inspect and providers) to the implementations dictionary.
 
diff --git a/llama_stack/core/start_stack.sh b/llama_stack/core/start_stack.sh
index 02b1cd408..cc0ae68d8 100755
--- a/llama_stack/core/start_stack.sh
+++ b/llama_stack/core/start_stack.sh
@@ -25,7 +25,7 @@ error_handler() {
 trap 'error_handler ${LINENO}' ERR
 
 if [ $# -lt 3 ]; then
-  echo "Usage: $0 <env_type> <env_path_or_name> <port> [--config <yaml_config>] [--env KEY=VALUE]..."
+  echo "Usage: $0 <env_type> <env_path_or_name> <port> [--config <yaml_config>]"
   exit 1
 fi
 
@@ -43,7 +43,6 @@ SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
 
 # Initialize variables
 yaml_config=""
-env_vars=""
 other_args=""
 
 # Process remaining arguments
@@ -58,15 +57,6 @@ while [[ $# -gt 0 ]]; do
         exit 1
       fi
       ;;
-    --env)
-      if [[ -n "$2" ]]; then
-        env_vars="$env_vars --env $2"
-        shift 2
-      else
-        echo -e "${RED}Error: --env requires a KEY=VALUE argument${NC}" >&2
-        exit 1
-      fi
-      ;;
     *)
       other_args="$other_args $1"
       shift
@@ -119,7 +109,6 @@ if [[ "$env_type" == "venv" ]]; then
     llama stack run \
     $yaml_config_arg \
     --port "$port" \
-    $env_vars \
     $other_args
 elif [[ "$env_type" == "container" ]]; then
     echo -e "${RED}Warning: Llama Stack no longer supports running Containers via the 'llama stack run' command.${NC}"
diff --git a/llama_stack/distributions/dell/doc_template.md b/llama_stack/distributions/dell/doc_template.md
index fcec3ea14..852e78d0e 100644
--- a/llama_stack/distributions/dell/doc_template.md
+++ b/llama_stack/distributions/dell/doc_template.md
@@ -117,11 +117,11 @@ docker run -it \
   # NOTE: mount the llama-stack directory if testing local changes else not needed
   -v $HOME/git/llama-stack:/app/llama-stack-source \
   # localhost/distribution-dell:dev if building / testing locally
+  -e INFERENCE_MODEL=$INFERENCE_MODEL \
+  -e DEH_URL=$DEH_URL \
+  -e CHROMA_URL=$CHROMA_URL \
   llamastack/distribution-{{ name }}\
-  --port $LLAMA_STACK_PORT  \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env DEH_URL=$DEH_URL \
-  --env CHROMA_URL=$CHROMA_URL
+  --port $LLAMA_STACK_PORT
 
 ```
 
@@ -142,14 +142,14 @@ docker run \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v $HOME/.llama:/root/.llama \
   -v ./llama_stack/distributions/tgi/run-with-safety.yaml:/root/my-run.yaml \
+  -e INFERENCE_MODEL=$INFERENCE_MODEL \
+  -e DEH_URL=$DEH_URL \
+  -e SAFETY_MODEL=$SAFETY_MODEL \
+  -e DEH_SAFETY_URL=$DEH_SAFETY_URL \
+  -e CHROMA_URL=$CHROMA_URL \
   llamastack/distribution-{{ name }} \
   --config /root/my-run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env DEH_URL=$DEH_URL \
-  --env SAFETY_MODEL=$SAFETY_MODEL \
-  --env DEH_SAFETY_URL=$DEH_SAFETY_URL \
-  --env CHROMA_URL=$CHROMA_URL
+  --port $LLAMA_STACK_PORT
 ```
 
 ### Via Conda
@@ -158,21 +158,21 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
 
 ```bash
 llama stack build --distro {{ name }} --image-type conda
-llama stack run {{ name }}
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env DEH_URL=$DEH_URL \
-  --env CHROMA_URL=$CHROMA_URL
+INFERENCE_MODEL=$INFERENCE_MODEL \
+DEH_URL=$DEH_URL \
+CHROMA_URL=$CHROMA_URL \
+llama stack run {{ name }} \
+  --port $LLAMA_STACK_PORT
 ```
 
 If you are using Llama Stack Safety / Shield APIs, use:
 
 ```bash
+INFERENCE_MODEL=$INFERENCE_MODEL \
+DEH_URL=$DEH_URL \
+SAFETY_MODEL=$SAFETY_MODEL \
+DEH_SAFETY_URL=$DEH_SAFETY_URL \
+CHROMA_URL=$CHROMA_URL \
 llama stack run ./run-with-safety.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env DEH_URL=$DEH_URL \
-  --env SAFETY_MODEL=$SAFETY_MODEL \
-  --env DEH_SAFETY_URL=$DEH_SAFETY_URL \
-  --env CHROMA_URL=$CHROMA_URL
+  --port $LLAMA_STACK_PORT
 ```
diff --git a/llama_stack/distributions/meta-reference-gpu/doc_template.md b/llama_stack/distributions/meta-reference-gpu/doc_template.md
index 602d053c4..92dcc6102 100644
--- a/llama_stack/distributions/meta-reference-gpu/doc_template.md
+++ b/llama_stack/distributions/meta-reference-gpu/doc_template.md
@@ -72,9 +72,9 @@ docker run \
   --gpu all \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ~/.llama:/root/.llama \
+  -e INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
   llamastack/distribution-{{ name }} \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+  --port $LLAMA_STACK_PORT
 ```
 
 If you are using Llama Stack Safety / Shield APIs, use:
@@ -86,10 +86,10 @@ docker run \
   --gpu all \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ~/.llama:/root/.llama \
+  -e INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
+  -e SAFETY_MODEL=meta-llama/Llama-Guard-3-1B \
   llamastack/distribution-{{ name }} \
-  --port $LLAMA_STACK_PORT \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+  --port $LLAMA_STACK_PORT
 ```
 
 ### Via venv
@@ -98,16 +98,16 @@ Make sure you have done `uv pip install llama-stack` and have the Llama Stack CL
 
 ```bash
 llama stack build --distro {{ name }} --image-type venv
+INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
 llama stack run distributions/{{ name }}/run.yaml \
-  --port 8321 \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+  --port 8321
 ```
 
 If you are using Llama Stack Safety / Shield APIs, use:
 
 ```bash
+INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
+SAFETY_MODEL=meta-llama/Llama-Guard-3-1B \
 llama stack run distributions/{{ name }}/run-with-safety.yaml \
-  --port 8321 \
-  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+  --port 8321
 ```
diff --git a/llama_stack/distributions/nvidia/doc_template.md b/llama_stack/distributions/nvidia/doc_template.md
index fbee17ef8..df2b68ef7 100644
--- a/llama_stack/distributions/nvidia/doc_template.md
+++ b/llama_stack/distributions/nvidia/doc_template.md
@@ -118,10 +118,10 @@ docker run \
   --pull always \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ./run.yaml:/root/my-run.yaml \
+  -e NVIDIA_API_KEY=$NVIDIA_API_KEY \
   llamastack/distribution-{{ name }} \
   --config /root/my-run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
+  --port $LLAMA_STACK_PORT
 ```
 
 ### Via venv
@@ -131,10 +131,10 @@ If you've set up your local development environment, you can also build the imag
 ```bash
 INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
 llama stack build --distro nvidia --image-type venv
+NVIDIA_API_KEY=$NVIDIA_API_KEY \
+INFERENCE_MODEL=$INFERENCE_MODEL \
 llama stack run ./run.yaml \
-  --port 8321 \
-  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
-  --env INFERENCE_MODEL=$INFERENCE_MODEL
+  --port 8321
 ```
 
 ## Example Notebooks
diff --git a/scripts/install.sh b/scripts/install.sh
index f6fbc259c..571468dc5 100755
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -221,8 +221,8 @@ fi
 cmd=( run -d "${PLATFORM_OPTS[@]}" --name llama-stack \
       --network llama-net \
       -p "${PORT}:${PORT}" \
-      "${SERVER_IMAGE}" --port "${PORT}" \
-      --env OLLAMA_URL="http://ollama-server:${OLLAMA_PORT}")
+      -e OLLAMA_URL="http://ollama-server:${OLLAMA_PORT}" \
+      "${SERVER_IMAGE}" --port "${PORT}")
 
 log "🦙 Starting Llama Stack..."
 if ! execute_with_log $ENGINE "${cmd[@]}"; then

From 8cb14eb84c66ceaedf276918e75afa12db2d23bc Mon Sep 17 00:00:00 2001
From: Eric Huang <erichuang@meta.com>
Date: Mon, 6 Oct 2025 14:42:34 -0700
Subject: [PATCH 33/33] chore: require valid logging category

# What does this PR do?


## Test Plan
---
 llama_stack/core/conversations/conversations.py |  2 +-
 llama_stack/log.py                              | 12 ++++++++----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/llama_stack/core/conversations/conversations.py b/llama_stack/core/conversations/conversations.py
index bef138e69..612b2f68e 100644
--- a/llama_stack/core/conversations/conversations.py
+++ b/llama_stack/core/conversations/conversations.py
@@ -32,7 +32,7 @@ from llama_stack.providers.utils.sqlstore.sqlstore import (
     sqlstore_impl,
 )
 
-logger = get_logger(name=__name__, category="openai::conversations")
+logger = get_logger(name=__name__, category="openai_conversations")
 
 
 class ConversationServiceConfig(BaseModel):
diff --git a/llama_stack/log.py b/llama_stack/log.py
index 6f751b21d..6ccc200fc 100644
--- a/llama_stack/log.py
+++ b/llama_stack/log.py
@@ -31,12 +31,17 @@ CATEGORIES = [
     "client",
     "telemetry",
     "openai_responses",
+    "openai_conversations",
     "testing",
     "providers",
     "models",
     "files",
     "vector_io",
     "tool_runtime",
+    "cli",
+    "post_training",
+    "scoring",
+    "tests",
 ]
 UNCATEGORIZED = "uncategorized"
 
@@ -261,11 +266,10 @@ def get_logger(
         if root_category in _category_levels:
             log_level = _category_levels[root_category]
         else:
+            assert category == UNCATEGORIZED, (
+                f"Unknown logging category: {category}. To resolve, choose a valid category from the CATEGORIES list or add it to the CATEGORIES list."
+            )
             log_level = _category_levels.get("root", DEFAULT_LOG_LEVEL)
-            if category != UNCATEGORIZED:
-                logging.warning(
-                    f"Unknown logging category: {category}. Falling back to default 'root' level: {log_level}"
-                )
     logger.setLevel(log_level)
     return logging.LoggerAdapter(logger, {"category": category})