From 2e4eedce14132e5ed125766404fa05f973ab84a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Fri, 11 Jul 2025 16:25:33 +0200 Subject: [PATCH 1/8] fix: container build on podman (#2723) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? COPY with --chmod does not work with Buildah, see https://github.com/containers/buildah/issues/4614, although Docker arguably implements it. Anyway, the flag is not even needed, since later we do: ``` RUN mkdir -p /.llama /.cache && chmod -R g+rw /app /.llama /.cache ``` and providers.d will get the right modes. ## Test Plan Build with CONTAINER_BINARY=podman and verify the build succeeds. Signed-off-by: Sébastien Han --- llama_stack/distribution/build_container.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_stack/distribution/build_container.sh b/llama_stack/distribution/build_container.sh index d9a918fb5..6e794b36f 100755 --- a/llama_stack/distribution/build_container.sh +++ b/llama_stack/distribution/build_container.sh @@ -96,7 +96,7 @@ FROM $container_base WORKDIR /app # We install the Python 3.12 dev headers and build tools so that any -# C‑extension wheels (e.g. polyleven, faiss‑cpu) can compile successfully. +# C-extension wheels (e.g. polyleven, faiss-cpu) can compile successfully. RUN dnf -y update && dnf install -y iputils git net-tools wget \ vim-minimal python3.12 python3.12-pip python3.12-wheel \ @@ -169,7 +169,7 @@ if [ -n "$run_config" ]; then echo "Copying external providers directory: $external_providers_dir" cp -r "$external_providers_dir" "$BUILD_CONTEXT_DIR/providers.d" add_to_container << EOF -COPY --chmod=g+w providers.d /.llama/providers.d +COPY providers.d /.llama/providers.d EOF fi From 2ebc172f339916d450139325c30808f675f022fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Fri, 11 Jul 2025 16:25:51 +0200 Subject: [PATCH 2/8] fix: pin opentelemetry version (#2722) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do?
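Pin `opentelemetry-sdk` and `opentelemetry-exporter-otlp-proto-http` to `>=1.30.0`. As a quick sanity check that a build picked up a good version, the import that old releases are missing can be exercised directly (a minimal sketch, not part of this change; the endpoint is an assumed local OTLP collector on the default port):
```
# hypothetical check script -- not part of this PR
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter

# Constructing the exporter is enough to prove the module resolves;
# the endpoint below is an assumed local collector, nothing is sent here.
exporter = OTLPMetricExporter(endpoint="http://localhost:4318/v1/metrics")
print(type(exporter).__name__)
```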
Otherwise we can get old versions like 1.11 and experience this error: ``` ModuleNotFoundError: No module named 'opentelemetry.exporter.otlp.proto.http.metric_exporter' ``` Signed-off-by: Sébastien Han --- pyproject.toml | 4 ++-- uv.lock | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d84a823a3..89ae4bc23 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,8 +42,8 @@ dependencies = [ "h11>=0.16.0", "python-multipart>=0.0.20", # For fastapi Form "uvicorn>=0.34.0", # server - "opentelemetry-sdk", # server - "opentelemetry-exporter-otlp-proto-http", # server + "opentelemetry-sdk>=1.30.0", # server + "opentelemetry-exporter-otlp-proto-http>=1.30.0", # server "aiosqlite>=0.21.0", # server - for metadata store "asyncpg", # for metadata store ] diff --git a/uv.lock b/uv.lock index e77fb89f5..8374fe38a 100644 --- a/uv.lock +++ b/uv.lock @@ -1365,8 +1365,8 @@ requires-dist = [ { name = "llama-stack-client", specifier = ">=0.2.14" }, { name = "llama-stack-client", marker = "extra == 'ui'", specifier = ">=0.2.14" }, { name = "openai", specifier = ">=1.66" }, - { name = "opentelemetry-exporter-otlp-proto-http" }, - { name = "opentelemetry-sdk" }, + { name = "opentelemetry-exporter-otlp-proto-http", specifier = ">=1.30.0" }, + { name = "opentelemetry-sdk", specifier = ">=1.30.0" }, { name = "pandas", marker = "extra == 'ui'" }, { name = "pillow" }, { name = "prompt-toolkit" }, From 30b2e6a495361faa6e52e6255780a69e92254f58 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Fri, 11 Jul 2025 16:00:24 -0400 Subject: [PATCH 3/8] chore: default to pytest asyncio-mode=auto (#2730) # What does this PR do? Previously, developers who ran `./scripts/unit-tests.sh` would get `asyncio-mode=auto`, which meant `@pytest.mark.asyncio` and `@pytest_asyncio.fixture` were redundant. Developers who ran `pytest` directly would get pytest's default (strict mode) and would run into errors, leading them to add `@pytest.mark.asyncio` / `@pytest_asyncio.fixture` to their code.
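For illustration, under `asyncio_mode=auto` a test file like this (a hypothetical sketch, not part of this diff) needs no plugin-specific decorators at all:
```
import asyncio

import pytest


# In auto mode, pytest-asyncio treats an async fixture declared with the
# stock @pytest.fixture decorator as an asyncio fixture, so
# @pytest_asyncio.fixture is redundant.
@pytest.fixture
async def greeting():
    await asyncio.sleep(0)
    return "hello"


# Likewise, this coroutine is collected and run without @pytest.mark.asyncio;
# under strict mode it would be skipped with an "async def functions are not
# natively supported" warning unless explicitly marked.
async def test_greeting(greeting):
    assert greeting == "hello"
```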
With this change: - `asyncio_mode=auto` is included in `pyproject.toml`, making behavior consistent for all invocations of pytest - all redundant `@pytest_asyncio.fixture` and `@pytest.mark.asyncio` markers are removed - for good measure, `pytest>=8.4` and `pytest-asyncio>=1.0` are now required ## Test Plan - `./scripts/unit-tests.sh` - `uv run pytest tests/unit` --- pyproject.toml | 7 ++++-- scripts/unit-tests.sh | 2 +- tests/integration/agents/test_persistence.py | 2 -- tests/integration/inspect/test_inspect.py | 3 --- tests/integration/providers/test_providers.py | 2 -- .../sqlstore/test_authorized_sqlstore.py | 2 -- .../routers/test_routing_tables.py | 8 ------- tests/unit/distribution/test_context.py | 3 --- tests/unit/files/test_files.py | 22 +------------------ tests/unit/fixtures.py | 8 +++---- .../agent/test_meta_reference_agent.py | 9 +------- .../meta_reference/test_openai_responses.py | 15 ------------- .../agents/test_persistence_access_control.py | 7 +----- .../providers/inference/test_remote_vllm.py | 16 +------------- .../utils/inference/test_openai_compat.py | 6 ----- .../utils/memory/test_vector_store.py | 7 ------ .../providers/utils/test_model_registry.py | 10 --------- tests/unit/providers/utils/test_scheduler.py | 3 --- tests/unit/providers/vector_io/test_faiss.py | 8 ++----- tests/unit/providers/vector_io/test_qdrant.py | 5 +---- .../providers/vector_io/test_sqlite_vec.py | 18 +-------------- .../test_vector_io_openai_vector_stores.py | 19 ---------------- tests/unit/rag/test_rag_query.py | 2 -- tests/unit/rag/test_vector_store.py | 7 ------ tests/unit/registry/test_registry.py | 8 ------- tests/unit/registry/test_registry_acl.py | 5 ----- tests/unit/server/test_access_control.py | 11 ++-------- tests/unit/server/test_auth.py | 1 - tests/unit/server/test_resolver.py | 2 -- tests/unit/server/test_sse.py | 7 ------ .../utils/inference/test_inference_store.py | 5 ----- .../utils/responses/test_responses_store.py | 8 ------- tests/unit/utils/sqlstore/test_sqlstore.py | 8 ------- tests/unit/utils/test_authorized_sqlstore.py | 5 ----- uv.lock | 17 +++++++------- 35 files changed, 29 insertions(+), 239 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 89ae4bc23..f4115d028 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,9 +58,9 @@ ui = [ [dependency-groups] dev = [ - "pytest", + "pytest>=8.4", "pytest-timeout", - "pytest-asyncio", + "pytest-asyncio>=1.0", "pytest-cov", "pytest-html", "pytest-json-report", @@ -339,3 +339,6 @@ warn_required_dynamic_aliases = true [tool.ruff.lint.pep8-naming] classmethod-decorators = ["classmethod", "pydantic.field_validator"] + +[tool.pytest.ini_options] +asyncio_mode = "auto" diff --git a/scripts/unit-tests.sh b/scripts/unit-tests.sh index 1fc3ff7fc..68d6458fc 100755 --- a/scripts/unit-tests.sh +++ b/scripts/unit-tests.sh @@ -16,4 +16,4 @@ if [ $FOUND_PYTHON -ne 0 ]; then uv python install "$PYTHON_VERSION" fi -uv run --python "$PYTHON_VERSION" --with-editable . --group unit pytest --asyncio-mode=auto -s -v tests/unit/ $@ +uv run --python "$PYTHON_VERSION" --with-editable . 
--group unit pytest -s -v tests/unit/ $@ diff --git a/tests/integration/agents/test_persistence.py b/tests/integration/agents/test_persistence.py index ef35c97a5..49d9d42d0 100644 --- a/tests/integration/agents/test_persistence.py +++ b/tests/integration/agents/test_persistence.py @@ -44,7 +44,6 @@ def common_params(inference_model): ) -@pytest.mark.asyncio @pytest.mark.skip(reason="This test needs to be migrated to api / client-sdk world") async def test_delete_agents_and_sessions(self, agents_stack, common_params): agents_impl = agents_stack.impls[Api.agents] @@ -73,7 +72,6 @@ async def test_delete_agents_and_sessions(self, agents_stack, common_params): assert agent_response is None -@pytest.mark.asyncio @pytest.mark.skip(reason="This test needs to be migrated to api / client-sdk world") async def test_get_agent_turns_and_steps(self, agents_stack, sample_messages, common_params): agents_impl = agents_stack.impls[Api.agents] diff --git a/tests/integration/inspect/test_inspect.py b/tests/integration/inspect/test_inspect.py index da704178d..1597a319b 100644 --- a/tests/integration/inspect/test_inspect.py +++ b/tests/integration/inspect/test_inspect.py @@ -4,20 +4,17 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -import pytest from llama_stack_client import LlamaStackClient from llama_stack import LlamaStackAsLibraryClient class TestInspect: - @pytest.mark.asyncio def test_health(self, llama_stack_client: LlamaStackAsLibraryClient | LlamaStackClient): health = llama_stack_client.inspect.health() assert health is not None assert health.status == "OK" - @pytest.mark.asyncio def test_version(self, llama_stack_client: LlamaStackAsLibraryClient | LlamaStackClient): version = llama_stack_client.inspect.version() assert version is not None diff --git a/tests/integration/providers/test_providers.py b/tests/integration/providers/test_providers.py index 8b153411c..fc65e2a10 100644 --- a/tests/integration/providers/test_providers.py +++ b/tests/integration/providers/test_providers.py @@ -4,14 +4,12 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-import pytest from llama_stack_client import LlamaStackClient from llama_stack import LlamaStackAsLibraryClient class TestProviders: - @pytest.mark.asyncio def test_providers(self, llama_stack_client: LlamaStackAsLibraryClient | LlamaStackClient): provider_list = llama_stack_client.providers.list() assert provider_list is not None diff --git a/tests/integration/providers/utils/sqlstore/test_authorized_sqlstore.py b/tests/integration/providers/utils/sqlstore/test_authorized_sqlstore.py index bf6077532..c32d6cd17 100644 --- a/tests/integration/providers/utils/sqlstore/test_authorized_sqlstore.py +++ b/tests/integration/providers/utils/sqlstore/test_authorized_sqlstore.py @@ -88,7 +88,6 @@ async def cleanup_records(sql_store, table_name, record_ids): pass -@pytest.mark.asyncio @pytest.mark.parametrize("backend_config", BACKEND_CONFIGS) @patch("llama_stack.providers.utils.sqlstore.authorized_sqlstore.get_authenticated_user") async def test_authorized_store_attributes(mock_get_authenticated_user, authorized_store, request): @@ -183,7 +182,6 @@ async def test_authorized_store_attributes(mock_get_authenticated_user, authoriz await cleanup_records(authorized_store.sql_store, table_name, ["1", "2", "3", "4", "5", "6"]) -@pytest.mark.asyncio @pytest.mark.parametrize("backend_config", BACKEND_CONFIGS) @patch("llama_stack.providers.utils.sqlstore.authorized_sqlstore.get_authenticated_user") async def test_user_ownership_policy(mock_get_authenticated_user, authorized_store, request): diff --git a/tests/unit/distribution/routers/test_routing_tables.py b/tests/unit/distribution/routers/test_routing_tables.py index 0eeb68167..3ba042bd9 100644 --- a/tests/unit/distribution/routers/test_routing_tables.py +++ b/tests/unit/distribution/routers/test_routing_tables.py @@ -8,8 +8,6 @@ from unittest.mock import AsyncMock -import pytest - from llama_stack.apis.common.type_system import NumberType from llama_stack.apis.datasets.datasets import Dataset, DatasetPurpose, URIDataSource from llama_stack.apis.datatypes import Api @@ -119,7 +117,6 @@ class ToolGroupsImpl(Impl): ) -@pytest.mark.asyncio async def test_models_routing_table(cached_disk_dist_registry): table = ModelsRoutingTable({"test_provider": InferenceImpl()}, cached_disk_dist_registry, {}) await table.initialize() @@ -161,7 +158,6 @@ async def test_models_routing_table(cached_disk_dist_registry): assert len(openai_models.data) == 0 -@pytest.mark.asyncio async def test_shields_routing_table(cached_disk_dist_registry): table = ShieldsRoutingTable({"test_provider": SafetyImpl()}, cached_disk_dist_registry, {}) await table.initialize() @@ -177,7 +173,6 @@ async def test_shields_routing_table(cached_disk_dist_registry): assert "test-shield-2" in shield_ids -@pytest.mark.asyncio async def test_vectordbs_routing_table(cached_disk_dist_registry): table = VectorDBsRoutingTable({"test_provider": VectorDBImpl()}, cached_disk_dist_registry, {}) await table.initialize() @@ -233,7 +228,6 @@ async def test_datasets_routing_table(cached_disk_dist_registry): assert len(datasets.data) == 0 -@pytest.mark.asyncio async def test_scoring_functions_routing_table(cached_disk_dist_registry): table = ScoringFunctionsRoutingTable({"test_provider": ScoringFunctionsImpl()}, cached_disk_dist_registry, {}) await table.initialize() @@ -259,7 +253,6 @@ async def test_scoring_functions_routing_table(cached_disk_dist_registry): assert "test-scoring-fn-2" in scoring_fn_ids -@pytest.mark.asyncio async def test_benchmarks_routing_table(cached_disk_dist_registry): table = 
BenchmarksRoutingTable({"test_provider": BenchmarksImpl()}, cached_disk_dist_registry, {}) await table.initialize() @@ -277,7 +270,6 @@ async def test_benchmarks_routing_table(cached_disk_dist_registry): assert "test-benchmark" in benchmark_ids -@pytest.mark.asyncio async def test_tool_groups_routing_table(cached_disk_dist_registry): table = ToolGroupsRoutingTable({"test_provider": ToolGroupsImpl()}, cached_disk_dist_registry, {}) await table.initialize() diff --git a/tests/unit/distribution/test_context.py b/tests/unit/distribution/test_context.py index 84944bfe8..7914be51d 100644 --- a/tests/unit/distribution/test_context.py +++ b/tests/unit/distribution/test_context.py @@ -13,7 +13,6 @@ import pytest from llama_stack.distribution.utils.context import preserve_contexts_async_generator -@pytest.mark.asyncio async def test_preserve_contexts_with_exception(): # Create context variable context_var = ContextVar("exception_var", default="initial") @@ -41,7 +40,6 @@ async def test_preserve_contexts_with_exception(): context_var.reset(token) -@pytest.mark.asyncio async def test_preserve_contexts_empty_generator(): # Create context variable context_var = ContextVar("empty_var", default="initial") @@ -66,7 +64,6 @@ async def test_preserve_contexts_empty_generator(): context_var.reset(token) -@pytest.mark.asyncio async def test_preserve_contexts_across_event_loops(): """ Test that context variables are preserved across event loop boundaries with nested generators. diff --git a/tests/unit/files/test_files.py b/tests/unit/files/test_files.py index ef1dc9743..785077e91 100644 --- a/tests/unit/files/test_files.py +++ b/tests/unit/files/test_files.py @@ -6,7 +6,6 @@ import pytest -import pytest_asyncio from llama_stack.apis.common.responses import Order from llama_stack.apis.files import OpenAIFilePurpose @@ -29,7 +28,7 @@ class MockUploadFile: return self.content -@pytest_asyncio.fixture +@pytest.fixture async def files_provider(tmp_path): """Create a files provider with temporary storage for testing.""" storage_dir = tmp_path / "files" @@ -68,7 +67,6 @@ def large_file(): class TestOpenAIFilesAPI: """Test suite for OpenAI Files API endpoints.""" - @pytest.mark.asyncio async def test_upload_file_success(self, files_provider, sample_text_file): """Test successful file upload.""" # Upload file @@ -82,7 +80,6 @@ class TestOpenAIFilesAPI: assert result.created_at > 0 assert result.expires_at > result.created_at - @pytest.mark.asyncio async def test_upload_different_purposes(self, files_provider, sample_text_file): """Test uploading files with different purposes.""" purposes = list(OpenAIFilePurpose) @@ -93,7 +90,6 @@ class TestOpenAIFilesAPI: uploaded_files.append(result) assert result.purpose == purpose - @pytest.mark.asyncio async def test_upload_different_file_types(self, files_provider, sample_text_file, sample_json_file, large_file): """Test uploading different types and sizes of files.""" files_to_test = [ @@ -107,7 +103,6 @@ class TestOpenAIFilesAPI: assert result.filename == expected_filename assert result.bytes == len(file_obj.content) - @pytest.mark.asyncio async def test_list_files_empty(self, files_provider): """Test listing files when no files exist.""" result = await files_provider.openai_list_files() @@ -117,7 +112,6 @@ class TestOpenAIFilesAPI: assert result.first_id == "" assert result.last_id == "" - @pytest.mark.asyncio async def test_list_files_with_content(self, files_provider, sample_text_file, sample_json_file): """Test listing files when files exist.""" # Upload multiple files @@ 
-132,7 +126,6 @@ class TestOpenAIFilesAPI: assert file1.id in file_ids assert file2.id in file_ids - @pytest.mark.asyncio async def test_list_files_with_purpose_filter(self, files_provider, sample_text_file): """Test listing files with purpose filtering.""" # Upload file with specific purpose @@ -146,7 +139,6 @@ class TestOpenAIFilesAPI: assert result.data[0].id == uploaded_file.id assert result.data[0].purpose == OpenAIFilePurpose.ASSISTANTS - @pytest.mark.asyncio async def test_list_files_with_limit(self, files_provider, sample_text_file): """Test listing files with limit parameter.""" # Upload multiple files @@ -157,7 +149,6 @@ class TestOpenAIFilesAPI: result = await files_provider.openai_list_files(limit=3) assert len(result.data) == 3 - @pytest.mark.asyncio async def test_list_files_with_order(self, files_provider, sample_text_file): """Test listing files with different order.""" # Upload multiple files @@ -178,7 +169,6 @@ class TestOpenAIFilesAPI: # Oldest should be first assert result_asc.data[0].created_at <= result_asc.data[1].created_at <= result_asc.data[2].created_at - @pytest.mark.asyncio async def test_retrieve_file_success(self, files_provider, sample_text_file): """Test successful file retrieval.""" # Upload file @@ -197,13 +187,11 @@ class TestOpenAIFilesAPI: assert retrieved_file.created_at == uploaded_file.created_at assert retrieved_file.expires_at == uploaded_file.expires_at - @pytest.mark.asyncio async def test_retrieve_file_not_found(self, files_provider): """Test retrieving a non-existent file.""" with pytest.raises(ValueError, match="File with id file-nonexistent not found"): await files_provider.openai_retrieve_file("file-nonexistent") - @pytest.mark.asyncio async def test_retrieve_file_content_success(self, files_provider, sample_text_file): """Test successful file content retrieval.""" # Upload file @@ -217,13 +205,11 @@ class TestOpenAIFilesAPI: # Verify content assert content.body == sample_text_file.content - @pytest.mark.asyncio async def test_retrieve_file_content_not_found(self, files_provider): """Test retrieving content of a non-existent file.""" with pytest.raises(ValueError, match="File with id file-nonexistent not found"): await files_provider.openai_retrieve_file_content("file-nonexistent") - @pytest.mark.asyncio async def test_delete_file_success(self, files_provider, sample_text_file): """Test successful file deletion.""" # Upload file @@ -245,13 +231,11 @@ class TestOpenAIFilesAPI: with pytest.raises(ValueError, match=f"File with id {uploaded_file.id} not found"): await files_provider.openai_retrieve_file(uploaded_file.id) - @pytest.mark.asyncio async def test_delete_file_not_found(self, files_provider): """Test deleting a non-existent file.""" with pytest.raises(ValueError, match="File with id file-nonexistent not found"): await files_provider.openai_delete_file("file-nonexistent") - @pytest.mark.asyncio async def test_file_persistence_across_operations(self, files_provider, sample_text_file): """Test that files persist correctly across multiple operations.""" # Upload file @@ -279,7 +263,6 @@ class TestOpenAIFilesAPI: files_list = await files_provider.openai_list_files() assert len(files_list.data) == 0 - @pytest.mark.asyncio async def test_multiple_files_operations(self, files_provider, sample_text_file, sample_json_file): """Test operations with multiple files.""" # Upload multiple files @@ -302,7 +285,6 @@ class TestOpenAIFilesAPI: content = await files_provider.openai_retrieve_file_content(file2.id) assert content.body == 
sample_json_file.content - @pytest.mark.asyncio async def test_file_id_uniqueness(self, files_provider, sample_text_file): """Test that each uploaded file gets a unique ID.""" file_ids = set() @@ -316,7 +298,6 @@ class TestOpenAIFilesAPI: file_ids.add(uploaded_file.id) assert uploaded_file.id.startswith("file-") - @pytest.mark.asyncio async def test_file_no_filename_handling(self, files_provider): """Test handling files with no filename.""" file_without_name = MockUploadFile(b"content", None) # No filename @@ -327,7 +308,6 @@ class TestOpenAIFilesAPI: assert uploaded_file.filename == "uploaded_file" # Default filename - @pytest.mark.asyncio async def test_after_pagination_works(self, files_provider, sample_text_file): """Test that 'after' pagination works correctly.""" # Upload multiple files to test pagination diff --git a/tests/unit/fixtures.py b/tests/unit/fixtures.py index 4e50c5e08..7174d2e78 100644 --- a/tests/unit/fixtures.py +++ b/tests/unit/fixtures.py @@ -4,14 +4,14 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -import pytest_asyncio +import pytest from llama_stack.distribution.store.registry import CachedDiskDistributionRegistry, DiskDistributionRegistry from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig from llama_stack.providers.utils.kvstore.sqlite import SqliteKVStoreImpl -@pytest_asyncio.fixture(scope="function") +@pytest.fixture(scope="function") async def sqlite_kvstore(tmp_path): db_path = tmp_path / "test_kv.db" kvstore_config = SqliteKVStoreConfig(db_path=db_path.as_posix()) @@ -20,14 +20,14 @@ async def sqlite_kvstore(tmp_path): yield kvstore -@pytest_asyncio.fixture(scope="function") +@pytest.fixture(scope="function") async def disk_dist_registry(sqlite_kvstore): registry = DiskDistributionRegistry(sqlite_kvstore) await registry.initialize() yield registry -@pytest_asyncio.fixture(scope="function") +@pytest.fixture(scope="function") async def cached_disk_dist_registry(sqlite_kvstore): registry = CachedDiskDistributionRegistry(sqlite_kvstore) await registry.initialize() diff --git a/tests/unit/providers/agent/test_meta_reference_agent.py b/tests/unit/providers/agent/test_meta_reference_agent.py index 7a7d52892..c06d9ab0e 100644 --- a/tests/unit/providers/agent/test_meta_reference_agent.py +++ b/tests/unit/providers/agent/test_meta_reference_agent.py @@ -8,7 +8,6 @@ from datetime import datetime from unittest.mock import AsyncMock import pytest -import pytest_asyncio from llama_stack.apis.agents import ( Agent, @@ -50,7 +49,7 @@ def config(tmp_path): ) -@pytest_asyncio.fixture +@pytest.fixture async def agents_impl(config, mock_apis): impl = MetaReferenceAgentsImpl( config, @@ -117,7 +116,6 @@ def sample_agent_config(): ) -@pytest.mark.asyncio async def test_create_agent(agents_impl, sample_agent_config): response = await agents_impl.create_agent(sample_agent_config) @@ -132,7 +130,6 @@ async def test_create_agent(agents_impl, sample_agent_config): assert isinstance(agent_info.created_at, datetime) -@pytest.mark.asyncio async def test_get_agent(agents_impl, sample_agent_config): create_response = await agents_impl.create_agent(sample_agent_config) agent_id = create_response.agent_id @@ -146,7 +143,6 @@ async def test_get_agent(agents_impl, sample_agent_config): assert isinstance(agent.created_at, datetime) -@pytest.mark.asyncio async def test_list_agents(agents_impl, sample_agent_config): agent1_response = await agents_impl.create_agent(sample_agent_config) 
agent2_response = await agents_impl.create_agent(sample_agent_config) @@ -160,7 +156,6 @@ async def test_list_agents(agents_impl, sample_agent_config): assert agent2_response.agent_id in agent_ids -@pytest.mark.asyncio @pytest.mark.parametrize("enable_session_persistence", [True, False]) async def test_create_agent_session_persistence(agents_impl, sample_agent_config, enable_session_persistence): # Create an agent with specified persistence setting @@ -188,7 +183,6 @@ async def test_create_agent_session_persistence(agents_impl, sample_agent_config await agents_impl.get_agents_session(agent_id, session_response.session_id) -@pytest.mark.asyncio @pytest.mark.parametrize("enable_session_persistence", [True, False]) async def test_list_agent_sessions_persistence(agents_impl, sample_agent_config, enable_session_persistence): # Create an agent with specified persistence setting @@ -221,7 +215,6 @@ async def test_list_agent_sessions_persistence(agents_impl, sample_agent_config, assert session2.session_id in {s["session_id"] for s in sessions.data} -@pytest.mark.asyncio async def test_delete_agent(agents_impl, sample_agent_config): # Create an agent response = await agents_impl.create_agent(sample_agent_config) diff --git a/tests/unit/providers/agents/meta_reference/test_openai_responses.py b/tests/unit/providers/agents/meta_reference/test_openai_responses.py index 0d1ef8eca..6485e3512 100644 --- a/tests/unit/providers/agents/meta_reference/test_openai_responses.py +++ b/tests/unit/providers/agents/meta_reference/test_openai_responses.py @@ -122,7 +122,6 @@ async def fake_stream(fixture: str = "simple_chat_completion.yaml"): ) -@pytest.mark.asyncio async def test_create_openai_response_with_string_input(openai_responses_impl, mock_inference_api): """Test creating an OpenAI response with a simple string input.""" # Setup @@ -155,7 +154,6 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m assert result.output[0].content[0].text == "Dublin" -@pytest.mark.asyncio async def test_create_openai_response_with_string_input_with_tools(openai_responses_impl, mock_inference_api): """Test creating an OpenAI response with a simple string input and tools.""" # Setup @@ -224,7 +222,6 @@ async def test_create_openai_response_with_string_input_with_tools(openai_respon assert result.output[1].content[0].annotations == [] -@pytest.mark.asyncio async def test_create_openai_response_with_tool_call_type_none(openai_responses_impl, mock_inference_api): """Test creating an OpenAI response with a tool call response that has a type of None.""" # Setup @@ -294,7 +291,6 @@ async def test_create_openai_response_with_tool_call_type_none(openai_responses_ assert chunks[1].response.output[0].name == "get_weather" -@pytest.mark.asyncio async def test_create_openai_response_with_multiple_messages(openai_responses_impl, mock_inference_api): """Test creating an OpenAI response with multiple messages.""" # Setup @@ -340,7 +336,6 @@ async def test_create_openai_response_with_multiple_messages(openai_responses_im assert isinstance(inference_messages[i], OpenAIDeveloperMessageParam) -@pytest.mark.asyncio async def test_prepend_previous_response_none(openai_responses_impl): """Test prepending no previous response to a new response.""" @@ -348,7 +343,6 @@ async def test_prepend_previous_response_none(openai_responses_impl): assert input == "fake_input" -@pytest.mark.asyncio async def test_prepend_previous_response_basic(openai_responses_impl, mock_responses_store): """Test prepending a basic previous 
response to a new response.""" @@ -388,7 +382,6 @@ async def test_prepend_previous_response_basic(openai_responses_impl, mock_respo assert input[2].content == "fake_input" -@pytest.mark.asyncio async def test_prepend_previous_response_web_search(openai_responses_impl, mock_responses_store): """Test prepending a web search previous response to a new response.""" input_item_message = OpenAIResponseMessage( @@ -434,7 +427,6 @@ async def test_prepend_previous_response_web_search(openai_responses_impl, mock_ assert input[3].content == "fake_input" -@pytest.mark.asyncio async def test_create_openai_response_with_instructions(openai_responses_impl, mock_inference_api): # Setup input_text = "What is the capital of Ireland?" @@ -463,7 +455,6 @@ async def test_create_openai_response_with_instructions(openai_responses_impl, m assert sent_messages[1].content == input_text -@pytest.mark.asyncio async def test_create_openai_response_with_instructions_and_multiple_messages( openai_responses_impl, mock_inference_api ): @@ -508,7 +499,6 @@ async def test_create_openai_response_with_instructions_and_multiple_messages( assert sent_messages[3].content == "Which is the largest?" -@pytest.mark.asyncio async def test_create_openai_response_with_instructions_and_previous_response( openai_responses_impl, mock_responses_store, mock_inference_api ): @@ -565,7 +555,6 @@ async def test_create_openai_response_with_instructions_and_previous_response( assert sent_messages[3].content == "Which is the largest?" -@pytest.mark.asyncio async def test_list_openai_response_input_items_delegation(openai_responses_impl, mock_responses_store): """Test that list_openai_response_input_items properly delegates to responses_store with correct parameters.""" # Setup @@ -601,7 +590,6 @@ async def test_list_openai_response_input_items_delegation(openai_responses_impl assert result.data[0].id == "msg_123" -@pytest.mark.asyncio async def test_responses_store_list_input_items_logic(): """Test ResponsesStore list_response_input_items logic - mocks get_response_object to test actual ordering/limiting.""" @@ -680,7 +668,6 @@ async def test_responses_store_list_input_items_logic(): assert len(result.data) == 0 # Should return no items -@pytest.mark.asyncio async def test_store_response_uses_rehydrated_input_with_previous_response( openai_responses_impl, mock_responses_store, mock_inference_api ): @@ -747,7 +734,6 @@ async def test_store_response_uses_rehydrated_input_with_previous_response( assert result.status == "completed" -@pytest.mark.asyncio @pytest.mark.parametrize( "text_format, response_format", [ @@ -787,7 +773,6 @@ async def test_create_openai_response_with_text_format( assert first_call.kwargs["response_format"] == response_format -@pytest.mark.asyncio async def test_create_openai_response_with_invalid_text_format(openai_responses_impl, mock_inference_api): """Test creating an OpenAI response with an invalid text format.""" # Setup diff --git a/tests/unit/providers/agents/test_persistence_access_control.py b/tests/unit/providers/agents/test_persistence_access_control.py index 656d1e53c..26001fcf1 100644 --- a/tests/unit/providers/agents/test_persistence_access_control.py +++ b/tests/unit/providers/agents/test_persistence_access_control.py @@ -9,7 +9,6 @@ from datetime import datetime from unittest.mock import patch import pytest -import pytest_asyncio from llama_stack.apis.agents import Turn from llama_stack.apis.inference import CompletionMessage, StopReason @@ -17,13 +16,12 @@ from llama_stack.distribution.datatypes import User 
from llama_stack.providers.inline.agents.meta_reference.persistence import AgentPersistence, AgentSessionInfo -@pytest_asyncio.fixture +@pytest.fixture async def test_setup(sqlite_kvstore): agent_persistence = AgentPersistence(agent_id="test_agent", kvstore=sqlite_kvstore, policy={}) yield agent_persistence -@pytest.mark.asyncio @patch("llama_stack.providers.inline.agents.meta_reference.persistence.get_authenticated_user") async def test_session_creation_with_access_attributes(mock_get_authenticated_user, test_setup): agent_persistence = test_setup @@ -44,7 +42,6 @@ async def test_session_creation_with_access_attributes(mock_get_authenticated_us assert session_info.owner.attributes["teams"] == ["ai-team"] -@pytest.mark.asyncio @patch("llama_stack.providers.inline.agents.meta_reference.persistence.get_authenticated_user") async def test_session_access_control(mock_get_authenticated_user, test_setup): agent_persistence = test_setup @@ -79,7 +76,6 @@ async def test_session_access_control(mock_get_authenticated_user, test_setup): assert retrieved_session is None -@pytest.mark.asyncio @patch("llama_stack.providers.inline.agents.meta_reference.persistence.get_authenticated_user") async def test_turn_access_control(mock_get_authenticated_user, test_setup): agent_persistence = test_setup @@ -133,7 +129,6 @@ async def test_turn_access_control(mock_get_authenticated_user, test_setup): await agent_persistence.get_session_turns(session_id) -@pytest.mark.asyncio @patch("llama_stack.providers.inline.agents.meta_reference.persistence.get_authenticated_user") async def test_tool_call_and_infer_iters_access_control(mock_get_authenticated_user, test_setup): agent_persistence = test_setup diff --git a/tests/unit/providers/inference/test_remote_vllm.py b/tests/unit/providers/inference/test_remote_vllm.py index eaa9b40da..ca44cc95d 100644 --- a/tests/unit/providers/inference/test_remote_vllm.py +++ b/tests/unit/providers/inference/test_remote_vllm.py @@ -14,7 +14,6 @@ from typing import Any from unittest.mock import AsyncMock, MagicMock, patch import pytest -import pytest_asyncio from openai.types.chat.chat_completion_chunk import ( ChatCompletionChunk as OpenAIChatCompletionChunk, ) @@ -103,7 +102,7 @@ def mock_openai_models_list(): yield mock_list -@pytest_asyncio.fixture(scope="module") +@pytest.fixture(scope="module") async def vllm_inference_adapter(): config = VLLMInferenceAdapterConfig(url="http://mocked.localhost:12345") inference_adapter = VLLMInferenceAdapter(config) @@ -112,7 +111,6 @@ async def vllm_inference_adapter(): return inference_adapter -@pytest.mark.asyncio async def test_register_model_checks_vllm(mock_openai_models_list, vllm_inference_adapter): async def mock_openai_models(): yield OpenAIModel(id="foo", created=1, object="model", owned_by="test") @@ -125,7 +123,6 @@ async def test_register_model_checks_vllm(mock_openai_models_list, vllm_inferenc mock_openai_models_list.assert_called() -@pytest.mark.asyncio async def test_old_vllm_tool_choice(vllm_inference_adapter): """ Test that we set tool_choice to none when no tools are in use @@ -149,7 +146,6 @@ async def test_old_vllm_tool_choice(vllm_inference_adapter): assert request.tool_config.tool_choice == ToolChoice.none -@pytest.mark.asyncio async def test_tool_call_response(vllm_inference_adapter): """Verify that tool call arguments from a CompletionMessage are correctly converted into the expected JSON format.""" @@ -192,7 +188,6 @@ async def test_tool_call_response(vllm_inference_adapter): ] -@pytest.mark.asyncio async def 
test_tool_call_delta_empty_tool_call_buf(): """ Test that we don't generate extra chunks when processing a @@ -222,7 +217,6 @@ async def test_tool_call_delta_empty_tool_call_buf(): assert chunks[1].event.stop_reason == StopReason.end_of_turn -@pytest.mark.asyncio async def test_tool_call_delta_streaming_arguments_dict(): async def mock_stream(): mock_chunk_1 = OpenAIChatCompletionChunk( @@ -297,7 +291,6 @@ async def test_tool_call_delta_streaming_arguments_dict(): assert chunks[2].event.event_type.value == "complete" -@pytest.mark.asyncio async def test_multiple_tool_calls(): async def mock_stream(): mock_chunk_1 = OpenAIChatCompletionChunk( @@ -376,7 +369,6 @@ async def test_multiple_tool_calls(): assert chunks[3].event.event_type.value == "complete" -@pytest.mark.asyncio async def test_process_vllm_chat_completion_stream_response_no_choices(): """ Test that we don't error out when vLLM returns no choices for a @@ -453,7 +445,6 @@ def test_chat_completion_doesnt_block_event_loop(caplog): assert not asyncio_warnings -@pytest.mark.asyncio async def test_get_params_empty_tools(vllm_inference_adapter): request = ChatCompletionRequest( tools=[], @@ -464,7 +455,6 @@ async def test_get_params_empty_tools(vllm_inference_adapter): assert "tools" not in params -@pytest.mark.asyncio async def test_process_vllm_chat_completion_stream_response_tool_call_args_last_chunk(): """ Tests the edge case where the model returns the arguments for the tool call in the same chunk that @@ -543,7 +533,6 @@ async def test_process_vllm_chat_completion_stream_response_tool_call_args_last_ assert chunks[-2].event.delta.tool_call.arguments == mock_tool_arguments -@pytest.mark.asyncio async def test_process_vllm_chat_completion_stream_response_no_finish_reason(): """ Tests the edge case where the model requests a tool call and stays idle without explicitly providing the @@ -596,7 +585,6 @@ async def test_process_vllm_chat_completion_stream_response_no_finish_reason(): assert chunks[-2].event.delta.tool_call.arguments == mock_tool_arguments -@pytest.mark.asyncio async def test_process_vllm_chat_completion_stream_response_tool_without_args(): """ Tests the edge case where no arguments are provided for the tool call. @@ -645,7 +633,6 @@ async def test_process_vllm_chat_completion_stream_response_tool_without_args(): assert chunks[-2].event.delta.tool_call.arguments == {} -@pytest.mark.asyncio async def test_health_status_success(vllm_inference_adapter): """ Test the health method of VLLM InferenceAdapter when the connection is successful. @@ -679,7 +666,6 @@ async def test_health_status_success(vllm_inference_adapter): mock_models.list.assert_called_once() -@pytest.mark.asyncio async def test_health_status_failure(vllm_inference_adapter): """ Test the health method of VLLM InferenceAdapter when the connection fails. diff --git a/tests/unit/providers/utils/inference/test_openai_compat.py b/tests/unit/providers/utils/inference/test_openai_compat.py index 3598e4810..f57f6c9b3 100644 --- a/tests/unit/providers/utils/inference/test_openai_compat.py +++ b/tests/unit/providers/utils/inference/test_openai_compat.py @@ -4,7 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-import pytest from llama_stack.apis.common.content_types import TextContentItem from llama_stack.apis.inference import ( @@ -23,7 +22,6 @@ from llama_stack.providers.utils.inference.openai_compat import ( ) -@pytest.mark.asyncio async def test_convert_message_to_openai_dict(): message = UserMessage(content=[TextContentItem(text="Hello, world!")], role="user") assert await convert_message_to_openai_dict(message) == { @@ -33,7 +31,6 @@ async def test_convert_message_to_openai_dict(): # Test convert_message_to_openai_dict with a tool call -@pytest.mark.asyncio async def test_convert_message_to_openai_dict_with_tool_call(): message = CompletionMessage( content="", @@ -54,7 +51,6 @@ async def test_convert_message_to_openai_dict_with_tool_call(): } -@pytest.mark.asyncio async def test_convert_message_to_openai_dict_with_builtin_tool_call(): message = CompletionMessage( content="", @@ -80,7 +76,6 @@ async def test_convert_message_to_openai_dict_with_builtin_tool_call(): } -@pytest.mark.asyncio async def test_openai_messages_to_messages_with_content_str(): openai_messages = [ OpenAISystemMessageParam(content="system message"), @@ -98,7 +93,6 @@ async def test_openai_messages_to_messages_with_content_str(): assert llama_messages[2].content == "assistant message" -@pytest.mark.asyncio async def test_openai_messages_to_messages_with_content_list(): openai_messages = [ OpenAISystemMessageParam(content=[OpenAIChatCompletionContentPartTextParam(text="system message")]), diff --git a/tests/unit/providers/utils/memory/test_vector_store.py b/tests/unit/providers/utils/memory/test_vector_store.py index 220c21994..90b229262 100644 --- a/tests/unit/providers/utils/memory/test_vector_store.py +++ b/tests/unit/providers/utils/memory/test_vector_store.py @@ -13,7 +13,6 @@ from llama_stack.apis.tools import RAGDocument from llama_stack.providers.utils.memory.vector_store import content_from_data_and_mime_type, content_from_doc -@pytest.mark.asyncio async def test_content_from_doc_with_url(): """Test extracting content from RAGDocument with URL content.""" mock_url = URL(uri="https://example.com") @@ -33,7 +32,6 @@ async def test_content_from_doc_with_url(): mock_instance.get.assert_called_once_with(mock_url.uri) -@pytest.mark.asyncio async def test_content_from_doc_with_pdf_url(): """Test extracting content from RAGDocument with URL pointing to a PDF.""" mock_url = URL(uri="https://example.com/document.pdf") @@ -58,7 +56,6 @@ async def test_content_from_doc_with_pdf_url(): mock_parse_pdf.assert_called_once_with(b"PDF binary data") -@pytest.mark.asyncio async def test_content_from_doc_with_data_url(): """Test extracting content from RAGDocument with data URL content.""" data_url = "data:text/plain;base64,SGVsbG8gV29ybGQ=" # "Hello World" base64 encoded @@ -74,7 +71,6 @@ async def test_content_from_doc_with_data_url(): mock_content_from_data.assert_called_once_with(data_url) -@pytest.mark.asyncio async def test_content_from_doc_with_string(): """Test extracting content from RAGDocument with string content.""" content_string = "This is plain text content" @@ -85,7 +81,6 @@ async def test_content_from_doc_with_string(): assert result == content_string -@pytest.mark.asyncio async def test_content_from_doc_with_string_url(): """Test extracting content from RAGDocument with string URL content.""" url_string = "https://example.com" @@ -105,7 +100,6 @@ async def test_content_from_doc_with_string_url(): mock_instance.get.assert_called_once_with(url_string) -@pytest.mark.asyncio async def 
test_content_from_doc_with_string_pdf_url(): """Test extracting content from RAGDocument with string URL pointing to a PDF.""" url_string = "https://example.com/document.pdf" @@ -130,7 +124,6 @@ async def test_content_from_doc_with_string_pdf_url(): mock_parse_pdf.assert_called_once_with(b"PDF binary data") -@pytest.mark.asyncio async def test_content_from_doc_with_interleaved_content(): """Test extracting content from RAGDocument with InterleavedContent (the new case added in the commit).""" interleaved_content = [TextContentItem(text="First item"), TextContentItem(text="Second item")] diff --git a/tests/unit/providers/utils/test_model_registry.py b/tests/unit/providers/utils/test_model_registry.py index 10fa1e075..e11f95d49 100644 --- a/tests/unit/providers/utils/test_model_registry.py +++ b/tests/unit/providers/utils/test_model_registry.py @@ -87,18 +87,15 @@ def helper(known_provider_model: ProviderModelEntry, known_provider_model2: Prov return ModelRegistryHelper([known_provider_model, known_provider_model2]) -@pytest.mark.asyncio async def test_lookup_unknown_model(helper: ModelRegistryHelper, unknown_model: Model) -> None: assert helper.get_provider_model_id(unknown_model.model_id) is None -@pytest.mark.asyncio async def test_register_unknown_provider_model(helper: ModelRegistryHelper, unknown_model: Model) -> None: with pytest.raises(ValueError): await helper.register_model(unknown_model) -@pytest.mark.asyncio async def test_register_model(helper: ModelRegistryHelper, known_model: Model) -> None: model = Model( provider_id=known_model.provider_id, @@ -110,7 +107,6 @@ async def test_register_model(helper: ModelRegistryHelper, known_model: Model) - assert helper.get_provider_model_id(model.model_id) == model.provider_resource_id -@pytest.mark.asyncio async def test_register_model_from_alias(helper: ModelRegistryHelper, known_model: Model) -> None: model = Model( provider_id=known_model.provider_id, @@ -122,13 +118,11 @@ async def test_register_model_from_alias(helper: ModelRegistryHelper, known_mode assert helper.get_provider_model_id(model.model_id) == known_model.provider_resource_id -@pytest.mark.asyncio async def test_register_model_existing(helper: ModelRegistryHelper, known_model: Model) -> None: await helper.register_model(known_model) assert helper.get_provider_model_id(known_model.model_id) == known_model.provider_resource_id -@pytest.mark.asyncio async def test_register_model_existing_different( helper: ModelRegistryHelper, known_model: Model, known_model2: Model ) -> None: @@ -137,7 +131,6 @@ async def test_register_model_existing_different( await helper.register_model(known_model) -@pytest.mark.asyncio async def test_unregister_model(helper: ModelRegistryHelper, known_model: Model) -> None: await helper.register_model(known_model) # duplicate entry assert helper.get_provider_model_id(known_model.model_id) == known_model.provider_model_id @@ -145,18 +138,15 @@ async def test_unregister_model(helper: ModelRegistryHelper, known_model: Model) assert helper.get_provider_model_id(known_model.model_id) is None -@pytest.mark.asyncio async def test_unregister_unknown_model(helper: ModelRegistryHelper, unknown_model: Model) -> None: with pytest.raises(ValueError): await helper.unregister_model(unknown_model.model_id) -@pytest.mark.asyncio async def test_register_model_during_init(helper: ModelRegistryHelper, known_model: Model) -> None: assert helper.get_provider_model_id(known_model.provider_resource_id) == known_model.provider_model_id -@pytest.mark.asyncio async def 
test_unregister_model_during_init(helper: ModelRegistryHelper, known_model: Model) -> None: assert helper.get_provider_model_id(known_model.provider_resource_id) == known_model.provider_model_id await helper.unregister_model(known_model.provider_resource_id) diff --git a/tests/unit/providers/utils/test_scheduler.py b/tests/unit/providers/utils/test_scheduler.py index 25b4935de..e5ee74bfa 100644 --- a/tests/unit/providers/utils/test_scheduler.py +++ b/tests/unit/providers/utils/test_scheduler.py @@ -11,7 +11,6 @@ import pytest from llama_stack.providers.utils.scheduler import JobStatus, Scheduler -@pytest.mark.asyncio async def test_scheduler_unknown_backend(): with pytest.raises(ValueError): Scheduler(backend="unknown") @@ -26,7 +25,6 @@ async def wait_for_job_completed(sched: Scheduler, job_id: str) -> None: raise TimeoutError(f"Job {job_id} did not complete in time.") -@pytest.mark.asyncio async def test_scheduler_naive(): sched = Scheduler() @@ -87,7 +85,6 @@ async def test_scheduler_naive(): assert job.logs[0][0] < job.logs[1][0] -@pytest.mark.asyncio async def test_scheduler_naive_handler_raises(): sched = Scheduler() diff --git a/tests/unit/providers/vector_io/test_faiss.py b/tests/unit/providers/vector_io/test_faiss.py index 8348b84e3..90108d7a0 100644 --- a/tests/unit/providers/vector_io/test_faiss.py +++ b/tests/unit/providers/vector_io/test_faiss.py @@ -9,7 +9,6 @@ from unittest.mock import AsyncMock, MagicMock, patch import numpy as np import pytest -import pytest_asyncio from llama_stack.apis.files import Files from llama_stack.apis.inference import EmbeddingsResponse, Inference @@ -91,13 +90,13 @@ def faiss_config(): return config -@pytest_asyncio.fixture +@pytest.fixture async def faiss_index(embedding_dimension): index = await FaissIndex.create(dimension=embedding_dimension) yield index -@pytest_asyncio.fixture +@pytest.fixture async def faiss_adapter(faiss_config, mock_inference_api, mock_files_api) -> FaissVectorIOAdapter: # Create the adapter adapter = FaissVectorIOAdapter(config=faiss_config, inference_api=mock_inference_api, files_api=mock_files_api) @@ -113,7 +112,6 @@ async def faiss_adapter(faiss_config, mock_inference_api, mock_files_api) -> Fai yield adapter -@pytest.mark.asyncio async def test_faiss_query_vector_returns_infinity_when_query_and_embedding_are_identical( faiss_index, sample_chunks, sample_embeddings, embedding_dimension ): @@ -136,7 +134,6 @@ async def test_faiss_query_vector_returns_infinity_when_query_and_embedding_are_ assert response.chunks[1] == sample_chunks[1] -@pytest.mark.asyncio async def test_health_success(): """Test that the health check returns OK status when faiss is working correctly.""" # Create a fresh instance of FaissVectorIOAdapter for testing @@ -160,7 +157,6 @@ async def test_health_success(): mock_index_flat.assert_called_once_with(128) # VECTOR_DIMENSION is 128 -@pytest.mark.asyncio async def test_health_failure(): """Test that the health check returns ERROR status when faiss encounters an error.""" # Create a fresh instance of FaissVectorIOAdapter for testing diff --git a/tests/unit/providers/vector_io/test_qdrant.py b/tests/unit/providers/vector_io/test_qdrant.py index 6902c8850..d3ffe711c 100644 --- a/tests/unit/providers/vector_io/test_qdrant.py +++ b/tests/unit/providers/vector_io/test_qdrant.py @@ -10,7 +10,6 @@ from typing import Any from unittest.mock import AsyncMock, MagicMock, patch import pytest -import pytest_asyncio from llama_stack.apis.inference import EmbeddingsResponse, Inference from 
llama_stack.apis.vector_io import ( @@ -68,7 +67,7 @@ def mock_api_service(sample_embeddings): return mock_api_service -@pytest_asyncio.fixture +@pytest.fixture async def qdrant_adapter(qdrant_config, mock_vector_db_store, mock_api_service, loop) -> QdrantVectorIOAdapter: adapter = QdrantVectorIOAdapter(config=qdrant_config, inference_api=mock_api_service) adapter.vector_db_store = mock_vector_db_store @@ -80,7 +79,6 @@ async def qdrant_adapter(qdrant_config, mock_vector_db_store, mock_api_service, __QUERY = "Sample query" -@pytest.mark.asyncio @pytest.mark.parametrize("max_query_chunks, expected_chunks", [(2, 2), (100, 60)]) async def test_qdrant_adapter_returns_expected_chunks( qdrant_adapter: QdrantVectorIOAdapter, @@ -111,7 +109,6 @@ def _prepare_for_json(value: Any) -> str: @patch("llama_stack.providers.utils.telemetry.trace_protocol._prepare_for_json", new=_prepare_for_json) -@pytest.mark.asyncio async def test_qdrant_register_and_unregister_vector_db( qdrant_adapter: QdrantVectorIOAdapter, mock_vector_db, diff --git a/tests/unit/providers/vector_io/test_sqlite_vec.py b/tests/unit/providers/vector_io/test_sqlite_vec.py index 8579c31bb..a61eeeeca 100644 --- a/tests/unit/providers/vector_io/test_sqlite_vec.py +++ b/tests/unit/providers/vector_io/test_sqlite_vec.py @@ -8,7 +8,6 @@ import asyncio import numpy as np import pytest -import pytest_asyncio from llama_stack.apis.vector_io import Chunk, QueryChunksResponse from llama_stack.providers.inline.vector_io.sqlite_vec.sqlite_vec import ( @@ -34,7 +33,7 @@ def loop(): return asyncio.new_event_loop() -@pytest_asyncio.fixture +@pytest.fixture async def sqlite_vec_index(embedding_dimension, tmp_path_factory): temp_dir = tmp_path_factory.getbasetemp() db_path = str(temp_dir / "test_sqlite.db") @@ -43,14 +42,12 @@ async def sqlite_vec_index(embedding_dimension, tmp_path_factory): await index.delete() -@pytest.mark.asyncio async def test_query_chunk_metadata(sqlite_vec_index, sample_chunks_with_metadata, sample_embeddings_with_metadata): await sqlite_vec_index.add_chunks(sample_chunks_with_metadata, sample_embeddings_with_metadata) response = await sqlite_vec_index.query_vector(sample_embeddings_with_metadata[-1], k=2, score_threshold=0.0) assert response.chunks[0].chunk_metadata == sample_chunks_with_metadata[-1].chunk_metadata -@pytest.mark.asyncio async def test_query_chunks_full_text_search(sqlite_vec_index, sample_chunks, sample_embeddings): await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings) query_string = "Sentence 5" @@ -68,7 +65,6 @@ async def test_query_chunks_full_text_search(sqlite_vec_index, sample_chunks, sa assert len(response_no_results.chunks) == 0, f"Expected 0 results, but got {len(response_no_results.chunks)}" -@pytest.mark.asyncio async def test_query_chunks_hybrid(sqlite_vec_index, sample_chunks, sample_embeddings): await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings) @@ -90,7 +86,6 @@ async def test_query_chunks_hybrid(sqlite_vec_index, sample_chunks, sample_embed assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1)) -@pytest.mark.asyncio async def test_query_chunks_full_text_search_k_greater_than_results(sqlite_vec_index, sample_chunks, sample_embeddings): # Re-initialize with a clean index await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings) @@ -103,7 +98,6 @@ async def test_query_chunks_full_text_search_k_greater_than_results(sqlite_vec_i assert any("Sentence 1 from document 0" in chunk.content for chunk in response.chunks), 
"Expected chunk not found" -@pytest.mark.asyncio async def test_chunk_id_conflict(sqlite_vec_index, sample_chunks, embedding_dimension): """Test that chunk IDs do not conflict across batches when inserting chunks.""" # Reduce batch size to force multiple batches for same document @@ -134,7 +128,6 @@ async def sqlite_vec_adapter(sqlite_connection): await adapter.shutdown() -@pytest.mark.asyncio async def test_query_chunks_hybrid_no_keyword_matches(sqlite_vec_index, sample_chunks, sample_embeddings): """Test hybrid search when keyword search returns no matches - should still return vector results.""" await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings) @@ -163,7 +156,6 @@ async def test_query_chunks_hybrid_no_keyword_matches(sqlite_vec_index, sample_c assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1)) -@pytest.mark.asyncio async def test_query_chunks_hybrid_score_threshold(sqlite_vec_index, sample_chunks, sample_embeddings): """Test hybrid search with a high score threshold.""" await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings) @@ -185,7 +177,6 @@ async def test_query_chunks_hybrid_score_threshold(sqlite_vec_index, sample_chun assert len(response.chunks) == 0 -@pytest.mark.asyncio async def test_query_chunks_hybrid_different_embedding( sqlite_vec_index, sample_chunks, sample_embeddings, embedding_dimension ): @@ -211,7 +202,6 @@ async def test_query_chunks_hybrid_different_embedding( assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1)) -@pytest.mark.asyncio async def test_query_chunks_hybrid_rrf_ranking(sqlite_vec_index, sample_chunks, sample_embeddings): """Test that RRF properly combines rankings when documents appear in both search methods.""" await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings) @@ -236,7 +226,6 @@ async def test_query_chunks_hybrid_rrf_ranking(sqlite_vec_index, sample_chunks, assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1)) -@pytest.mark.asyncio async def test_query_chunks_hybrid_score_selection(sqlite_vec_index, sample_chunks, sample_embeddings): await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings) @@ -284,7 +273,6 @@ async def test_query_chunks_hybrid_score_selection(sqlite_vec_index, sample_chun assert response.scores[0] == pytest.approx(2.0 / 61.0, rel=1e-6) # Should behave like RRF -@pytest.mark.asyncio async def test_query_chunks_hybrid_mixed_results(sqlite_vec_index, sample_chunks, sample_embeddings): """Test hybrid search with documents that appear in only one search method.""" await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings) @@ -313,7 +301,6 @@ async def test_query_chunks_hybrid_mixed_results(sqlite_vec_index, sample_chunks assert "document-2" in doc_ids # From keyword search -@pytest.mark.asyncio async def test_query_chunks_hybrid_weighted_reranker_parametrization( sqlite_vec_index, sample_chunks, sample_embeddings ): @@ -369,7 +356,6 @@ async def test_query_chunks_hybrid_weighted_reranker_parametrization( ) -@pytest.mark.asyncio async def test_query_chunks_hybrid_rrf_impact_factor(sqlite_vec_index, sample_chunks, sample_embeddings): """Test RRFReRanker with different impact factors.""" await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings) @@ -401,7 +387,6 @@ async def test_query_chunks_hybrid_rrf_impact_factor(sqlite_vec_index, sample_ch assert response.scores[0] == pytest.approx(2.0 / 101.0, rel=1e-6) -@pytest.mark.asyncio 
async def test_query_chunks_hybrid_edge_cases(sqlite_vec_index, sample_chunks, sample_embeddings): await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings) @@ -445,7 +430,6 @@ async def test_query_chunks_hybrid_edge_cases(sqlite_vec_index, sample_chunks, s assert len(response.chunks) <= 100 -@pytest.mark.asyncio async def test_query_chunks_hybrid_tie_breaking( sqlite_vec_index, sample_embeddings, embedding_dimension, tmp_path_factory ): diff --git a/tests/unit/providers/vector_io/test_vector_io_openai_vector_stores.py b/tests/unit/providers/vector_io/test_vector_io_openai_vector_stores.py index 5f7926ce6..97e2f085e 100644 --- a/tests/unit/providers/vector_io/test_vector_io_openai_vector_stores.py +++ b/tests/unit/providers/vector_io/test_vector_io_openai_vector_stores.py @@ -25,12 +25,10 @@ from llama_stack.providers.remote.vector_io.milvus.milvus import VECTOR_DBS_PREF # -v -s --tb=short --disable-warnings --asyncio-mode=auto -@pytest.mark.asyncio async def test_initialize_index(vector_index): await vector_index.initialize() -@pytest.mark.asyncio async def test_add_chunks_query_vector(vector_index, sample_chunks, sample_embeddings): vector_index.delete() vector_index.initialize() @@ -40,7 +38,6 @@ async def test_add_chunks_query_vector(vector_index, sample_chunks, sample_embed vector_index.delete() -@pytest.mark.asyncio async def test_chunk_id_conflict(vector_index, sample_chunks, embedding_dimension): embeddings = np.random.rand(len(sample_chunks), embedding_dimension).astype(np.float32) await vector_index.add_chunks(sample_chunks, embeddings) @@ -54,7 +51,6 @@ async def test_chunk_id_conflict(vector_index, sample_chunks, embedding_dimensio assert len(contents) == len(set(contents)) -@pytest.mark.asyncio async def test_initialize_adapter_with_existing_kvstore(vector_io_adapter): key = f"{VECTOR_DBS_PREFIX}db1" dummy = VectorDB( @@ -65,7 +61,6 @@ async def test_initialize_adapter_with_existing_kvstore(vector_io_adapter): await vector_io_adapter.initialize() -@pytest.mark.asyncio async def test_persistence_across_adapter_restarts(vector_io_adapter): await vector_io_adapter.initialize() dummy = VectorDB( @@ -79,7 +74,6 @@ async def test_persistence_across_adapter_restarts(vector_io_adapter): await vector_io_adapter.shutdown() -@pytest.mark.asyncio async def test_register_and_unregister_vector_db(vector_io_adapter): unique_id = f"foo_db_{np.random.randint(1e6)}" dummy = VectorDB( @@ -92,14 +86,12 @@ async def test_register_and_unregister_vector_db(vector_io_adapter): assert dummy.identifier not in vector_io_adapter.cache -@pytest.mark.asyncio async def test_query_unregistered_raises(vector_io_adapter): fake_emb = np.zeros(8, dtype=np.float32) with pytest.raises(ValueError): await vector_io_adapter.query_chunks("no_such_db", fake_emb) -@pytest.mark.asyncio async def test_insert_chunks_calls_underlying_index(vector_io_adapter): fake_index = AsyncMock() vector_io_adapter._get_and_cache_vector_db_index = AsyncMock(return_value=fake_index) @@ -110,7 +102,6 @@ async def test_insert_chunks_calls_underlying_index(vector_io_adapter): fake_index.insert_chunks.assert_awaited_once_with(chunks) -@pytest.mark.asyncio async def test_insert_chunks_missing_db_raises(vector_io_adapter): vector_io_adapter._get_and_cache_vector_db_index = AsyncMock(return_value=None) @@ -118,7 +109,6 @@ async def test_insert_chunks_missing_db_raises(vector_io_adapter): await vector_io_adapter.insert_chunks("db_not_exist", []) -@pytest.mark.asyncio async def 
test_query_chunks_calls_underlying_index_and_returns(vector_io_adapter): expected = QueryChunksResponse(chunks=[Chunk(content="c1")], scores=[0.1]) fake_index = AsyncMock(query_chunks=AsyncMock(return_value=expected)) @@ -130,7 +120,6 @@ async def test_query_chunks_calls_underlying_index_and_returns(vector_io_adapter assert response is expected -@pytest.mark.asyncio async def test_query_chunks_missing_db_raises(vector_io_adapter): vector_io_adapter._get_and_cache_vector_db_index = AsyncMock(return_value=None) @@ -138,7 +127,6 @@ async def test_query_chunks_missing_db_raises(vector_io_adapter): await vector_io_adapter.query_chunks("db_missing", "q", None) -@pytest.mark.asyncio async def test_save_openai_vector_store(vector_io_adapter): store_id = "vs_1234" openai_vector_store = { @@ -155,7 +143,6 @@ async def test_save_openai_vector_store(vector_io_adapter): assert vector_io_adapter.openai_vector_stores[openai_vector_store["id"]] == openai_vector_store -@pytest.mark.asyncio async def test_update_openai_vector_store(vector_io_adapter): store_id = "vs_1234" openai_vector_store = { @@ -172,7 +159,6 @@ async def test_update_openai_vector_store(vector_io_adapter): assert vector_io_adapter.openai_vector_stores[openai_vector_store["id"]] == openai_vector_store -@pytest.mark.asyncio async def test_delete_openai_vector_store(vector_io_adapter): store_id = "vs_1234" openai_vector_store = { @@ -188,7 +174,6 @@ async def test_delete_openai_vector_store(vector_io_adapter): assert openai_vector_store["id"] not in vector_io_adapter.openai_vector_stores -@pytest.mark.asyncio async def test_load_openai_vector_stores(vector_io_adapter): store_id = "vs_1234" openai_vector_store = { @@ -204,7 +189,6 @@ async def test_load_openai_vector_stores(vector_io_adapter): assert loaded_stores[store_id] == openai_vector_store -@pytest.mark.asyncio async def test_save_openai_vector_store_file(vector_io_adapter, tmp_path_factory): store_id = "vs_1234" file_id = "file_1234" @@ -226,7 +210,6 @@ async def test_save_openai_vector_store_file(vector_io_adapter, tmp_path_factory await vector_io_adapter._save_openai_vector_store_file(store_id, file_id, file_info, file_contents) -@pytest.mark.asyncio async def test_update_openai_vector_store_file(vector_io_adapter, tmp_path_factory): store_id = "vs_1234" file_id = "file_1234" @@ -260,7 +243,6 @@ async def test_update_openai_vector_store_file(vector_io_adapter, tmp_path_facto assert loaded_contents != file_info -@pytest.mark.asyncio async def test_load_openai_vector_store_file_contents(vector_io_adapter, tmp_path_factory): store_id = "vs_1234" file_id = "file_1234" @@ -284,7 +266,6 @@ async def test_load_openai_vector_store_file_contents(vector_io_adapter, tmp_pat assert loaded_contents == file_contents -@pytest.mark.asyncio async def test_delete_openai_vector_store_file_from_storage(vector_io_adapter, tmp_path_factory): store_id = "vs_1234" file_id = "file_1234" diff --git a/tests/unit/rag/test_rag_query.py b/tests/unit/rag/test_rag_query.py index d2dd1783b..b2baa744a 100644 --- a/tests/unit/rag/test_rag_query.py +++ b/tests/unit/rag/test_rag_query.py @@ -17,13 +17,11 @@ from llama_stack.providers.inline.tool_runtime.rag.memory import MemoryToolRunti class TestRagQuery: - @pytest.mark.asyncio async def test_query_raises_on_empty_vector_db_ids(self): rag_tool = MemoryToolRuntimeImpl(config=MagicMock(), vector_io_api=MagicMock(), inference_api=MagicMock()) with pytest.raises(ValueError): await rag_tool.query(content=MagicMock(), vector_db_ids=[]) - @pytest.mark.asyncio async def 
test_query_chunk_metadata_handling(self): rag_tool = MemoryToolRuntimeImpl(config=MagicMock(), vector_io_api=MagicMock(), inference_api=MagicMock()) content = "test query content" diff --git a/tests/unit/rag/test_vector_store.py b/tests/unit/rag/test_vector_store.py index 9d6b9ee67..dd36d3992 100644 --- a/tests/unit/rag/test_vector_store.py +++ b/tests/unit/rag/test_vector_store.py @@ -112,7 +112,6 @@ class TestValidateEmbedding: class TestVectorStore: - @pytest.mark.asyncio async def test_returns_content_from_pdf_data_uri(self): data_uri = data_url_from_file(DUMMY_PDF_PATH) doc = RAGDocument( @@ -124,7 +123,6 @@ class TestVectorStore: content = await content_from_doc(doc) assert content in DUMMY_PDF_TEXT_CHOICES - @pytest.mark.asyncio async def test_downloads_pdf_and_returns_content(self): # Using GitHub to host the PDF file url = "https://raw.githubusercontent.com/meta-llama/llama-stack/da035d69cfca915318eaf485770a467ca3c2a238/llama_stack/providers/tests/memory/fixtures/dummy.pdf" @@ -137,7 +135,6 @@ class TestVectorStore: content = await content_from_doc(doc) assert content in DUMMY_PDF_TEXT_CHOICES - @pytest.mark.asyncio async def test_downloads_pdf_and_returns_content_with_url_object(self): # Using GitHub to host the PDF file url = "https://raw.githubusercontent.com/meta-llama/llama-stack/da035d69cfca915318eaf485770a467ca3c2a238/llama_stack/providers/tests/memory/fixtures/dummy.pdf" @@ -204,7 +201,6 @@ class TestVectorStore: class TestVectorDBWithIndex: - @pytest.mark.asyncio async def test_insert_chunks_without_embeddings(self): mock_vector_db = MagicMock() mock_vector_db.embedding_model = "test-model without embeddings" @@ -230,7 +226,6 @@ class TestVectorDBWithIndex: assert args[0] == chunks assert np.array_equal(args[1], np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]], dtype=np.float32)) - @pytest.mark.asyncio async def test_insert_chunks_with_valid_embeddings(self): mock_vector_db = MagicMock() mock_vector_db.embedding_model = "test-model with embeddings" @@ -255,7 +250,6 @@ class TestVectorDBWithIndex: assert args[0] == chunks assert np.array_equal(args[1], np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]], dtype=np.float32)) - @pytest.mark.asyncio async def test_insert_chunks_with_invalid_embeddings(self): mock_vector_db = MagicMock() mock_vector_db.embedding_dimension = 3 @@ -295,7 +289,6 @@ class TestVectorDBWithIndex: mock_inference_api.embeddings.assert_not_called() mock_index.add_chunks.assert_not_called() - @pytest.mark.asyncio async def test_insert_chunks_with_partially_precomputed_embeddings(self): mock_vector_db = MagicMock() mock_vector_db.embedding_model = "test-model with partial embeddings" diff --git a/tests/unit/registry/test_registry.py b/tests/unit/registry/test_registry.py index 909581bb7..87fe18d54 100644 --- a/tests/unit/registry/test_registry.py +++ b/tests/unit/registry/test_registry.py @@ -38,14 +38,12 @@ def sample_model(): ) -@pytest.mark.asyncio async def test_registry_initialization(disk_dist_registry): # Test empty registry result = await disk_dist_registry.get("nonexistent", "nonexistent") assert result is None -@pytest.mark.asyncio async def test_basic_registration(disk_dist_registry, sample_vector_db, sample_model): print(f"Registering {sample_vector_db}") await disk_dist_registry.register(sample_vector_db) @@ -64,7 +62,6 @@ async def test_basic_registration(disk_dist_registry, sample_vector_db, sample_m assert result_model.provider_id == sample_model.provider_id -@pytest.mark.asyncio async def test_cached_registry_initialization(sqlite_kvstore, 
sample_vector_db, sample_model): # First populate the disk registry disk_registry = DiskDistributionRegistry(sqlite_kvstore) @@ -85,7 +82,6 @@ async def test_cached_registry_initialization(sqlite_kvstore, sample_vector_db, assert result_vector_db.provider_id == sample_vector_db.provider_id -@pytest.mark.asyncio async def test_cached_registry_updates(cached_disk_dist_registry): new_vector_db = VectorDB( identifier="test_vector_db_2", @@ -112,7 +108,6 @@ async def test_cached_registry_updates(cached_disk_dist_registry): assert result_vector_db.provider_id == new_vector_db.provider_id -@pytest.mark.asyncio async def test_duplicate_provider_registration(cached_disk_dist_registry): original_vector_db = VectorDB( identifier="test_vector_db_2", @@ -137,7 +132,6 @@ async def test_duplicate_provider_registration(cached_disk_dist_registry): assert result.embedding_model == original_vector_db.embedding_model # Original values preserved -@pytest.mark.asyncio async def test_get_all_objects(cached_disk_dist_registry): # Create multiple test banks # Create multiple test banks @@ -170,7 +164,6 @@ async def test_get_all_objects(cached_disk_dist_registry): assert stored_vector_db.embedding_dimension == original_vector_db.embedding_dimension -@pytest.mark.asyncio async def test_parse_registry_values_error_handling(sqlite_kvstore): valid_db = VectorDB( identifier="valid_vector_db", @@ -209,7 +202,6 @@ async def test_parse_registry_values_error_handling(sqlite_kvstore): assert invalid_obj is None -@pytest.mark.asyncio async def test_cached_registry_error_handling(sqlite_kvstore): valid_db = VectorDB( identifier="valid_cached_db", diff --git a/tests/unit/registry/test_registry_acl.py b/tests/unit/registry/test_registry_acl.py index 48b3ac51b..6cfb20944 100644 --- a/tests/unit/registry/test_registry_acl.py +++ b/tests/unit/registry/test_registry_acl.py @@ -5,14 +5,11 @@ # the root directory of this source tree. 
-import pytest - from llama_stack.apis.models import ModelType from llama_stack.distribution.datatypes import ModelWithOwner, User from llama_stack.distribution.store.registry import CachedDiskDistributionRegistry -@pytest.mark.asyncio async def test_registry_cache_with_acl(cached_disk_dist_registry): model = ModelWithOwner( identifier="model-acl", @@ -48,7 +45,6 @@ async def test_registry_cache_with_acl(cached_disk_dist_registry): assert new_model.owner.attributes["teams"] == ["ai-team"] -@pytest.mark.asyncio async def test_registry_empty_acl(cached_disk_dist_registry): model = ModelWithOwner( identifier="model-empty-acl", @@ -85,7 +81,6 @@ async def test_registry_empty_acl(cached_disk_dist_registry): assert len(all_models) == 2 -@pytest.mark.asyncio async def test_registry_serialization(cached_disk_dist_registry): attributes = { "roles": ["admin", "researcher"], diff --git a/tests/unit/server/test_access_control.py b/tests/unit/server/test_access_control.py index af03ddacb..fb9c6f95e 100644 --- a/tests/unit/server/test_access_control.py +++ b/tests/unit/server/test_access_control.py @@ -7,7 +7,6 @@ from unittest.mock import MagicMock, Mock, patch import pytest -import pytest_asyncio import yaml from pydantic import TypeAdapter, ValidationError @@ -27,7 +26,7 @@ def _return_model(model): return model -@pytest_asyncio.fixture +@pytest.fixture async def test_setup(cached_disk_dist_registry): mock_inference = Mock() mock_inference.__provider_spec__ = MagicMock() @@ -41,7 +40,6 @@ async def test_setup(cached_disk_dist_registry): yield cached_disk_dist_registry, routing_table -@pytest.mark.asyncio @patch("llama_stack.distribution.routing_tables.common.get_authenticated_user") async def test_access_control_with_cache(mock_get_authenticated_user, test_setup): registry, routing_table = test_setup @@ -106,7 +104,6 @@ async def test_access_control_with_cache(mock_get_authenticated_user, test_setup await routing_table.get_model("model-admin") -@pytest.mark.asyncio @patch("llama_stack.distribution.routing_tables.common.get_authenticated_user") async def test_access_control_and_updates(mock_get_authenticated_user, test_setup): registry, routing_table = test_setup @@ -145,7 +142,6 @@ async def test_access_control_and_updates(mock_get_authenticated_user, test_setu assert model.identifier == "model-updates" -@pytest.mark.asyncio @patch("llama_stack.distribution.routing_tables.common.get_authenticated_user") async def test_access_control_empty_attributes(mock_get_authenticated_user, test_setup): registry, routing_table = test_setup @@ -170,7 +166,6 @@ async def test_access_control_empty_attributes(mock_get_authenticated_user, test assert "model-empty-attrs" in model_ids -@pytest.mark.asyncio @patch("llama_stack.distribution.routing_tables.common.get_authenticated_user") async def test_no_user_attributes(mock_get_authenticated_user, test_setup): registry, routing_table = test_setup @@ -201,7 +196,6 @@ async def test_no_user_attributes(mock_get_authenticated_user, test_setup): assert all_models.data[0].identifier == "model-public-2" -@pytest.mark.asyncio @patch("llama_stack.distribution.routing_tables.common.get_authenticated_user") async def test_automatic_access_attributes(mock_get_authenticated_user, test_setup): """Test that newly created resources inherit access attributes from their creator.""" @@ -246,7 +240,7 @@ async def test_automatic_access_attributes(mock_get_authenticated_user, test_set assert model.identifier == "auto-access-model" -@pytest_asyncio.fixture +@pytest.fixture async def 
test_setup_with_access_policy(cached_disk_dist_registry): mock_inference = Mock() mock_inference.__provider_spec__ = MagicMock() @@ -281,7 +275,6 @@ async def test_setup_with_access_policy(cached_disk_dist_registry): yield routing_table -@pytest.mark.asyncio @patch("llama_stack.distribution.routing_tables.common.get_authenticated_user") async def test_access_policy(mock_get_authenticated_user, test_setup_with_access_policy): routing_table = test_setup_with_access_policy diff --git a/tests/unit/server/test_auth.py b/tests/unit/server/test_auth.py index 39d6af1c8..7012a7f17 100644 --- a/tests/unit/server/test_auth.py +++ b/tests/unit/server/test_auth.py @@ -202,7 +202,6 @@ def test_http_auth_request_payload(http_client, valid_api_key, mock_auth_endpoin assert "param2" in payload["request"]["params"] -@pytest.mark.asyncio async def test_http_middleware_with_access_attributes(mock_http_middleware, mock_scope): """Test HTTP middleware behavior with access attributes""" middleware, mock_app = mock_http_middleware diff --git a/tests/unit/server/test_resolver.py b/tests/unit/server/test_resolver.py index acf4da0a3..a348590b1 100644 --- a/tests/unit/server/test_resolver.py +++ b/tests/unit/server/test_resolver.py @@ -9,7 +9,6 @@ import sys from typing import Any, Protocol from unittest.mock import AsyncMock, MagicMock -import pytest from pydantic import BaseModel, Field from llama_stack.apis.inference import Inference @@ -66,7 +65,6 @@ class SampleImpl: pass -@pytest.mark.asyncio async def test_resolve_impls_basic(): # Create a real provider spec provider_spec = InlineProviderSpec( diff --git a/tests/unit/server/test_sse.py b/tests/unit/server/test_sse.py index 60e9f4609..d42857186 100644 --- a/tests/unit/server/test_sse.py +++ b/tests/unit/server/test_sse.py @@ -7,13 +7,10 @@ import asyncio from unittest.mock import AsyncMock, MagicMock -import pytest - from llama_stack.apis.common.responses import PaginatedResponse from llama_stack.distribution.server.server import create_dynamic_typed_route, create_sse_event, sse_generator -@pytest.mark.asyncio async def test_sse_generator_basic(): # An AsyncIterator wrapped in an Awaitable, just like our web methods async def async_event_gen(): @@ -35,7 +32,6 @@ async def test_sse_generator_basic(): assert seen_events[1] == create_sse_event("Test event 2") -@pytest.mark.asyncio async def test_sse_generator_client_disconnected(): # An AsyncIterator wrapped in an Awaitable, just like our web methods async def async_event_gen(): @@ -58,7 +54,6 @@ async def test_sse_generator_client_disconnected(): assert seen_events[0] == create_sse_event("Test event 1") -@pytest.mark.asyncio async def test_sse_generator_client_disconnected_before_response_starts(): # Disconnect before the response starts async def async_event_gen(): @@ -75,7 +70,6 @@ async def test_sse_generator_client_disconnected_before_response_starts(): assert len(seen_events) == 0 -@pytest.mark.asyncio async def test_sse_generator_error_before_response_starts(): # Raise an error before the response starts async def async_event_gen(): @@ -93,7 +87,6 @@ async def test_sse_generator_error_before_response_starts(): assert 'data: {"error":' in seen_events[0] -@pytest.mark.asyncio async def test_paginated_response_url_setting(): """Test that PaginatedResponse gets url set to route path.""" diff --git a/tests/unit/utils/inference/test_inference_store.py b/tests/unit/utils/inference/test_inference_store.py index de619c760..730f54a05 100644 --- a/tests/unit/utils/inference/test_inference_store.py +++ 
b/tests/unit/utils/inference/test_inference_store.py @@ -42,7 +42,6 @@ def create_test_chat_completion( ) -@pytest.mark.asyncio async def test_inference_store_pagination_basic(): """Test basic pagination functionality.""" with TemporaryDirectory() as tmp_dir: @@ -88,7 +87,6 @@ async def test_inference_store_pagination_basic(): assert result3.has_more is False -@pytest.mark.asyncio async def test_inference_store_pagination_ascending(): """Test pagination with ascending order.""" with TemporaryDirectory() as tmp_dir: @@ -123,7 +121,6 @@ async def test_inference_store_pagination_ascending(): assert result2.has_more is True -@pytest.mark.asyncio async def test_inference_store_pagination_with_model_filter(): """Test pagination combined with model filtering.""" with TemporaryDirectory() as tmp_dir: @@ -161,7 +158,6 @@ async def test_inference_store_pagination_with_model_filter(): assert result2.has_more is False -@pytest.mark.asyncio async def test_inference_store_pagination_invalid_after(): """Test error handling for invalid 'after' parameter.""" with TemporaryDirectory() as tmp_dir: @@ -174,7 +170,6 @@ async def test_inference_store_pagination_invalid_after(): await store.list_chat_completions(after="non-existent", limit=2) -@pytest.mark.asyncio async def test_inference_store_pagination_no_limit(): """Test pagination behavior when no limit is specified.""" with TemporaryDirectory() as tmp_dir: diff --git a/tests/unit/utils/responses/test_responses_store.py b/tests/unit/utils/responses/test_responses_store.py index 3f25e2524..44d4b30da 100644 --- a/tests/unit/utils/responses/test_responses_store.py +++ b/tests/unit/utils/responses/test_responses_store.py @@ -44,7 +44,6 @@ def create_test_response_input(content: str, input_id: str) -> OpenAIResponseInp ) -@pytest.mark.asyncio async def test_responses_store_pagination_basic(): """Test basic pagination functionality for responses store.""" with TemporaryDirectory() as tmp_dir: @@ -90,7 +89,6 @@ async def test_responses_store_pagination_basic(): assert result3.has_more is False -@pytest.mark.asyncio async def test_responses_store_pagination_ascending(): """Test pagination with ascending order.""" with TemporaryDirectory() as tmp_dir: @@ -125,7 +123,6 @@ async def test_responses_store_pagination_ascending(): assert result2.has_more is True -@pytest.mark.asyncio async def test_responses_store_pagination_with_model_filter(): """Test pagination combined with model filtering.""" with TemporaryDirectory() as tmp_dir: @@ -163,7 +160,6 @@ async def test_responses_store_pagination_with_model_filter(): assert result2.has_more is False -@pytest.mark.asyncio async def test_responses_store_pagination_invalid_after(): """Test error handling for invalid 'after' parameter.""" with TemporaryDirectory() as tmp_dir: @@ -176,7 +172,6 @@ async def test_responses_store_pagination_invalid_after(): await store.list_responses(after="non-existent", limit=2) -@pytest.mark.asyncio async def test_responses_store_pagination_no_limit(): """Test pagination behavior when no limit is specified.""" with TemporaryDirectory() as tmp_dir: @@ -205,7 +200,6 @@ async def test_responses_store_pagination_no_limit(): assert result.has_more is False -@pytest.mark.asyncio async def test_responses_store_get_response_object(): """Test retrieving a single response object.""" with TemporaryDirectory() as tmp_dir: @@ -230,7 +224,6 @@ async def test_responses_store_get_response_object(): await store.get_response_object("non-existent") -@pytest.mark.asyncio async def 
test_responses_store_input_items_pagination(): """Test pagination functionality for input items.""" with TemporaryDirectory() as tmp_dir: @@ -308,7 +301,6 @@ async def test_responses_store_input_items_pagination(): await store.list_response_input_items("test-resp", before="some-id", after="other-id") -@pytest.mark.asyncio async def test_responses_store_input_items_before_pagination(): """Test before pagination functionality for input items.""" with TemporaryDirectory() as tmp_dir: diff --git a/tests/unit/utils/sqlstore/test_sqlstore.py b/tests/unit/utils/sqlstore/test_sqlstore.py index c4230a396..778f0b658 100644 --- a/tests/unit/utils/sqlstore/test_sqlstore.py +++ b/tests/unit/utils/sqlstore/test_sqlstore.py @@ -14,7 +14,6 @@ from llama_stack.providers.utils.sqlstore.sqlalchemy_sqlstore import SqlAlchemyS from llama_stack.providers.utils.sqlstore.sqlstore import SqliteSqlStoreConfig -@pytest.mark.asyncio async def test_sqlite_sqlstore(): with TemporaryDirectory() as tmp_dir: db_name = "test.db" @@ -66,7 +65,6 @@ async def test_sqlite_sqlstore(): assert result.has_more is False -@pytest.mark.asyncio async def test_sqlstore_pagination_basic(): """Test basic pagination functionality at the SQL store level.""" with TemporaryDirectory() as tmp_dir: @@ -131,7 +129,6 @@ async def test_sqlstore_pagination_basic(): assert result3.has_more is False -@pytest.mark.asyncio async def test_sqlstore_pagination_with_filter(): """Test pagination with WHERE conditions.""" with TemporaryDirectory() as tmp_dir: @@ -184,7 +181,6 @@ async def test_sqlstore_pagination_with_filter(): assert result2.has_more is False -@pytest.mark.asyncio async def test_sqlstore_pagination_ascending_order(): """Test pagination with ascending order.""" with TemporaryDirectory() as tmp_dir: @@ -233,7 +229,6 @@ async def test_sqlstore_pagination_ascending_order(): assert result2.has_more is True -@pytest.mark.asyncio async def test_sqlstore_pagination_multi_column_ordering_error(): """Test that multi-column ordering raises an error when using cursor pagination.""" with TemporaryDirectory() as tmp_dir: @@ -271,7 +266,6 @@ async def test_sqlstore_pagination_multi_column_ordering_error(): assert result.data[0]["id"] == "task1" -@pytest.mark.asyncio async def test_sqlstore_pagination_cursor_requires_order_by(): """Test that cursor pagination requires order_by parameter.""" with TemporaryDirectory() as tmp_dir: @@ -289,7 +283,6 @@ async def test_sqlstore_pagination_cursor_requires_order_by(): ) -@pytest.mark.asyncio async def test_sqlstore_pagination_error_handling(): """Test error handling for invalid columns and cursor IDs.""" with TemporaryDirectory() as tmp_dir: @@ -339,7 +332,6 @@ async def test_sqlstore_pagination_error_handling(): ) -@pytest.mark.asyncio async def test_sqlstore_pagination_custom_key_column(): """Test pagination with custom primary key column (not 'id').""" with TemporaryDirectory() as tmp_dir: diff --git a/tests/unit/utils/test_authorized_sqlstore.py b/tests/unit/utils/test_authorized_sqlstore.py index 61763719a..066f67a98 100644 --- a/tests/unit/utils/test_authorized_sqlstore.py +++ b/tests/unit/utils/test_authorized_sqlstore.py @@ -7,8 +7,6 @@ from tempfile import TemporaryDirectory from unittest.mock import patch -import pytest - from llama_stack.distribution.access_control.access_control import default_policy, is_action_allowed from llama_stack.distribution.access_control.datatypes import Action from llama_stack.distribution.datatypes import User @@ -18,7 +16,6 @@ from 
llama_stack.providers.utils.sqlstore.sqlalchemy_sqlstore import SqlAlchemyS from llama_stack.providers.utils.sqlstore.sqlstore import SqliteSqlStoreConfig -@pytest.mark.asyncio @patch("llama_stack.providers.utils.sqlstore.authorized_sqlstore.get_authenticated_user") async def test_authorized_fetch_with_where_sql_access_control(mock_get_authenticated_user): """Test that fetch_all works correctly with where_sql for access control""" @@ -81,7 +78,6 @@ async def test_authorized_fetch_with_where_sql_access_control(mock_get_authentic assert row["title"] == "User Document" -@pytest.mark.asyncio @patch("llama_stack.providers.utils.sqlstore.authorized_sqlstore.get_authenticated_user") async def test_sql_policy_consistency(mock_get_authenticated_user): """Test that SQL WHERE clause logic exactly matches is_action_allowed policy logic""" @@ -168,7 +164,6 @@ async def test_sql_policy_consistency(mock_get_authenticated_user): ) -@pytest.mark.asyncio @patch("llama_stack.providers.utils.sqlstore.authorized_sqlstore.get_authenticated_user") async def test_authorized_store_user_attribute_capture(mock_get_authenticated_user): """Test that user attributes are properly captured during insert""" diff --git a/uv.lock b/uv.lock index 8374fe38a..fe50f88aa 100644 --- a/uv.lock +++ b/uv.lock @@ -1394,8 +1394,8 @@ dev = [ { name = "black" }, { name = "nbval" }, { name = "pre-commit" }, - { name = "pytest" }, - { name = "pytest-asyncio" }, + { name = "pytest", specifier = ">=8.4" }, + { name = "pytest-asyncio", specifier = ">=1.0" }, { name = "pytest-cov" }, { name = "pytest-html" }, { name = "pytest-json-report" }, @@ -2432,29 +2432,30 @@ wheels = [ [[package]] name = "pytest" -version = "8.3.4" +version = "8.4.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "colorama", marker = "sys_platform == 'win32'" }, { name = "iniconfig" }, { name = "packaging" }, { name = "pluggy" }, + { name = "pygments" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/05/35/30e0d83068951d90a01852cb1cef56e5d8a09d20c7f511634cc2f7e0372a/pytest-8.3.4.tar.gz", hash = "sha256:965370d062bce11e73868e0335abac31b4d3de0e82f4007408d242b4f8610761", size = 1445919, upload-time = "2024-12-01T12:54:25.98Z" } +sdist = { url = "https://files.pythonhosted.org/packages/08/ba/45911d754e8eba3d5a841a5ce61a65a685ff1798421ac054f85aa8747dfb/pytest-8.4.1.tar.gz", hash = "sha256:7c67fd69174877359ed9371ec3af8a3d2b04741818c51e5e99cc1742251fa93c", size = 1517714, upload-time = "2025-06-18T05:48:06.109Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/11/92/76a1c94d3afee238333bc0a42b82935dd8f9cf8ce9e336ff87ee14d9e1cf/pytest-8.3.4-py3-none-any.whl", hash = "sha256:50e16d954148559c9a74109af1eaf0c945ba2d8f30f0a3d3335edde19788b6f6", size = 343083, upload-time = "2024-12-01T12:54:19.735Z" }, + { url = "https://files.pythonhosted.org/packages/29/16/c8a903f4c4dffe7a12843191437d7cd8e32751d5de349d45d3fe69544e87/pytest-8.4.1-py3-none-any.whl", hash = "sha256:539c70ba6fcead8e78eebbf1115e8b589e7565830d7d006a8723f19ac8a0afb7", size = 365474, upload-time = "2025-06-18T05:48:03.955Z" }, ] [[package]] name = "pytest-asyncio" -version = "0.25.3" +version = "1.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "pytest" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/f2/a8/ecbc8ede70921dd2f544ab1cadd3ff3bf842af27f87bbdea774c7baa1d38/pytest_asyncio-0.25.3.tar.gz", hash = "sha256:fc1da2cf9f125ada7e710b4ddad05518d4cee187ae9412e9ac9271003497f07a", size = 54239, upload-time = 
"2025-01-28T18:37:58.729Z" } +sdist = { url = "https://files.pythonhosted.org/packages/d0/d4/14f53324cb1a6381bef29d698987625d80052bb33932d8e7cbf9b337b17c/pytest_asyncio-1.0.0.tar.gz", hash = "sha256:d15463d13f4456e1ead2594520216b225a16f781e144f8fdf6c5bb4667c48b3f", size = 46960, upload-time = "2025-05-26T04:54:40.484Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/67/17/3493c5624e48fd97156ebaec380dcaafee9506d7e2c46218ceebbb57d7de/pytest_asyncio-0.25.3-py3-none-any.whl", hash = "sha256:9e89518e0f9bd08928f97a3482fdc4e244df17529460bc038291ccaf8f85c7c3", size = 19467, upload-time = "2025-01-28T18:37:56.798Z" }, + { url = "https://files.pythonhosted.org/packages/30/05/ce271016e351fddc8399e546f6e23761967ee09c8c568bbfbecb0c150171/pytest_asyncio-1.0.0-py3-none-any.whl", hash = "sha256:4f024da9f1ef945e680dc68610b52550e36590a67fd31bb3b4943979a1f90ef3", size = 15976, upload-time = "2025-05-26T04:54:39.035Z" }, ] [[package]] From aa2595c7c3a4145951acdf8c7bc5247c72ded353 Mon Sep 17 00:00:00 2001 From: Jorge Piedrahita Ortiz Date: Fri, 11 Jul 2025 15:29:15 -0500 Subject: [PATCH 4/8] fix: sambanova shields and model validation (#2693) # What does this PR do? Update the shield register validation of Sambanova not to raise, but only warn when a model is not available in the base url endpoint used, also added warnings when model is not available in the base url endpoint used ## Test Plan run starter distro with Sambanova enabled --- .../remote/inference/sambanova/sambanova.py | 21 ++++++++++++++++++- .../remote/safety/sambanova/sambanova.py | 21 ++++++++++--------- .../inference/test_openai_completion.py | 1 - 3 files changed, 31 insertions(+), 12 deletions(-) diff --git a/llama_stack/providers/remote/inference/sambanova/sambanova.py b/llama_stack/providers/remote/inference/sambanova/sambanova.py index 20f863665..9c2dda889 100644 --- a/llama_stack/providers/remote/inference/sambanova/sambanova.py +++ b/llama_stack/providers/remote/inference/sambanova/sambanova.py @@ -7,6 +7,7 @@ import json from collections.abc import Iterable +import requests from openai.types.chat import ( ChatCompletionAssistantMessageParam as OpenAIChatCompletionAssistantMessage, ) @@ -56,6 +57,7 @@ from llama_stack.apis.inference import ( ToolResponseMessage, UserMessage, ) +from llama_stack.apis.models import Model from llama_stack.log import get_logger from llama_stack.models.llama.datatypes import BuiltinTool from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin @@ -176,10 +178,11 @@ class SambaNovaInferenceAdapter(LiteLLMOpenAIMixin): def __init__(self, config: SambaNovaImplConfig): self.config = config + self.environment_available_models = [] LiteLLMOpenAIMixin.__init__( self, model_entries=MODEL_ENTRIES, - api_key_from_config=self.config.api_key, + api_key_from_config=self.config.api_key.get_secret_value() if self.config.api_key else None, provider_data_api_key_field="sambanova_api_key", ) @@ -246,6 +249,22 @@ class SambaNovaInferenceAdapter(LiteLLMOpenAIMixin): **get_sampling_options(request.sampling_params), } + async def register_model(self, model: Model) -> Model: + model_id = self.get_provider_model_id(model.provider_resource_id) + + list_models_url = self.config.url + "/models" + if len(self.environment_available_models) == 0: + try: + response = requests.get(list_models_url) + response.raise_for_status() + except requests.exceptions.RequestException as e: + raise RuntimeError(f"Request to {list_models_url} failed") from e + self.environment_available_models = 
[model.get("id") for model in response.json().get("data", {})] + + if model_id.split("sambanova/")[-1] not in self.environment_available_models: + logger.warning(f"Model {model_id} not available in {list_models_url}") + return model + async def initialize(self): await super().initialize() diff --git a/llama_stack/providers/remote/safety/sambanova/sambanova.py b/llama_stack/providers/remote/safety/sambanova/sambanova.py index 84c8267ae..1a65f6aa1 100644 --- a/llama_stack/providers/remote/safety/sambanova/sambanova.py +++ b/llama_stack/providers/remote/safety/sambanova/sambanova.py @@ -33,6 +33,7 @@ CANNED_RESPONSE_TEXT = "I can't answer that. Can I help with something else?" class SambaNovaSafetyAdapter(Safety, ShieldsProtocolPrivate, NeedsRequestProviderData): def __init__(self, config: SambaNovaSafetyConfig) -> None: self.config = config + self.environment_available_models = [] async def initialize(self) -> None: pass @@ -54,18 +55,18 @@ class SambaNovaSafetyAdapter(Safety, ShieldsProtocolPrivate, NeedsRequestProvide async def register_shield(self, shield: Shield) -> None: list_models_url = self.config.url + "/models" - try: - response = requests.get(list_models_url) - response.raise_for_status() - except requests.exceptions.RequestException as e: - raise RuntimeError(f"Request to {list_models_url} failed") from e - available_models = [model.get("id") for model in response.json().get("data", {})] + if len(self.environment_available_models) == 0: + try: + response = requests.get(list_models_url) + response.raise_for_status() + except requests.exceptions.RequestException as e: + raise RuntimeError(f"Request to {list_models_url} failed") from e + self.environment_available_models = [model.get("id") for model in response.json().get("data", {})] if ( - len(available_models) == 0 - or "guard" not in shield.provider_resource_id.lower() - or shield.provider_resource_id.split("sambanova/")[-1] not in available_models + "guard" not in shield.provider_resource_id.lower() + or shield.provider_resource_id.split("sambanova/")[-1] not in self.environment_available_models ): - raise ValueError(f"Shield {shield.provider_resource_id} not found in SambaNova") + logger.warning(f"Shield {shield.provider_resource_id} not available in {list_models_url}") async def run_shield( self, shield_id: str, messages: list[Message], params: dict[str, Any] | None = None diff --git a/tests/integration/inference/test_openai_completion.py b/tests/integration/inference/test_openai_completion.py index 05aee5096..e82714ffd 100644 --- a/tests/integration/inference/test_openai_completion.py +++ b/tests/integration/inference/test_openai_completion.py @@ -71,7 +71,6 @@ def skip_if_model_doesnt_support_openai_chat_completion(client_with_models, mode "remote::cerebras", "remote::databricks", "remote::runpod", - "remote::sambanova", "remote::tgi", ): pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI chat completions.") From 51d9fd48083de32e9796f00030b57be10a8ac7bd Mon Sep 17 00:00:00 2001 From: Ben Browning Date: Fri, 11 Jul 2025 16:38:27 -0400 Subject: [PATCH 5/8] fix: Don't cache clients for passthrough auth providers (#2728) # What does this PR do? Some of our inference providers support passthrough authentication via `x-llamastack-provider-data` header values. 
This fixes the providers that support passthrough auth to not cache their clients to the backend providers (mostly OpenAI client instances) so that the client connecting to Llama Stack has to provide those auth values on each and every request. ## Test Plan I added some unit tests to ensure we're not caching clients across requests for all the fixed providers in this PR. ``` uv run pytest -sv tests/unit/providers/inference/test_inference_client_caching.py ``` I also ran some of our OpenAI compatible API integration tests for each of the changed providers, just to ensure they still work. Note that these providers don't actually pass all these tests (for unrelated reasons due to quirks of the Groq and Together SaaS services), but enough of the tests passed to confirm the clients are still working as intended. ### Together ``` ENABLE_TOGETHER="together" \ uv run llama stack run llama_stack/templates/starter/run.yaml LLAMA_STACK_CONFIG=http://localhost:8321 \ uv run pytest -sv \ tests/integration/inference/test_openai_completion.py \ --text-model "together/meta-llama/Llama-3.1-8B-Instruct" ``` ### OpenAI ``` ENABLE_OPENAI="openai" \ uv run llama stack run llama_stack/templates/starter/run.yaml LLAMA_STACK_CONFIG=http://localhost:8321 \ uv run pytest -sv \ tests/integration/inference/test_openai_completion.py \ --text-model "openai/gpt-4o-mini" ``` ### Groq ``` ENABLE_GROQ="groq" \ uv run llama stack run llama_stack/templates/starter/run.yaml LLAMA_STACK_CONFIG=http://localhost:8321 \ uv run pytest -sv \ tests/integration/inference/test_openai_completion.py \ --text-model "groq/meta-llama/Llama-3.1-8B-Instruct" ``` --------- Signed-off-by: Ben Browning --- .../providers/remote/inference/groq/groq.py | 14 +-- .../remote/inference/openai/openai.py | 14 +-- .../remote/inference/together/together.py | 47 ++++------ pyproject.toml | 2 + .../test_inference_client_caching.py | 73 +++++++++++++++ uv.lock | 91 +++++++++++++++++++ 6 files changed, 196 insertions(+), 45 deletions(-) create mode 100644 tests/unit/providers/inference/test_inference_client_caching.py diff --git a/llama_stack/providers/remote/inference/groq/groq.py b/llama_stack/providers/remote/inference/groq/groq.py index 4b295e788..91c6b6c17 100644 --- a/llama_stack/providers/remote/inference/groq/groq.py +++ b/llama_stack/providers/remote/inference/groq/groq.py @@ -38,24 +38,18 @@ class GroqInferenceAdapter(LiteLLMOpenAIMixin): provider_data_api_key_field="groq_api_key", ) self.config = config - self._openai_client = None async def initialize(self): await super().initialize() async def shutdown(self): await super().shutdown() - if self._openai_client: - await self._openai_client.close() - self._openai_client = None def _get_openai_client(self) -> AsyncOpenAI: - if not self._openai_client: - self._openai_client = AsyncOpenAI( - base_url=f"{self.config.url}/openai/v1", - api_key=self.config.api_key, - ) - return self._openai_client + return AsyncOpenAI( + base_url=f"{self.config.url}/openai/v1", + api_key=self.get_api_key(), + ) async def openai_chat_completion( self, diff --git a/llama_stack/providers/remote/inference/openai/openai.py b/llama_stack/providers/remote/inference/openai/openai.py index 72428422f..818883919 100644 --- a/llama_stack/providers/remote/inference/openai/openai.py +++ b/llama_stack/providers/remote/inference/openai/openai.py @@ -59,9 +59,6 @@ class OpenAIInferenceAdapter(LiteLLMOpenAIMixin): # if we do not set this, users will be exposed to the # litellm specific model names, an abstraction leak. 
self.is_openai_compat = True - self._openai_client = AsyncOpenAI( - api_key=self.config.api_key, - ) async def initialize(self) -> None: await super().initialize() @@ -69,6 +66,11 @@ class OpenAIInferenceAdapter(LiteLLMOpenAIMixin): async def shutdown(self) -> None: await super().shutdown() + def _get_openai_client(self) -> AsyncOpenAI: + return AsyncOpenAI( + api_key=self.get_api_key(), + ) + async def openai_completion( self, model: str, @@ -120,7 +122,7 @@ class OpenAIInferenceAdapter(LiteLLMOpenAIMixin): user=user, suffix=suffix, ) - return await self._openai_client.completions.create(**params) + return await self._get_openai_client().completions.create(**params) async def openai_chat_completion( self, @@ -176,7 +178,7 @@ class OpenAIInferenceAdapter(LiteLLMOpenAIMixin): top_p=top_p, user=user, ) - return await self._openai_client.chat.completions.create(**params) + return await self._get_openai_client().chat.completions.create(**params) async def openai_embeddings( self, @@ -204,7 +206,7 @@ class OpenAIInferenceAdapter(LiteLLMOpenAIMixin): params["user"] = user # Call OpenAI embeddings API - response = await self._openai_client.embeddings.create(**params) + response = await self._get_openai_client().embeddings.create(**params) data = [] for i, embedding_data in enumerate(response.data): diff --git a/llama_stack/providers/remote/inference/together/together.py b/llama_stack/providers/remote/inference/together/together.py index 9e6877b7c..e1eb934c5 100644 --- a/llama_stack/providers/remote/inference/together/together.py +++ b/llama_stack/providers/remote/inference/together/together.py @@ -68,19 +68,12 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi def __init__(self, config: TogetherImplConfig) -> None: ModelRegistryHelper.__init__(self, MODEL_ENTRIES) self.config = config - self._client = None - self._openai_client = None async def initialize(self) -> None: pass async def shutdown(self) -> None: - if self._client: - # Together client has no close method, so just set to None - self._client = None - if self._openai_client: - await self._openai_client.close() - self._openai_client = None + pass async def completion( self, @@ -108,29 +101,25 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi return await self._nonstream_completion(request) def _get_client(self) -> AsyncTogether: - if not self._client: - together_api_key = None - config_api_key = self.config.api_key.get_secret_value() if self.config.api_key else None - if config_api_key: - together_api_key = config_api_key - else: - provider_data = self.get_request_provider_data() - if provider_data is None or not provider_data.together_api_key: - raise ValueError( - 'Pass Together API Key in the header X-LlamaStack-Provider-Data as { "together_api_key": }' - ) - together_api_key = provider_data.together_api_key - self._client = AsyncTogether(api_key=together_api_key) - return self._client + together_api_key = None + config_api_key = self.config.api_key.get_secret_value() if self.config.api_key else None + if config_api_key: + together_api_key = config_api_key + else: + provider_data = self.get_request_provider_data() + if provider_data is None or not provider_data.together_api_key: + raise ValueError( + 'Pass Together API Key in the header X-LlamaStack-Provider-Data as { "together_api_key": }' + ) + together_api_key = provider_data.together_api_key + return AsyncTogether(api_key=together_api_key) def _get_openai_client(self) -> AsyncOpenAI: - if not self._openai_client: - 
together_client = self._get_client().client - self._openai_client = AsyncOpenAI( - base_url=together_client.base_url, - api_key=together_client.api_key, - ) - return self._openai_client + together_client = self._get_client().client + return AsyncOpenAI( + base_url=together_client.base_url, + api_key=together_client.api_key, + ) async def _nonstream_completion(self, request: CompletionRequest) -> ChatCompletionResponse: params = await self._get_params(request) diff --git a/pyproject.toml b/pyproject.toml index f4115d028..9977d7372 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -87,6 +87,8 @@ unit = [ "blobfile", "faiss-cpu", "pymilvus>=2.5.12", + "litellm", + "together", ] # These are the core dependencies required for running integration tests. They are shared across all # providers. If a provider requires additional dependencies, please add them to your environment diff --git a/tests/unit/providers/inference/test_inference_client_caching.py b/tests/unit/providers/inference/test_inference_client_caching.py new file mode 100644 index 000000000..c9a931d47 --- /dev/null +++ b/tests/unit/providers/inference/test_inference_client_caching.py @@ -0,0 +1,73 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import json +from unittest.mock import MagicMock + +from llama_stack.distribution.request_headers import request_provider_data_context +from llama_stack.providers.remote.inference.groq.config import GroqConfig +from llama_stack.providers.remote.inference.groq.groq import GroqInferenceAdapter +from llama_stack.providers.remote.inference.openai.config import OpenAIConfig +from llama_stack.providers.remote.inference.openai.openai import OpenAIInferenceAdapter +from llama_stack.providers.remote.inference.together.config import TogetherImplConfig +from llama_stack.providers.remote.inference.together.together import TogetherInferenceAdapter + + +def test_groq_provider_openai_client_caching(): + """Ensure the Groq provider does not cache api keys across client requests""" + + config = GroqConfig() + inference_adapter = GroqInferenceAdapter(config) + + inference_adapter.__provider_spec__ = MagicMock() + inference_adapter.__provider_spec__.provider_data_validator = ( + "llama_stack.providers.remote.inference.groq.config.GroqProviderDataValidator" + ) + + for api_key in ["test1", "test2"]: + with request_provider_data_context( + {"x-llamastack-provider-data": json.dumps({inference_adapter.provider_data_api_key_field: api_key})} + ): + openai_client = inference_adapter._get_openai_client() + assert openai_client.api_key == api_key + + +def test_openai_provider_openai_client_caching(): + """Ensure the OpenAI provider does not cache api keys across client requests""" + + config = OpenAIConfig() + inference_adapter = OpenAIInferenceAdapter(config) + + inference_adapter.__provider_spec__ = MagicMock() + inference_adapter.__provider_spec__.provider_data_validator = ( + "llama_stack.providers.remote.inference.openai.config.OpenAIProviderDataValidator" + ) + + for api_key in ["test1", "test2"]: + with request_provider_data_context( + {"x-llamastack-provider-data": json.dumps({inference_adapter.provider_data_api_key_field: api_key})} + ): + openai_client = inference_adapter._get_openai_client() + assert openai_client.api_key == api_key + + +def test_together_provider_openai_client_caching(): + """Ensure the Together provider does not cache api keys 
across client requests""" + + config = TogetherImplConfig() + inference_adapter = TogetherInferenceAdapter(config) + + inference_adapter.__provider_spec__ = MagicMock() + inference_adapter.__provider_spec__.provider_data_validator = ( + "llama_stack.providers.remote.inference.together.TogetherProviderDataValidator" + ) + + for api_key in ["test1", "test2"]: + with request_provider_data_context({"x-llamastack-provider-data": json.dumps({"together_api_key": api_key})}): + together_client = inference_adapter._get_client() + assert together_client.client.api_key == api_key + openai_client = inference_adapter._get_openai_client() + assert openai_client.api_key == api_key diff --git a/uv.lock b/uv.lock index fe50f88aa..bca12fc51 100644 --- a/uv.lock +++ b/uv.lock @@ -615,6 +615,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cb/a3/460c57f094a4a165c84a1341c373b0a4f5ec6ac244b998d5021aade89b77/ecdsa-0.19.1-py2.py3-none-any.whl", hash = "sha256:30638e27cf77b7e15c4c4cc1973720149e1033827cfd00661ca5c8cc0cdb24c3", size = 150607, upload-time = "2025-03-13T11:52:41.757Z" }, ] +[[package]] +name = "eval-type-backport" +version = "0.2.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/30/ea/8b0ac4469d4c347c6a385ff09dc3c048c2d021696664e26c7ee6791631b5/eval_type_backport-0.2.2.tar.gz", hash = "sha256:f0576b4cf01ebb5bd358d02314d31846af5e07678387486e2c798af0e7d849c1", size = 9079, upload-time = "2024-12-21T20:09:46.005Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ce/31/55cd413eaccd39125368be33c46de24a1f639f2e12349b0361b4678f3915/eval_type_backport-0.2.2-py3-none-any.whl", hash = "sha256:cb6ad7c393517f476f96d456d0412ea80f0a8cf96f6892834cd9340149111b0a", size = 5830, upload-time = "2024-12-21T20:09:44.175Z" }, +] + [[package]] name = "executing" version = "2.2.0" @@ -1238,6 +1247,28 @@ version = "1.4" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/65/c6/246100fa3967074d9725b3716913bd495823547bde5047050d4c3462f994/linkify-1.4.tar.gz", hash = "sha256:9ba276ba179525f7262820d90f009604e51cd4f1466c1112b882ef7eda243d5e", size = 1749, upload-time = "2009-11-12T21:42:00.934Z" } +[[package]] +name = "litellm" +version = "1.74.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohttp" }, + { name = "click" }, + { name = "httpx" }, + { name = "importlib-metadata" }, + { name = "jinja2" }, + { name = "jsonschema" }, + { name = "openai" }, + { name = "pydantic" }, + { name = "python-dotenv" }, + { name = "tiktoken" }, + { name = "tokenizers" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/38/10/63cdae1b1d581ad1db51153dfd06c4e18394a3ba8de495f73f2d797ece3b/litellm-1.74.2.tar.gz", hash = "sha256:cbacffe93976c60ca674fec0a942c70b900b4ad1c8069395174049a162f255bf", size = 9230641, upload-time = "2025-07-11T03:31:07.925Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/f7/67689245f48b9e79bcd2f3a10a3690cb1918fb99fffd5a623ed2496bca66/litellm-1.74.2-py3-none-any.whl", hash = "sha256:29bb555b45128e4cc696e72921a6ec24e97b14e9b69e86eed6f155124ad629b1", size = 8587065, upload-time = "2025-07-11T03:31:05.598Z" }, +] + [[package]] name = "llama-stack" version = "0.2.14" @@ -1341,6 +1372,7 @@ unit = [ { name = "blobfile" }, { name = "chardet" }, { name = "faiss-cpu" }, + { name = "litellm" }, { name = "mcp" }, { name = "openai" }, { name = "pymilvus" }, @@ -1348,6 +1380,7 @@ unit = [ { name = "qdrant-client" }, { name 
= "sqlalchemy", extra = ["asyncio"] }, { name = "sqlite-vec" }, + { name = "together" }, ] [package.metadata] @@ -1446,6 +1479,7 @@ unit = [ { name = "blobfile" }, { name = "chardet" }, { name = "faiss-cpu" }, + { name = "litellm" }, { name = "mcp" }, { name = "openai" }, { name = "pymilvus", specifier = ">=2.5.12" }, @@ -1454,6 +1488,7 @@ unit = [ { name = "sqlalchemy" }, { name = "sqlalchemy", extras = ["asyncio"], specifier = ">=2.0.41" }, { name = "sqlite-vec" }, + { name = "together" }, ] [[package]] @@ -2952,6 +2987,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/58/29/93c53c098d301132196c3238c312825324740851d77a8500a2462c0fd888/setuptools-80.8.0-py3-none-any.whl", hash = "sha256:95a60484590d24103af13b686121328cc2736bee85de8936383111e421b9edc0", size = 1201470, upload-time = "2025-05-20T14:02:51.348Z" }, ] +[[package]] +name = "shellingham" +version = "1.5.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/58/15/8b3609fd3830ef7b27b655beb4b4e9c62313a4e8da8c676e142cc210d58e/shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de", size = 10310, upload-time = "2023-10-24T04:13:40.426Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload-time = "2023-10-24T04:13:38.866Z" }, +] + [[package]] name = "six" version = "1.17.0" @@ -3384,6 +3428,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b2/fe/81695a1aa331a842b582453b605175f419fe8540355886031328089d840a/sympy-1.13.1-py3-none-any.whl", hash = "sha256:db36cdc64bf61b9b24578b6f7bab1ecdd2452cf008f34faa33776680c26d66f8", size = 6189177, upload-time = "2024-07-19T09:26:48.863Z" }, ] +[[package]] +name = "tabulate" +version = "0.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c", size = 81090, upload-time = "2022-10-06T17:21:48.54Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" }, +] + [[package]] name = "tenacity" version = "9.1.2" @@ -3426,6 +3479,29 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/de/a8/8f499c179ec900783ffe133e9aab10044481679bb9aad78436d239eee716/tiktoken-0.9.0-cp313-cp313-win_amd64.whl", hash = "sha256:5ea0edb6f83dc56d794723286215918c1cde03712cbbafa0348b33448faf5b95", size = 894669, upload-time = "2025-02-14T06:02:47.341Z" }, ] +[[package]] +name = "together" +version = "1.5.21" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohttp" }, + { name = "click" }, + { name = "eval-type-backport" }, + { name = "filelock" }, + { name = "numpy" }, + { name = "pillow" }, + { name = "pydantic" }, + { name = "requests" }, + { name = "rich" }, + { name = "tabulate" }, + { name = "tqdm" }, + { name = "typer" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/ea/53/e33c5e6d53c2e2bbd07f9dcb1979e27ac670fca0e4e238b169aa4c358ee2/together-1.5.21.tar.gz", hash = "sha256:59adb8cf4c5b77eca76b8c66a73c47c45fd828aaf4f059f33f893f8c5f68f85a", size = 69887, upload-time = "2025-07-10T21:04:43.781Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fc/31/6556a303ea39929fa016f4260eef289b620cf366a576c304507cb75b4d12/together-1.5.21-py3-none-any.whl", hash = "sha256:35e6c0072033a2e5f1105de8781e969f41cffc85dae508b6f4dc293360026872", size = 96141, upload-time = "2025-07-10T21:04:42.418Z" }, +] + [[package]] name = "tokenizers" version = "0.21.1" @@ -3644,6 +3720,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/aa/22/733a6fc4a6445d835242f64c490fdd30f4a08d58f2b788613de3f9170692/transformers-4.50.3-py3-none-any.whl", hash = "sha256:6111610a43dec24ef32c3df0632c6b25b07d9711c01d9e1077bdd2ff6b14a38c", size = 10180411, upload-time = "2025-03-28T18:20:59.265Z" }, ] +[[package]] +name = "typer" +version = "0.15.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "rich" }, + { name = "shellingham" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6c/89/c527e6c848739be8ceb5c44eb8208c52ea3515c6cf6406aa61932887bf58/typer-0.15.4.tar.gz", hash = "sha256:89507b104f9b6a0730354f27c39fae5b63ccd0c95b1ce1f1a6ba0cfd329997c3", size = 101559, upload-time = "2025-05-14T16:34:57.704Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c9/62/d4ba7afe2096d5659ec3db8b15d8665bdcb92a3c6ff0b95e99895b335a9c/typer-0.15.4-py3-none-any.whl", hash = "sha256:eb0651654dcdea706780c466cf06d8f174405a659ffff8f163cfbfee98c0e173", size = 45258, upload-time = "2025-05-14T16:34:55.583Z" }, +] + [[package]] name = "types-requests" version = "2.32.0.20241016" From 8374d4cefd3e59c4dad68ec2842622b5f4154fd5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 12 Jul 2025 16:23:42 -0400 Subject: [PATCH 6/8] chore(github-deps): bump medyagh/setup-minikube from 0.0.19 to 0.0.20 (#2738) --- .github/workflows/integration-auth-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration-auth-tests.yml b/.github/workflows/integration-auth-tests.yml index 7822e4216..cf10e005c 100644 --- a/.github/workflows/integration-auth-tests.yml +++ b/.github/workflows/integration-auth-tests.yml @@ -35,7 +35,7 @@ jobs: - name: Install minikube if: ${{ matrix.auth-provider == 'kubernetes' }} - uses: medyagh/setup-minikube@cea33675329b799adccc9526aa5daccc26cd5052 # v0.0.19 + uses: medyagh/setup-minikube@e3c7f79eb1e997eabccc536a6cf318a2b0fe19d9 # v0.0.20 - name: Start minikube if: ${{ matrix.auth-provider == 'oauth2_token' }} From 68e7978c8890fd0aec901e0871fb92061a0d8fa5 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Sat, 12 Jul 2025 19:53:54 -0400 Subject: [PATCH 7/8] chore: block network access from unit tests (#2732) # What does this PR do? this blocks network access for all `tests/unit/` tests. `tests/integration/` are untouched. it also introduces an `allow_network` marker to explicitly allow network access. 
## Test Plan `./scripts/unit-tests.sh` --- pyproject.toml | 4 ++++ tests/unit/conftest.py | 11 +++++++++++ tests/unit/providers/inference/test_remote_vllm.py | 1 + tests/unit/rag/test_vector_store.py | 2 ++ uv.lock | 14 ++++++++++++++ 5 files changed, 32 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 9977d7372..2974ff996 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,6 +64,7 @@ dev = [ "pytest-cov", "pytest-html", "pytest-json-report", + "pytest-socket", # For blocking network access in unit tests "nbval", # For notebook testing "black", "ruff", @@ -344,3 +345,6 @@ classmethod-decorators = ["classmethod", "pydantic.field_validator"] [tool.pytest.ini_options] asyncio_mode = "auto" +markers = [ + "allow_network: Allow network access for specific unit tests", +] diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index aedac0386..b5eb1217d 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -4,6 +4,17 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +import pytest_socket + # We need to import the fixtures here so that pytest can find them # but ruff doesn't think they are used and removes the import. "noqa: F401" prevents them from being removed from .fixtures import cached_disk_dist_registry, disk_dist_registry, sqlite_kvstore # noqa: F401 + + +def pytest_runtest_setup(item): + """Setup for each test - check if network access should be allowed.""" + if "allow_network" in item.keywords: + pytest_socket.enable_socket() + else: + # Allowing Unix sockets is necessary for some tests that use local servers and mocks + pytest_socket.disable_socket(allow_unix_socket=True) diff --git a/tests/unit/providers/inference/test_remote_vllm.py b/tests/unit/providers/inference/test_remote_vllm.py index ca44cc95d..5c2ad03ab 100644 --- a/tests/unit/providers/inference/test_remote_vllm.py +++ b/tests/unit/providers/inference/test_remote_vllm.py @@ -393,6 +393,7 @@ async def test_process_vllm_chat_completion_stream_response_no_choices(): assert chunks[0].event.event_type.value == "start" +@pytest.mark.allow_network def test_chat_completion_doesnt_block_event_loop(caplog): loop = asyncio.new_event_loop() loop.set_debug(True) diff --git a/tests/unit/rag/test_vector_store.py b/tests/unit/rag/test_vector_store.py index dd36d3992..919f97ba7 100644 --- a/tests/unit/rag/test_vector_store.py +++ b/tests/unit/rag/test_vector_store.py @@ -123,6 +123,7 @@ class TestVectorStore: content = await content_from_doc(doc) assert content in DUMMY_PDF_TEXT_CHOICES + @pytest.mark.allow_network async def test_downloads_pdf_and_returns_content(self): # Using GitHub to host the PDF file url = "https://raw.githubusercontent.com/meta-llama/llama-stack/da035d69cfca915318eaf485770a467ca3c2a238/llama_stack/providers/tests/memory/fixtures/dummy.pdf" @@ -135,6 +136,7 @@ class TestVectorStore: content = await content_from_doc(doc) assert content in DUMMY_PDF_TEXT_CHOICES + @pytest.mark.allow_network async def test_downloads_pdf_and_returns_content_with_url_object(self): # Using GitHub to host the PDF file url = "https://raw.githubusercontent.com/meta-llama/llama-stack/da035d69cfca915318eaf485770a467ca3c2a238/llama_stack/providers/tests/memory/fixtures/dummy.pdf" diff --git a/uv.lock b/uv.lock index bca12fc51..83e502e7f 100644 --- a/uv.lock +++ b/uv.lock @@ -1324,6 +1324,7 @@ dev = [ { name = "pytest-cov" }, { name = "pytest-html" }, { name = "pytest-json-report" }, + { name = "pytest-socket" }, { name = 
"pytest-timeout" }, { name = "ruamel-yaml" }, { name = "ruff" }, @@ -1432,6 +1433,7 @@ dev = [ { name = "pytest-cov" }, { name = "pytest-html" }, { name = "pytest-json-report" }, + { name = "pytest-socket" }, { name = "pytest-timeout" }, { name = "ruamel-yaml" }, { name = "ruff" }, @@ -2545,6 +2547,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3e/43/7e7b2ec865caa92f67b8f0e9231a798d102724ca4c0e1f414316be1c1ef2/pytest_metadata-3.1.1-py3-none-any.whl", hash = "sha256:c8e0844db684ee1c798cfa38908d20d67d0463ecb6137c72e91f418558dd5f4b", size = 11428, upload-time = "2024-02-12T19:38:42.531Z" }, ] +[[package]] +name = "pytest-socket" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/05/ff/90c7e1e746baf3d62ce864c479fd53410b534818b9437413903596f81580/pytest_socket-0.7.0.tar.gz", hash = "sha256:71ab048cbbcb085c15a4423b73b619a8b35d6a307f46f78ea46be51b1b7e11b3", size = 12389, upload-time = "2024-01-28T20:17:23.177Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/19/58/5d14cb5cb59409e491ebe816c47bf81423cd03098ea92281336320ae5681/pytest_socket-0.7.0-py3-none-any.whl", hash = "sha256:7e0f4642177d55d317bbd58fc68c6bd9048d6eadb2d46a89307fa9221336ce45", size = 6754, upload-time = "2024-01-28T20:17:22.105Z" }, +] + [[package]] name = "pytest-timeout" version = "2.4.0" From 958fc92b1bc99ba8e57e0819696f74a7e09f45f0 Mon Sep 17 00:00:00 2001 From: Francisco Arceo Date: Sun, 13 Jul 2025 04:03:55 -0400 Subject: [PATCH 8/8] feat: Add Vector stores UI (#2737) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? - Adds two pages to UI - Vector stores - Vector store detail view - Fixed darkmode navbar highlighting - Updated darkmode font color - Updated llama-stack-client package Screenshot 2025-07-12 at 11 34
35 PM (screenshot)
Screenshot 2025-07-12 at 11 57
09 PM ## Test Plan --------- Signed-off-by: Francisco Javier Arceo --- .../ui/app/logs/vector-stores/[id]/page.tsx | 82 +++ .../ui/app/logs/vector-stores/layout.tsx | 16 + .../ui/app/logs/vector-stores/page.tsx | 121 +++++ .../ui/components/layout/app-sidebar.tsx | 16 +- .../ui/components/layout/detail-layout.tsx | 8 +- .../ui/components/ui/message-components.tsx | 4 +- .../vector-stores/vector-store-detail.tsx | 128 +++++ llama_stack/ui/package-lock.json | 474 +----------------- llama_stack/ui/package.json | 2 +- 9 files changed, 378 insertions(+), 473 deletions(-) create mode 100644 llama_stack/ui/app/logs/vector-stores/[id]/page.tsx create mode 100644 llama_stack/ui/app/logs/vector-stores/layout.tsx create mode 100644 llama_stack/ui/app/logs/vector-stores/page.tsx create mode 100644 llama_stack/ui/components/vector-stores/vector-store-detail.tsx diff --git a/llama_stack/ui/app/logs/vector-stores/[id]/page.tsx b/llama_stack/ui/app/logs/vector-stores/[id]/page.tsx new file mode 100644 index 000000000..f27c9d802 --- /dev/null +++ b/llama_stack/ui/app/logs/vector-stores/[id]/page.tsx @@ -0,0 +1,82 @@ +"use client"; + +import { useEffect, useState } from "react"; +import { useParams, useRouter } from "next/navigation"; +import { useAuthClient } from "@/hooks/use-auth-client"; +import type { VectorStore } from "llama-stack-client/resources/vector-stores/vector-stores"; +import type { VectorStoreFile } from "llama-stack-client/resources/vector-stores/files"; +import { VectorStoreDetailView } from "@/components/vector-stores/vector-store-detail"; + +export default function VectorStoreDetailPage() { + const params = useParams(); + const id = params.id as string; + const client = useAuthClient(); + const router = useRouter(); + + const [store, setStore] = useState(null); + const [files, setFiles] = useState([]); + const [isLoadingStore, setIsLoadingStore] = useState(true); + const [isLoadingFiles, setIsLoadingFiles] = useState(true); + const [errorStore, setErrorStore] = useState(null); + const [errorFiles, setErrorFiles] = useState(null); + + useEffect(() => { + if (!id) { + setErrorStore(new Error("Vector Store ID is missing.")); + setIsLoadingStore(false); + return; + } + const fetchStore = async () => { + setIsLoadingStore(true); + setErrorStore(null); + try { + const response = await client.vectorStores.retrieve(id); + setStore(response as VectorStore); + } catch (err) { + setErrorStore( + err instanceof Error + ? err + : new Error("Failed to load vector store."), + ); + } finally { + setIsLoadingStore(false); + } + }; + fetchStore(); + }, [id, client]); + + useEffect(() => { + if (!id) { + setErrorFiles(new Error("Vector Store ID is missing.")); + setIsLoadingFiles(false); + return; + } + const fetchFiles = async () => { + setIsLoadingFiles(true); + setErrorFiles(null); + try { + const result = await client.vectorStores.files.list(id as any); + setFiles((result as any).data); + } catch (err) { + setErrorFiles( + err instanceof Error ? 
err : new Error("Failed to load files."), + ); + } finally { + setIsLoadingFiles(false); + } + }; + fetchFiles(); + }, [id]); + + return ( + + ); +} diff --git a/llama_stack/ui/app/logs/vector-stores/layout.tsx b/llama_stack/ui/app/logs/vector-stores/layout.tsx new file mode 100644 index 000000000..9245f5486 --- /dev/null +++ b/llama_stack/ui/app/logs/vector-stores/layout.tsx @@ -0,0 +1,16 @@ +"use client"; + +import React from "react"; +import LogsLayout from "@/components/layout/logs-layout"; + +export default function VectorStoresLayout({ + children, +}: { + children: React.ReactNode; +}) { + return ( + + {children} + + ); +} diff --git a/llama_stack/ui/app/logs/vector-stores/page.tsx b/llama_stack/ui/app/logs/vector-stores/page.tsx new file mode 100644 index 000000000..29e1fabd6 --- /dev/null +++ b/llama_stack/ui/app/logs/vector-stores/page.tsx @@ -0,0 +1,121 @@ +"use client"; + +import React from "react"; +import { useAuthClient } from "@/hooks/use-auth-client"; +import type { + ListVectorStoresResponse, + VectorStore, +} from "llama-stack-client/resources/vector-stores/vector-stores"; +import { useRouter } from "next/navigation"; +import { usePagination } from "@/hooks/use-pagination"; +import { + Table, + TableBody, + TableCaption, + TableCell, + TableHead, + TableHeader, + TableRow, +} from "@/components/ui/table"; +import { Skeleton } from "@/components/ui/skeleton"; + +export default function VectorStoresPage() { + const client = useAuthClient(); + const router = useRouter(); + const { + data: stores, + status, + hasMore, + error, + loadMore, + } = usePagination({ + limit: 20, + order: "desc", + fetchFunction: async (client, params) => { + const response = await client.vectorStores.list({ + after: params.after, + limit: params.limit, + order: params.order, + } as any); + return response as ListVectorStoresResponse; + }, + errorMessagePrefix: "vector stores", + }); + + // Auto-load all pages for infinite scroll behavior (like Responses) + React.useEffect(() => { + if (status === "idle" && hasMore) { + loadMore(); + } + }, [status, hasMore, loadMore]); + + if (status === "loading") { + return ( +
+ + + +
+ ); + } + + if (status === "error") { + return
Error: {error?.message}
; + } + + if (!stores || stores.length === 0) { + return

No vector stores found.

; + } + + return ( +
+ + + + ID + Name + Created + Completed + Cancelled + Failed + In Progress + Total + Usage Bytes + Provider ID + Provider Vector DB ID + + + + {stores.map((store) => { + const fileCounts = store.file_counts; + const metadata = store.metadata || {}; + const providerId = metadata.provider_id ?? ""; + const providerDbId = metadata.provider_vector_db_id ?? ""; + + return ( + router.push(`/logs/vector-stores/${store.id}`)} + className="cursor-pointer hover:bg-muted/50" + > + {store.id} + {store.name} + + {new Date(store.created_at * 1000).toLocaleString()} + + {fileCounts.completed} + {fileCounts.cancelled} + {fileCounts.failed} + {fileCounts.in_progress} + {fileCounts.total} + {store.usage_bytes} + {providerId} + {providerDbId} + + ); + })} + +
+
+ ); +} diff --git a/llama_stack/ui/components/layout/app-sidebar.tsx b/llama_stack/ui/components/layout/app-sidebar.tsx index 1c53d6cc5..532e43dbd 100644 --- a/llama_stack/ui/components/layout/app-sidebar.tsx +++ b/llama_stack/ui/components/layout/app-sidebar.tsx @@ -1,6 +1,11 @@ "use client"; -import { MessageSquareText, MessagesSquare, MoveUpRight } from "lucide-react"; +import { + MessageSquareText, + MessagesSquare, + MoveUpRight, + Database, +} from "lucide-react"; import Link from "next/link"; import { usePathname } from "next/navigation"; import { cn } from "@/lib/utils"; @@ -28,6 +33,11 @@ const logItems = [ url: "/logs/responses", icon: MessagesSquare, }, + { + title: "Vector Stores", + url: "/logs/vector-stores", + icon: Database, + }, { title: "Documentation", url: "https://llama-stack.readthedocs.io/en/latest/references/api_reference/index.html", @@ -57,13 +67,13 @@ export function AppSidebar() { className={cn( "justify-start", isActive && - "bg-gray-200 hover:bg-gray-200 text-primary hover:text-primary", + "bg-gray-200 dark:bg-gray-700 hover:bg-gray-200 dark:hover:bg-gray-700 text-gray-900 dark:text-gray-100", )} > diff --git a/llama_stack/ui/components/layout/detail-layout.tsx b/llama_stack/ui/components/layout/detail-layout.tsx index 58b912703..3013195a2 100644 --- a/llama_stack/ui/components/layout/detail-layout.tsx +++ b/llama_stack/ui/components/layout/detail-layout.tsx @@ -93,7 +93,9 @@ export function PropertyItem({ > {label}:{" "} {typeof value === "string" || typeof value === "number" ? ( - {value} + + {value} + ) : ( value )} @@ -112,7 +114,9 @@ export function PropertiesCard({ children }: PropertiesCardProps) { Properties -
    {children}
+
    + {children} +
); diff --git a/llama_stack/ui/components/ui/message-components.tsx b/llama_stack/ui/components/ui/message-components.tsx index 50ccd623e..39cb570b7 100644 --- a/llama_stack/ui/components/ui/message-components.tsx +++ b/llama_stack/ui/components/ui/message-components.tsx @@ -17,10 +17,10 @@ export const MessageBlock: React.FC = ({ }) => { return (
-

+

{label} {labelDetail && ( - + {labelDetail} )} diff --git a/llama_stack/ui/components/vector-stores/vector-store-detail.tsx b/llama_stack/ui/components/vector-stores/vector-store-detail.tsx new file mode 100644 index 000000000..7c5c91dd3 --- /dev/null +++ b/llama_stack/ui/components/vector-stores/vector-store-detail.tsx @@ -0,0 +1,128 @@ +"use client"; + +import type { VectorStore } from "llama-stack-client/resources/vector-stores/vector-stores"; +import type { VectorStoreFile } from "llama-stack-client/resources/vector-stores/files"; +import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"; +import { Skeleton } from "@/components/ui/skeleton"; +import { + DetailLoadingView, + DetailErrorView, + DetailNotFoundView, + DetailLayout, + PropertiesCard, + PropertyItem, +} from "@/components/layout/detail-layout"; +import { + Table, + TableBody, + TableCaption, + TableCell, + TableHead, + TableHeader, + TableRow, +} from "@/components/ui/table"; + +interface VectorStoreDetailViewProps { + store: VectorStore | null; + files: VectorStoreFile[]; + isLoadingStore: boolean; + isLoadingFiles: boolean; + errorStore: Error | null; + errorFiles: Error | null; + id: string; +} + +export function VectorStoreDetailView({ + store, + files, + isLoadingStore, + isLoadingFiles, + errorStore, + errorFiles, + id, +}: VectorStoreDetailViewProps) { + const title = "Vector Store Details"; + + if (errorStore) { + return ; + } + if (isLoadingStore) { + return ; + } + if (!store) { + return ; + } + + const mainContent = ( + <> + + + Files + + + {isLoadingFiles ? ( + + ) : errorFiles ? ( +

+ Error loading files: {errorFiles.message} +
+ ) : files.length > 0 ? ( + + Files in this vector store + + + ID + Status + Created + Usage Bytes + + + + {files.map((file) => ( + + {file.id} + {file.status} + + {new Date(file.created_at * 1000).toLocaleString()} + + {file.usage_bytes} + + ))} + +
+ ) : ( +

+ No files in this vector store. +

+ )} + + + + ); + + const sidebar = ( + + + + + + + + + + + ); + + return ( + + ); +} diff --git a/llama_stack/ui/package-lock.json b/llama_stack/ui/package-lock.json index 8fd5fb56c..158569241 100644 --- a/llama_stack/ui/package-lock.json +++ b/llama_stack/ui/package-lock.json @@ -15,7 +15,7 @@ "@radix-ui/react-tooltip": "^1.2.6", "class-variance-authority": "^0.7.1", "clsx": "^2.1.1", - "llama-stack-client": "0.2.13", + "llama-stack-client": "^0.2.14", "lucide-react": "^0.510.0", "next": "15.3.3", "next-auth": "^4.24.11", @@ -676,406 +676,6 @@ "tslib": "^2.4.0" } }, - "node_modules/@esbuild/aix-ppc64": { - "version": "0.25.5", - "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.25.5.tgz", - "integrity": "sha512-9o3TMmpmftaCMepOdA5k/yDw8SfInyzWWTjYTFCX3kPSDJMROQTb8jg+h9Cnwnmm1vOzvxN7gIfB5V2ewpjtGA==", - "cpu": [ - "ppc64" - ], - "license": "MIT", - "optional": true, - "os": [ - "aix" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/android-arm": { - "version": "0.25.5", - "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.25.5.tgz", - "integrity": "sha512-AdJKSPeEHgi7/ZhuIPtcQKr5RQdo6OO2IL87JkianiMYMPbCtot9fxPbrMiBADOWWm3T2si9stAiVsGbTQFkbA==", - "cpu": [ - "arm" - ], - "license": "MIT", - "optional": true, - "os": [ - "android" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/android-arm64": { - "version": "0.25.5", - "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.25.5.tgz", - "integrity": "sha512-VGzGhj4lJO+TVGV1v8ntCZWJktV7SGCs3Pn1GRWI1SBFtRALoomm8k5E9Pmwg3HOAal2VDc2F9+PM/rEY6oIDg==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "android" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/android-x64": { - "version": "0.25.5", - "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.25.5.tgz", - "integrity": "sha512-D2GyJT1kjvO//drbRT3Hib9XPwQeWd9vZoBJn+bu/lVsOZ13cqNdDeqIF/xQ5/VmWvMduP6AmXvylO/PIc2isw==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "android" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/darwin-arm64": { - "version": "0.25.5", - "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.25.5.tgz", - "integrity": "sha512-GtaBgammVvdF7aPIgH2jxMDdivezgFu6iKpmT+48+F8Hhg5J/sfnDieg0aeG/jfSvkYQU2/pceFPDKlqZzwnfQ==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "darwin" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/darwin-x64": { - "version": "0.25.5", - "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.25.5.tgz", - "integrity": "sha512-1iT4FVL0dJ76/q1wd7XDsXrSW+oLoquptvh4CLR4kITDtqi2e/xwXwdCVH8hVHU43wgJdsq7Gxuzcs6Iq/7bxQ==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "darwin" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/freebsd-arm64": { - "version": "0.25.5", - "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.25.5.tgz", - "integrity": "sha512-nk4tGP3JThz4La38Uy/gzyXtpkPW8zSAmoUhK9xKKXdBCzKODMc2adkB2+8om9BDYugz+uGV7sLmpTYzvmz6Sw==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "freebsd" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/freebsd-x64": { - "version": "0.25.5", - "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.25.5.tgz", - "integrity": 
"sha512-PrikaNjiXdR2laW6OIjlbeuCPrPaAl0IwPIaRv+SMV8CiM8i2LqVUHFC1+8eORgWyY7yhQY+2U2fA55mBzReaw==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "freebsd" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-arm": { - "version": "0.25.5", - "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.25.5.tgz", - "integrity": "sha512-cPzojwW2okgh7ZlRpcBEtsX7WBuqbLrNXqLU89GxWbNt6uIg78ET82qifUy3W6OVww6ZWobWub5oqZOVtwolfw==", - "cpu": [ - "arm" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-arm64": { - "version": "0.25.5", - "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.25.5.tgz", - "integrity": "sha512-Z9kfb1v6ZlGbWj8EJk9T6czVEjjq2ntSYLY2cw6pAZl4oKtfgQuS4HOq41M/BcoLPzrUbNd+R4BXFyH//nHxVg==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-ia32": { - "version": "0.25.5", - "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.25.5.tgz", - "integrity": "sha512-sQ7l00M8bSv36GLV95BVAdhJ2QsIbCuCjh/uYrWiMQSUuV+LpXwIqhgJDcvMTj+VsQmqAHL2yYaasENvJ7CDKA==", - "cpu": [ - "ia32" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-loong64": { - "version": "0.25.5", - "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.25.5.tgz", - "integrity": "sha512-0ur7ae16hDUC4OL5iEnDb0tZHDxYmuQyhKhsPBV8f99f6Z9KQM02g33f93rNH5A30agMS46u2HP6qTdEt6Q1kg==", - "cpu": [ - "loong64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-mips64el": { - "version": "0.25.5", - "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.25.5.tgz", - "integrity": "sha512-kB/66P1OsHO5zLz0i6X0RxlQ+3cu0mkxS3TKFvkb5lin6uwZ/ttOkP3Z8lfR9mJOBk14ZwZ9182SIIWFGNmqmg==", - "cpu": [ - "mips64el" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-ppc64": { - "version": "0.25.5", - "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.25.5.tgz", - "integrity": "sha512-UZCmJ7r9X2fe2D6jBmkLBMQetXPXIsZjQJCjgwpVDz+YMcS6oFR27alkgGv3Oqkv07bxdvw7fyB71/olceJhkQ==", - "cpu": [ - "ppc64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-riscv64": { - "version": "0.25.5", - "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.25.5.tgz", - "integrity": "sha512-kTxwu4mLyeOlsVIFPfQo+fQJAV9mh24xL+y+Bm6ej067sYANjyEw1dNHmvoqxJUCMnkBdKpvOn0Ahql6+4VyeA==", - "cpu": [ - "riscv64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-s390x": { - "version": "0.25.5", - "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.25.5.tgz", - "integrity": "sha512-K2dSKTKfmdh78uJ3NcWFiqyRrimfdinS5ErLSn3vluHNeHVnBAFWC8a4X5N+7FgVE1EjXS1QDZbpqZBjfrqMTQ==", - "cpu": [ - "s390x" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-x64": { - "version": "0.25.5", - "resolved": 
"https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.25.5.tgz", - "integrity": "sha512-uhj8N2obKTE6pSZ+aMUbqq+1nXxNjZIIjCjGLfsWvVpy7gKCOL6rsY1MhRh9zLtUtAI7vpgLMK6DxjO8Qm9lJw==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/netbsd-arm64": { - "version": "0.25.5", - "resolved": "https://registry.npmjs.org/@esbuild/netbsd-arm64/-/netbsd-arm64-0.25.5.tgz", - "integrity": "sha512-pwHtMP9viAy1oHPvgxtOv+OkduK5ugofNTVDilIzBLpoWAM16r7b/mxBvfpuQDpRQFMfuVr5aLcn4yveGvBZvw==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "netbsd" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/netbsd-x64": { - "version": "0.25.5", - "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.25.5.tgz", - "integrity": "sha512-WOb5fKrvVTRMfWFNCroYWWklbnXH0Q5rZppjq0vQIdlsQKuw6mdSihwSo4RV/YdQ5UCKKvBy7/0ZZYLBZKIbwQ==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "netbsd" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/openbsd-arm64": { - "version": "0.25.5", - "resolved": "https://registry.npmjs.org/@esbuild/openbsd-arm64/-/openbsd-arm64-0.25.5.tgz", - "integrity": "sha512-7A208+uQKgTxHd0G0uqZO8UjK2R0DDb4fDmERtARjSHWxqMTye4Erz4zZafx7Di9Cv+lNHYuncAkiGFySoD+Mw==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "openbsd" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/openbsd-x64": { - "version": "0.25.5", - "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.25.5.tgz", - "integrity": "sha512-G4hE405ErTWraiZ8UiSoesH8DaCsMm0Cay4fsFWOOUcz8b8rC6uCvnagr+gnioEjWn0wC+o1/TAHt+It+MpIMg==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "openbsd" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/sunos-x64": { - "version": "0.25.5", - "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.25.5.tgz", - "integrity": "sha512-l+azKShMy7FxzY0Rj4RCt5VD/q8mG/e+mDivgspo+yL8zW7qEwctQ6YqKX34DTEleFAvCIUviCFX1SDZRSyMQA==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "sunos" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/win32-arm64": { - "version": "0.25.5", - "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.25.5.tgz", - "integrity": "sha512-O2S7SNZzdcFG7eFKgvwUEZ2VG9D/sn/eIiz8XRZ1Q/DO5a3s76Xv0mdBzVM5j5R639lXQmPmSo0iRpHqUUrsxw==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "win32" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/win32-ia32": { - "version": "0.25.5", - "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.25.5.tgz", - "integrity": "sha512-onOJ02pqs9h1iMJ1PQphR+VZv8qBMQ77Klcsqv9CNW2w6yLqoURLcgERAIurY6QE63bbLuqgP9ATqajFLK5AMQ==", - "cpu": [ - "ia32" - ], - "license": "MIT", - "optional": true, - "os": [ - "win32" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/win32-x64": { - "version": "0.25.5", - "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.25.5.tgz", - "integrity": "sha512-TXv6YnJ8ZMVdX+SXWVBo/0p8LTcrUYngpWjvm91TMjjBQii7Oz11Lw5lbDV5Y0TzuhSJHwiH4hEtC1I42mMS0g==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "win32" - ], - "engines": { - "node": ">=18" - } - }, "node_modules/@eslint-community/eslint-utils": { 
"version": "4.7.0", "resolved": "https://registry.npmjs.org/@eslint-community/eslint-utils/-/eslint-utils-4.7.0.tgz", @@ -5999,46 +5599,6 @@ "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/esbuild": { - "version": "0.25.5", - "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.25.5.tgz", - "integrity": "sha512-P8OtKZRv/5J5hhz0cUAdu/cLuPIKXpQl1R9pZtvmHWQvrAUVd0UNIPT4IB4W3rNOqVO0rlqHmCIbSwxh/c9yUQ==", - "hasInstallScript": true, - "license": "MIT", - "bin": { - "esbuild": "bin/esbuild" - }, - "engines": { - "node": ">=18" - }, - "optionalDependencies": { - "@esbuild/aix-ppc64": "0.25.5", - "@esbuild/android-arm": "0.25.5", - "@esbuild/android-arm64": "0.25.5", - "@esbuild/android-x64": "0.25.5", - "@esbuild/darwin-arm64": "0.25.5", - "@esbuild/darwin-x64": "0.25.5", - "@esbuild/freebsd-arm64": "0.25.5", - "@esbuild/freebsd-x64": "0.25.5", - "@esbuild/linux-arm": "0.25.5", - "@esbuild/linux-arm64": "0.25.5", - "@esbuild/linux-ia32": "0.25.5", - "@esbuild/linux-loong64": "0.25.5", - "@esbuild/linux-mips64el": "0.25.5", - "@esbuild/linux-ppc64": "0.25.5", - "@esbuild/linux-riscv64": "0.25.5", - "@esbuild/linux-s390x": "0.25.5", - "@esbuild/linux-x64": "0.25.5", - "@esbuild/netbsd-arm64": "0.25.5", - "@esbuild/netbsd-x64": "0.25.5", - "@esbuild/openbsd-arm64": "0.25.5", - "@esbuild/openbsd-x64": "0.25.5", - "@esbuild/sunos-x64": "0.25.5", - "@esbuild/win32-arm64": "0.25.5", - "@esbuild/win32-ia32": "0.25.5", - "@esbuild/win32-x64": "0.25.5" - } - }, "node_modules/escalade": { "version": "3.2.0", "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz", @@ -6993,6 +6553,7 @@ "version": "2.3.3", "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", + "dev": true, "hasInstallScript": true, "license": "MIT", "optional": true, @@ -7154,6 +6715,7 @@ "version": "4.10.0", "resolved": "https://registry.npmjs.org/get-tsconfig/-/get-tsconfig-4.10.0.tgz", "integrity": "sha512-kGzZ3LWWQcGIAmg6iWvXn0ei6WDtV26wzHRMwDSzmAbcXrTEXxHy6IehI6/4eT6VRKyMP1eF1VqwrVUmE/LR7A==", + "dev": true, "license": "MIT", "dependencies": { "resolve-pkg-maps": "^1.0.0" @@ -9537,9 +9099,10 @@ "license": "MIT" }, "node_modules/llama-stack-client": { - "version": "0.2.13", - "resolved": "https://registry.npmjs.org/llama-stack-client/-/llama-stack-client-0.2.13.tgz", - "integrity": "sha512-R1rTFLwgUimr+KjEUkzUvFL6vLASwS9qj3UDSVkJ5BmrKAs5GwVAMeL7yZaTBXGuPUVh124WSlC4d9H0FjWqLA==", + "version": "0.2.14", + "resolved": "https://registry.npmjs.org/llama-stack-client/-/llama-stack-client-0.2.14.tgz", + "integrity": "sha512-bVU3JHp+EPEKR0Vb9vcd9ZyQj/72jSDuptKLwOXET9WrkphIQ8xuW5ueecMTgq8UEls3lwB3HiZM2cDOR9eDsQ==", + "license": "Apache-2.0", "dependencies": { "@types/node": "^18.11.18", "@types/node-fetch": "^2.6.4", @@ -9547,8 +9110,7 @@ "agentkeepalive": "^4.2.1", "form-data-encoder": "1.7.2", "formdata-node": "^4.3.2", - "node-fetch": "^2.6.7", - "tsx": "^4.19.2" + "node-fetch": "^2.6.7" } }, "node_modules/llama-stack-client/node_modules/@types/node": { @@ -11148,6 +10710,7 @@ "version": "1.0.0", "resolved": "https://registry.npmjs.org/resolve-pkg-maps/-/resolve-pkg-maps-1.0.0.tgz", "integrity": "sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw==", + "dev": true, "license": "MIT", "funding": { "url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1" @@ -12198,25 +11761,6 @@ "integrity": 
"sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==", "license": "0BSD" }, - "node_modules/tsx": { - "version": "4.19.4", - "resolved": "https://registry.npmjs.org/tsx/-/tsx-4.19.4.tgz", - "integrity": "sha512-gK5GVzDkJK1SI1zwHf32Mqxf2tSJkNx+eYcNly5+nHvWqXUJYUkWBQtKauoESz3ymezAI++ZwT855x5p5eop+Q==", - "license": "MIT", - "dependencies": { - "esbuild": "~0.25.0", - "get-tsconfig": "^4.7.5" - }, - "bin": { - "tsx": "dist/cli.mjs" - }, - "engines": { - "node": ">=18.0.0" - }, - "optionalDependencies": { - "fsevents": "~2.3.3" - } - }, "node_modules/tw-animate-css": { "version": "1.2.9", "resolved": "https://registry.npmjs.org/tw-animate-css/-/tw-animate-css-1.2.9.tgz", diff --git a/llama_stack/ui/package.json b/llama_stack/ui/package.json index 9524ce0a5..b38efe309 100644 --- a/llama_stack/ui/package.json +++ b/llama_stack/ui/package.json @@ -20,7 +20,7 @@ "@radix-ui/react-tooltip": "^1.2.6", "class-variance-authority": "^0.7.1", "clsx": "^2.1.1", - "llama-stack-client": "0.2.13", + "llama-stack-client": "^0.2.14", "lucide-react": "^0.510.0", "next": "15.3.3", "next-auth": "^4.24.11",