From 091d8c48f217b413fa267a3c0412c2967be601cd Mon Sep 17 00:00:00 2001 From: grs Date: Tue, 20 May 2025 22:45:11 -0400 Subject: [PATCH 1/7] feat: add additional auth provider that uses oauth token introspection (#2187) # What does this PR do? This adds an alternative option to the oauth_token auth provider that can be used with existing authorization services which support token introspection as defined in RFC 7662. This could be useful where token revocation needs to be handled or where opaque tokens (or other non jwt formatted tokens) are used ## Test Plan Tested against keycloak Signed-off-by: Gordon Sim --- llama_stack/distribution/datatypes.py | 2 +- .../distribution/server/auth_providers.py | 100 +++++++++-- tests/unit/server/test_auth.py | 162 +++++++++++++++++- 3 files changed, 251 insertions(+), 13 deletions(-) diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py index 446a88ca0..be5629ba1 100644 --- a/llama_stack/distribution/datatypes.py +++ b/llama_stack/distribution/datatypes.py @@ -229,7 +229,7 @@ class AuthenticationConfig(BaseModel): ..., description="Type of authentication provider (e.g., 'kubernetes', 'custom')", ) - config: dict[str, str] = Field( + config: dict[str, Any] = Field( ..., description="Provider-specific configuration", ) diff --git a/llama_stack/distribution/server/auth_providers.py b/llama_stack/distribution/server/auth_providers.py index b73fded58..baab75eca 100644 --- a/llama_stack/distribution/server/auth_providers.py +++ b/llama_stack/distribution/server/auth_providers.py @@ -5,15 +5,18 @@ # the root directory of this source tree. 
import json +import ssl import time from abc import ABC, abstractmethod from asyncio import Lock from enum import Enum +from typing import Any from urllib.parse import parse_qs import httpx from jose import jwt -from pydantic import BaseModel, Field, field_validator +from pydantic import BaseModel, Field, field_validator, model_validator +from typing_extensions import Self from llama_stack.distribution.datatypes import AccessAttributes from llama_stack.log import get_logger @@ -85,7 +88,7 @@ class AuthProviderConfig(BaseModel): """Base configuration for authentication providers.""" provider_type: AuthProviderType = Field(..., description="Type of authentication provider") - config: dict[str, str] = Field(..., description="Provider-specific configuration") + config: dict[str, Any] = Field(..., description="Provider-specific configuration") class AuthProvider(ABC): @@ -198,10 +201,21 @@ def get_attributes_from_claims(claims: dict[str, str], mapping: dict[str, str]) return attributes -class OAuth2TokenAuthProviderConfig(BaseModel): +class OAuth2JWKSConfig(BaseModel): # The JWKS URI for collecting public keys - jwks_uri: str + uri: str cache_ttl: int = 3600 + + +class OAuth2IntrospectionConfig(BaseModel): + url: str + client_id: str + client_secret: str + send_secret_in_body: bool = False + tls_cafile: str | None = None + + +class OAuth2TokenAuthProviderConfig(BaseModel): audience: str = "llama-stack" claims_mapping: dict[str, str] = Field( default_factory=lambda: { @@ -214,6 +228,8 @@ class OAuth2TokenAuthProviderConfig(BaseModel): "namespace": "namespaces", }, ) + jwks: OAuth2JWKSConfig | None + introspection: OAuth2IntrospectionConfig | None = None @classmethod @field_validator("claims_mapping") @@ -225,6 +241,14 @@ class OAuth2TokenAuthProviderConfig(BaseModel): raise ValueError(f"claims_mapping value is not a valid attribute: {value}") return v + @model_validator(mode="after") + def validate_mode(self) -> Self: + if not self.jwks and not self.introspection: + 
raise ValueError("One of jwks or introspection must be configured") + if self.jwks and self.introspection: + raise ValueError("At present only one of jwks or introspection should be configured") + return self + class OAuth2TokenAuthProvider(AuthProvider): """ @@ -240,8 +264,17 @@ class OAuth2TokenAuthProvider(AuthProvider): self._jwks_lock = Lock() async def validate_token(self, token: str, scope: dict | None = None) -> TokenValidationResult: + if self.config.jwks: + return await self.validate_jwt_token(token, self.config.jwks, scope) + if self.config.introspection: + return await self.introspect_token(token, self.config.introspection, scope) + raise ValueError("One of jwks or introspection must be configured") + + async def validate_jwt_token( + self, token: str, config: OAuth2JWKSConfig, scope: dict | None = None + ) -> TokenValidationResult: """Validate a token using the JWT token.""" - await self._refresh_jwks() + await self._refresh_jwks(config) try: header = jwt.get_unverified_header(token) @@ -269,14 +302,61 @@ class OAuth2TokenAuthProvider(AuthProvider): access_attributes=access_attributes, ) - async def close(self): - """Close the HTTP client.""" + async def introspect_token( + self, token: str, config: OAuth2IntrospectionConfig, scope: dict | None = None + ) -> TokenValidationResult: + """Validate a token using token introspection as defined by RFC 7662.""" + form = { + "token": token, + } + if config.send_secret_in_body: + form["client_id"] = config.client_id + form["client_secret"] = config.client_secret + auth = None + else: + auth = (config.client_id, config.client_secret) + ssl_ctxt = None + if config.tls_cafile: + ssl_ctxt = ssl.create_default_context(cafile=config.tls_cafile) + try: + async with httpx.AsyncClient(verify=ssl_ctxt) as client: + response = await client.post( + config.url, + data=form, + auth=auth, + timeout=10.0, # Add a reasonable timeout + ) + if response.status_code != 200: + logger.warning(f"Token introspection failed with status 
code: {response.status_code}") + raise ValueError(f"Token introspection failed: {response.status_code}") - async def _refresh_jwks(self) -> None: + fields = response.json() + if not fields["active"]: + raise ValueError("Token not active") + principal = fields["sub"] or fields["username"] + access_attributes = get_attributes_from_claims(fields, self.config.claims_mapping) + return TokenValidationResult( + principal=principal, + access_attributes=access_attributes, + ) + except httpx.TimeoutException: + logger.exception("Token introspection request timed out") + raise + except ValueError: + # Re-raise ValueError exceptions to preserve their message + raise + except Exception as e: + logger.exception("Error during token introspection") + raise ValueError("Token introspection error") from e + + async def close(self): + pass + + async def _refresh_jwks(self, config: OAuth2JWKSConfig) -> None: async with self._jwks_lock: - if time.time() - self._jwks_at > self.config.cache_ttl: + if time.time() - self._jwks_at > config.cache_ttl: async with httpx.AsyncClient() as client: - res = await client.get(self.config.jwks_uri, timeout=5) + res = await client.get(config.uri, timeout=5) res.raise_for_status() jwks_data = res.json()["keys"] updated = {} diff --git a/tests/unit/server/test_auth.py b/tests/unit/server/test_auth.py index f15ca9de4..56458c0e7 100644 --- a/tests/unit/server/test_auth.py +++ b/tests/unit/server/test_auth.py @@ -396,8 +396,10 @@ def oauth2_app(): auth_config = AuthProviderConfig( provider_type=AuthProviderType.OAUTH2_TOKEN, config={ - "jwks_uri": "http://mock-authz-service/token/introspect", - "cache_ttl": "3600", + "jwks": { + "uri": "http://mock-authz-service/token/introspect", + "cache_ttl": "3600", + }, "audience": "llama-stack", }, ) @@ -517,3 +519,159 @@ def test_get_attributes_from_claims(): # TODO: add more tests for oauth2 token provider + + +# oauth token introspection tests +@pytest.fixture +def mock_introspection_endpoint(): + return 
"http://mock-authz-service/token/introspect" + + +@pytest.fixture +def introspection_app(mock_introspection_endpoint): + app = FastAPI() + auth_config = AuthProviderConfig( + provider_type=AuthProviderType.OAUTH2_TOKEN, + config={ + "jwks": None, + "introspection": {"url": mock_introspection_endpoint, "client_id": "myclient", "client_secret": "abcdefg"}, + }, + ) + app.add_middleware(AuthenticationMiddleware, auth_config=auth_config) + + @app.get("/test") + def test_endpoint(): + return {"message": "Authentication successful"} + + return app + + +@pytest.fixture +def introspection_app_with_custom_mapping(mock_introspection_endpoint): + app = FastAPI() + auth_config = AuthProviderConfig( + provider_type=AuthProviderType.OAUTH2_TOKEN, + config={ + "jwks": None, + "introspection": { + "url": mock_introspection_endpoint, + "client_id": "myclient", + "client_secret": "abcdefg", + "send_secret_in_body": "true", + }, + "claims_mapping": { + "sub": "roles", + "scope": "roles", + "groups": "teams", + "aud": "namespaces", + }, + }, + ) + app.add_middleware(AuthenticationMiddleware, auth_config=auth_config) + + @app.get("/test") + def test_endpoint(): + return {"message": "Authentication successful"} + + return app + + +@pytest.fixture +def introspection_client(introspection_app): + return TestClient(introspection_app) + + +@pytest.fixture +def introspection_client_with_custom_mapping(introspection_app_with_custom_mapping): + return TestClient(introspection_app_with_custom_mapping) + + +def test_missing_auth_header_introspection(introspection_client): + response = introspection_client.get("/test") + assert response.status_code == 401 + assert "Missing or invalid Authorization header" in response.json()["error"]["message"] + + +def test_invalid_auth_header_format_introspection(introspection_client): + response = introspection_client.get("/test", headers={"Authorization": "InvalidFormat token123"}) + assert response.status_code == 401 + assert "Missing or invalid Authorization 
header" in response.json()["error"]["message"] + + +async def mock_introspection_active(*args, **kwargs): + return MockResponse( + 200, + { + "active": True, + "sub": "my-user", + "groups": ["group1", "group2"], + "scope": "foo bar", + "aud": ["set1", "set2"], + }, + ) + + +async def mock_introspection_inactive(*args, **kwargs): + return MockResponse( + 200, + { + "active": False, + }, + ) + + +async def mock_introspection_invalid(*args, **kwargs): + class InvalidResponse: + def __init__(self, status_code): + self.status_code = status_code + + def json(self): + raise ValueError("Not JSON") + + return InvalidResponse(200) + + +async def mock_introspection_failed(*args, **kwargs): + return MockResponse( + 500, + {}, + ) + + +@patch("httpx.AsyncClient.post", new=mock_introspection_active) +def test_valid_introspection_authentication(introspection_client, valid_api_key): + response = introspection_client.get("/test", headers={"Authorization": f"Bearer {valid_api_key}"}) + assert response.status_code == 200 + assert response.json() == {"message": "Authentication successful"} + + +@patch("httpx.AsyncClient.post", new=mock_introspection_inactive) +def test_inactive_introspection_authentication(introspection_client, invalid_api_key): + response = introspection_client.get("/test", headers={"Authorization": f"Bearer {invalid_api_key}"}) + assert response.status_code == 401 + assert "Token not active" in response.json()["error"]["message"] + + +@patch("httpx.AsyncClient.post", new=mock_introspection_invalid) +def test_invalid_introspection_authentication(introspection_client, invalid_api_key): + response = introspection_client.get("/test", headers={"Authorization": f"Bearer {invalid_api_key}"}) + assert response.status_code == 401 + assert "Not JSON" in response.json()["error"]["message"] + + +@patch("httpx.AsyncClient.post", new=mock_introspection_failed) +def test_failed_introspection_authentication(introspection_client, invalid_api_key): + response = 
introspection_client.get("/test", headers={"Authorization": f"Bearer {invalid_api_key}"}) + assert response.status_code == 401 + assert "Token introspection failed: 500" in response.json()["error"]["message"] + + +@patch("httpx.AsyncClient.post", new=mock_introspection_active) +def test_valid_introspection_with_custom_mapping_authentication( + introspection_client_with_custom_mapping, valid_api_key +): + response = introspection_client_with_custom_mapping.get( + "/test", headers={"Authorization": f"Bearer {valid_api_key}"} + ) + assert response.status_code == 200 + assert response.json() == {"message": "Authentication successful"} From 5a3d777b20ea19870cc4ffec70af31055f1aacbc Mon Sep 17 00:00:00 2001 From: Abhishek koserwal Date: Wed, 21 May 2025 13:55:51 +0530 Subject: [PATCH 2/7] feat: add llama stack rm command (#2127) # What does this PR do? [Provide a short summary of what this PR does and why. Link to relevant issues if applicable.] ``` llama stack rm llamastack-test ``` [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) #225 ## Test Plan [Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.*] [//]: # (## Documentation) --- docs/source/distributions/building_distro.md | 42 +++++++ llama_stack/cli/stack/list_stacks.py | 56 +++++++++ llama_stack/cli/stack/remove.py | 116 +++++++++++++++++++ llama_stack/cli/stack/stack.py | 5 +- 4 files changed, 218 insertions(+), 1 deletion(-) create mode 100644 llama_stack/cli/stack/list_stacks.py create mode 100644 llama_stack/cli/stack/remove.py diff --git a/docs/source/distributions/building_distro.md b/docs/source/distributions/building_distro.md index d9b73c910..0dbabf8aa 100644 --- a/docs/source/distributions/building_distro.md +++ b/docs/source/distributions/building_distro.md @@ -338,6 +338,48 @@ INFO: Application startup complete. 
INFO: Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit) INFO: 2401:db00:35c:2d2b:face:0:c9:0:54678 - "GET /models/list HTTP/1.1" 200 OK ``` +### Listing Distributions +Using the list command, you can view all existing Llama Stack distributions, including stacks built from templates, from scratch, or using custom configuration files. + +``` +llama stack list -h +usage: llama stack list [-h] + +list the build stacks + +options: + -h, --help show this help message and exit +``` + +Example Usage + +``` +llama stack list +``` + +### Removing a Distribution +Use the remove command to delete a distribution you've previously built. + +``` +llama stack rm -h +usage: llama stack rm [-h] [--all] [name] + +Remove the build stack + +positional arguments: + name Name of the stack to delete (default: None) + +options: + -h, --help show this help message and exit + --all, -a Delete all stacks (use with caution) (default: False) +``` + +Example +``` +llama stack rm llamastack-test +``` + +To keep your environment organized and avoid clutter, consider using `llama stack list` to review old or unused distributions and `llama stack rm ` to delete them when they’re no longer needed. ### Troubleshooting diff --git a/llama_stack/cli/stack/list_stacks.py b/llama_stack/cli/stack/list_stacks.py new file mode 100644 index 000000000..2ea0fdeea --- /dev/null +++ b/llama_stack/cli/stack/list_stacks.py @@ -0,0 +1,56 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +import argparse +from pathlib import Path + +from llama_stack.cli.subcommand import Subcommand +from llama_stack.cli.table import print_table + + +class StackListBuilds(Subcommand): + """List built stacks in .llama/distributions directory""" + + def __init__(self, subparsers: argparse._SubParsersAction): + super().__init__() + self.parser = subparsers.add_parser( + "list", + prog="llama stack list", + description="list the build stacks", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + self._add_arguments() + self.parser.set_defaults(func=self._list_stack_command) + + def _get_distribution_dirs(self) -> dict[str, Path]: + """Return a dictionary of distribution names and their paths""" + distributions = {} + dist_dir = Path.home() / ".llama" / "distributions" + + if dist_dir.exists(): + for stack_dir in dist_dir.iterdir(): + if stack_dir.is_dir(): + distributions[stack_dir.name] = stack_dir + return distributions + + def _list_stack_command(self, args: argparse.Namespace) -> None: + distributions = self._get_distribution_dirs() + + if not distributions: + print("No stacks found in ~/.llama/distributions") + return + + headers = ["Stack Name", "Path"] + headers.extend(["Build Config", "Run Config"]) + rows = [] + for name, path in distributions.items(): + row = [name, str(path)] + # Check for build and run config files + build_config = "Yes" if (path / f"{name}-build.yaml").exists() else "No" + run_config = "Yes" if (path / f"{name}-run.yaml").exists() else "No" + row.extend([build_config, run_config]) + rows.append(row) + print_table(rows, headers, separate_rows=True) diff --git a/llama_stack/cli/stack/remove.py b/llama_stack/cli/stack/remove.py new file mode 100644 index 000000000..be7c49a5d --- /dev/null +++ b/llama_stack/cli/stack/remove.py @@ -0,0 +1,116 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import argparse +import shutil +import sys +from pathlib import Path + +from termcolor import cprint + +from llama_stack.cli.subcommand import Subcommand +from llama_stack.cli.table import print_table + + +class StackRemove(Subcommand): + """Remove the build stack""" + + def __init__(self, subparsers: argparse._SubParsersAction): + super().__init__() + self.parser = subparsers.add_parser( + "rm", + prog="llama stack rm", + description="Remove the build stack", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + self._add_arguments() + self.parser.set_defaults(func=self._remove_stack_build_command) + + def _add_arguments(self) -> None: + self.parser.add_argument( + "name", + type=str, + nargs="?", + help="Name of the stack to delete", + ) + self.parser.add_argument( + "--all", + "-a", + action="store_true", + help="Delete all stacks (use with caution)", + ) + + def _get_distribution_dirs(self) -> dict[str, Path]: + """Return a dictionary of distribution names and their paths""" + distributions = {} + dist_dir = Path.home() / ".llama" / "distributions" + + if dist_dir.exists(): + for stack_dir in dist_dir.iterdir(): + if stack_dir.is_dir(): + distributions[stack_dir.name] = stack_dir + return distributions + + def _list_stacks(self) -> None: + """Display available stacks in a table""" + distributions = self._get_distribution_dirs() + if not distributions: + print("No stacks found in ~/.llama/distributions") + return + + headers = ["Stack Name", "Path"] + rows = [[name, str(path)] for name, path in distributions.items()] + print_table(rows, headers, separate_rows=True) + + def _remove_stack_build_command(self, args: argparse.Namespace) -> None: + distributions = self._get_distribution_dirs() + + if args.all: + confirm = input("Are you sure you want to delete ALL stacks? 
[yes-i-really-want/N] ").lower() + if confirm != "yes-i-really-want": + print("Deletion cancelled.") + return + + for name, path in distributions.items(): + try: + shutil.rmtree(path) + print(f"Deleted stack: {name}") + except Exception as e: + cprint( + f"Failed to delete stack {name}: {e}", + color="red", + ) + sys.exit(2) + + if not args.name: + self._list_stacks() + if not args.name: + return + + if args.name not in distributions: + self._list_stacks() + cprint( + f"Stack not found: {args.name}", + color="red", + ) + return + + stack_path = distributions[args.name] + + confirm = input(f"Are you sure you want to delete stack '{args.name}'? [y/N] ").lower() + if confirm != "y": + print("Deletion cancelled.") + return + + try: + shutil.rmtree(stack_path) + print(f"Successfully deleted stack: {args.name}") + except Exception as e: + cprint( + f"Failed to delete stack {args.name}: {e}", + color="red", + ) + sys.exit(2) diff --git a/llama_stack/cli/stack/stack.py b/llama_stack/cli/stack/stack.py index ccf1a5ffc..3aff78e23 100644 --- a/llama_stack/cli/stack/stack.py +++ b/llama_stack/cli/stack/stack.py @@ -7,12 +7,14 @@ import argparse from importlib.metadata import version +from llama_stack.cli.stack.list_stacks import StackListBuilds from llama_stack.cli.stack.utils import print_subcommand_description from llama_stack.cli.subcommand import Subcommand from .build import StackBuild from .list_apis import StackListApis from .list_providers import StackListProviders +from .remove import StackRemove from .run import StackRun @@ -41,5 +43,6 @@ class StackParser(Subcommand): StackListApis.create(subparsers) StackListProviders.create(subparsers) StackRun.create(subparsers) - + StackRemove.create(subparsers) + StackListBuilds.create(subparsers) print_subcommand_description(self.parser, subparsers) From 2890243107c74a7a88b82595db49e9540d0a0561 Mon Sep 17 00:00:00 2001 From: liangwen12year <36004580+liangwen12year@users.noreply.github.com> Date: Wed, 21 May 2025 04:58:45 -0400 
Subject: [PATCH 3/7] =?UTF-8?q?feat(quota):=20add=20server=E2=80=91side=20?= =?UTF-8?q?per=E2=80=91client=20request=20quotas=20(requires=20auth)=20(#2?= =?UTF-8?q?096)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? feat(quota): add server‑side per‑client request quotas (requires auth) Unrestricted usage can lead to runaway costs and fragmented client-side workarounds. This commit introduces a native quota mechanism to the server, giving operators a unified, centrally managed throttle for per-client requests—without needing extra proxies or custom client logic. This helps contain cloud-compute expenses, enables fine-grained usage control, and simplifies deployment and monitoring of Llama Stack services. Quotas are fully opt-in and have no effect unless explicitly configured. Notice that Quotas are fully opt-in and require authentication to be enabled. The 'sqlite' is the only supported quota `type` at this time, any other `type` will be rejected. And the only supported `period` is 'day'. 
Highlights: - Adds `QuotaMiddleware` to enforce per-client request quotas: - Uses `Authorization: Bearer ` (from AuthenticationMiddleware) - Tracks usage via a SQLite-based KV store - Returns 429 when the quota is exceeded - Extends `ServerConfig` with a `quota` section (type + config) - Enforces strict coupling: quotas require authentication or the server will fail to start Behavior changes: - Quotas are disabled by default unless explicitly configured - SQLite defaults to `./quotas.db` if no DB path is set - The server requires authentication when quotas are enabled To enable per-client request quotas in `run.yaml`, add: ``` server: port: 8321 auth: provider_type: "custom" config: endpoint: "https://auth.example.com/validate" quota: type: sqlite config: db_path: ./quotas.db limit: max_requests: 1000 period: day [//]: # (If resolving an issue, uncomment and update the line below) Closes #2093 ## Test Plan [Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.*] [//]: # (## Documentation) Signed-off-by: Wen Liang Co-authored-by: Wen Liang --- docs/source/distributions/configuration.md | 74 ++++++++++++ llama_stack/distribution/datatypes.py | 19 ++- llama_stack/distribution/server/auth.py | 4 + llama_stack/distribution/server/quota.py | 110 ++++++++++++++++++ llama_stack/distribution/server/server.py | 30 +++++ tests/unit/server/test_quota.py | 127 +++++++++++++++++++++ 6 files changed, 363 insertions(+), 1 deletion(-) create mode 100644 llama_stack/distribution/server/quota.py create mode 100644 tests/unit/server/test_quota.py diff --git a/docs/source/distributions/configuration.md b/docs/source/distributions/configuration.md index b62227a84..7a42f503a 100644 --- a/docs/source/distributions/configuration.md +++ b/docs/source/distributions/configuration.md @@ -208,6 +208,80 @@ And must respond with: If no access attributes are returned, the token is used as a namespace. 
+### Quota Configuration + +The `quota` section allows you to enable server-side request throttling for both +authenticated and anonymous clients. This is useful for preventing abuse, enforcing +fairness across tenants, and controlling infrastructure costs without requiring +client-side rate limiting or external proxies. + +Quotas are disabled by default. When enabled, each client is tracked using either: + +* Their authenticated `client_id` (derived from the Bearer token), or +* Their IP address (fallback for anonymous requests) + +Quota state is stored in a SQLite-backed key-value store, and rate limits are applied +within a configurable time window (currently only `day` is supported). + +#### Example + +```yaml +server: + quota: + kvstore: + type: sqlite + db_path: ./quotas.db + anonymous_max_requests: 100 + authenticated_max_requests: 1000 + period: day +``` + +#### Configuration Options + +| Field | Description | +| ---------------------------- | -------------------------------------------------------------------------- | +| `kvstore` | Required. Backend storage config for tracking request counts. | +| `kvstore.type` | Must be `"sqlite"` for now. Other backends may be supported in the future. | +| `kvstore.db_path` | File path to the SQLite database. | +| `anonymous_max_requests` | Max requests per period for unauthenticated clients. | +| `authenticated_max_requests` | Max requests per period for authenticated clients. | +| `period` | Time window for quota enforcement. Only `"day"` is supported. | + +> Note: if `authenticated_max_requests` is set but no authentication provider is +configured, the server will fall back to applying `anonymous_max_requests` to all +clients. 
+ +#### Example with Authentication Enabled + +```yaml +server: + port: 8321 + auth: + provider_type: custom + config: + endpoint: https://auth.example.com/validate + quota: + kvstore: + type: sqlite + db_path: ./quotas.db + anonymous_max_requests: 100 + authenticated_max_requests: 1000 + period: day +``` + +If a client exceeds their limit, the server responds with: + +```http +HTTP/1.1 429 Too Many Requests +Content-Type: application/json + +{ + "error": { + "message": "Quota exceeded" + } +} +``` + ## Extending to handle Safety Configuring Safety can be a little involved so it is instructive to go through an example. diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py index be5629ba1..ca3664828 100644 --- a/llama_stack/distribution/datatypes.py +++ b/llama_stack/distribution/datatypes.py @@ -25,7 +25,7 @@ from llama_stack.apis.tools import Tool, ToolGroup, ToolGroupInput, ToolRuntime from llama_stack.apis.vector_dbs import VectorDB, VectorDBInput from llama_stack.apis.vector_io import VectorIO from llama_stack.providers.datatypes import Api, ProviderSpec -from llama_stack.providers.utils.kvstore.config import KVStoreConfig +from llama_stack.providers.utils.kvstore.config import KVStoreConfig, SqliteKVStoreConfig LLAMA_STACK_BUILD_CONFIG_VERSION = "2" LLAMA_STACK_RUN_CONFIG_VERSION = "2" @@ -235,6 +235,19 @@ class AuthenticationConfig(BaseModel): ) +class QuotaPeriod(str, Enum): + DAY = "day" + + +class QuotaConfig(BaseModel): + kvstore: SqliteKVStoreConfig = Field(description="Config for KV store backend (SQLite only for now)") + anonymous_max_requests: int = Field(default=100, description="Max requests for unauthenticated clients per period") + authenticated_max_requests: int = Field( + default=1000, description="Max requests for authenticated clients per period" + ) + period: QuotaPeriod = Field(default=QuotaPeriod.DAY, description="Quota period to set") + + class ServerConfig(BaseModel): port: int = Field( default=8321, 
@@ -262,6 +275,10 @@ class ServerConfig(BaseModel): default=None, description="The host the server should listen on", ) + quota: QuotaConfig | None = Field( + default=None, + description="Per client quota request configuration", + ) class StackRunConfig(BaseModel): diff --git a/llama_stack/distribution/server/auth.py b/llama_stack/distribution/server/auth.py index 83436c51f..67acffe3e 100644 --- a/llama_stack/distribution/server/auth.py +++ b/llama_stack/distribution/server/auth.py @@ -113,6 +113,10 @@ class AuthenticationMiddleware: "roles": [token], } + # Store the client ID in the request scope so that downstream middleware (like QuotaMiddleware) + # can identify the requester and enforce per-client rate limits. + scope["authenticated_client_id"] = token + # Store attributes in request scope scope["user_attributes"] = user_attributes scope["principal"] = validation_result.principal diff --git a/llama_stack/distribution/server/quota.py b/llama_stack/distribution/server/quota.py new file mode 100644 index 000000000..ddbffae64 --- /dev/null +++ b/llama_stack/distribution/server/quota.py @@ -0,0 +1,110 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import json +import time +from datetime import datetime, timedelta, timezone + +from starlette.types import ASGIApp, Receive, Scope, Send + +from llama_stack.log import get_logger +from llama_stack.providers.utils.kvstore.api import KVStore +from llama_stack.providers.utils.kvstore.config import KVStoreConfig, SqliteKVStoreConfig +from llama_stack.providers.utils.kvstore.kvstore import kvstore_impl + +logger = get_logger(name=__name__, category="quota") + + +class QuotaMiddleware: + """ + ASGI middleware that enforces separate quotas for authenticated and anonymous clients + within a configurable time window. 
+
+    - For authenticated requests, it reads the client ID from the
+      `Authorization: Bearer <token>` header.
+    - For anonymous requests, it falls back to the IP address of the client.
+    Requests are counted in a KV store (e.g., SQLite), and HTTP 429 is returned
+    once a client exceeds its quota.
+    """
+
+    def __init__(
+        self,
+        app: ASGIApp,
+        kv_config: KVStoreConfig,
+        anonymous_max_requests: int,
+        authenticated_max_requests: int,
+        window_seconds: int = 86400,
+    ):
+        self.app = app
+        self.kv_config = kv_config
+        self.kv: KVStore | None = None
+        self.anonymous_max_requests = anonymous_max_requests
+        self.authenticated_max_requests = authenticated_max_requests
+        self.window_seconds = window_seconds
+
+        if isinstance(self.kv_config, SqliteKVStoreConfig):
+            logger.warning(
+                "QuotaMiddleware: Using SQLite backend. Expiry/TTL is not enforced; cleanup is manual. "
+                f"window_seconds={self.window_seconds}"
+            )
+
+    async def _get_kv(self) -> KVStore:
+        if self.kv is None:
+            self.kv = await kvstore_impl(self.kv_config)
+        return self.kv
+
+    async def __call__(self, scope: Scope, receive: Receive, send: Send):
+        if scope["type"] == "http":
+            # pick key & limit based on auth
+            auth_id = scope.get("authenticated_client_id")
+            if auth_id:
+                key_id = auth_id
+                limit = self.authenticated_max_requests
+            else:
+                # fallback to IP
+                client = scope.get("client")
+                key_id = client[0] if client else "anonymous"
+                limit = self.anonymous_max_requests
+
+            current_window = int(time.time() // self.window_seconds)
+            key = f"quota:{key_id}:{current_window}"
+
+            try:
+                kv = await self._get_kv()
+                prev = await kv.get(key) or "0"
+                count = int(prev) + 1
+
+                if int(prev) == 0:
+                    # Set with expiration datetime when it is the first request in the window.
+ expiration = datetime.now(timezone.utc) + timedelta(seconds=self.window_seconds) + await kv.set(key, str(count), expiration=expiration) + else: + await kv.set(key, str(count)) + except Exception: + logger.exception("Failed to access KV store for quota") + return await self._send_error(send, 500, "Quota service error") + + if count > limit: + logger.warning( + "Quota exceeded for client %s: %d/%d", + key_id, + count, + limit, + ) + return await self._send_error(send, 429, "Quota exceeded") + + return await self.app(scope, receive, send) + + async def _send_error(self, send: Send, status: int, message: str): + await send( + { + "type": "http.response.start", + "status": status, + "headers": [[b"content-type", b"application/json"]], + } + ) + body = json.dumps({"error": {"message": message}}).encode() + await send({"type": "http.response.body", "body": body}) diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py index e25bf0817..52f2b71b0 100644 --- a/llama_stack/distribution/server/server.py +++ b/llama_stack/distribution/server/server.py @@ -60,6 +60,7 @@ from llama_stack.providers.utils.telemetry.tracing import ( from .auth import AuthenticationMiddleware from .endpoints import get_all_api_endpoints +from .quota import QuotaMiddleware REPO_ROOT = Path(__file__).parent.parent.parent.parent @@ -434,6 +435,35 @@ def main(args: argparse.Namespace | None = None): if config.server.auth: logger.info(f"Enabling authentication with provider: {config.server.auth.provider_type.value}") app.add_middleware(AuthenticationMiddleware, auth_config=config.server.auth) + else: + if config.server.quota: + quota = config.server.quota + logger.warning( + "Configured authenticated_max_requests (%d) but no auth is enabled; " + "falling back to anonymous_max_requests (%d) for all the requests", + quota.authenticated_max_requests, + quota.anonymous_max_requests, + ) + + if config.server.quota: + logger.info("Enabling quota middleware for 
authenticated and anonymous clients") + + quota = config.server.quota + anonymous_max_requests = quota.anonymous_max_requests + # if auth is disabled, use the anonymous max requests + authenticated_max_requests = quota.authenticated_max_requests if config.server.auth else anonymous_max_requests + + kv_config = quota.kvstore + window_map = {"day": 86400} + window_seconds = window_map[quota.period.value] + + app.add_middleware( + QuotaMiddleware, + kv_config=kv_config, + anonymous_max_requests=anonymous_max_requests, + authenticated_max_requests=authenticated_max_requests, + window_seconds=window_seconds, + ) try: impls = asyncio.run(construct_stack(config)) diff --git a/tests/unit/server/test_quota.py b/tests/unit/server/test_quota.py new file mode 100644 index 000000000..763bf8e94 --- /dev/null +++ b/tests/unit/server/test_quota.py @@ -0,0 +1,127 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import pytest +from fastapi import FastAPI, Request +from fastapi.testclient import TestClient +from starlette.middleware.base import BaseHTTPMiddleware + +from llama_stack.distribution.datatypes import QuotaConfig, QuotaPeriod +from llama_stack.distribution.server.quota import QuotaMiddleware +from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig + + +class InjectClientIDMiddleware(BaseHTTPMiddleware): + """ + Middleware that injects 'authenticated_client_id' to mimic AuthenticationMiddleware. 
+ """ + + def __init__(self, app, client_id="client1"): + super().__init__(app) + self.client_id = client_id + + async def dispatch(self, request: Request, call_next): + request.scope["authenticated_client_id"] = self.client_id + return await call_next(request) + + +def build_quota_config(db_path) -> QuotaConfig: + return QuotaConfig( + kvstore=SqliteKVStoreConfig(db_path=str(db_path)), + anonymous_max_requests=1, + authenticated_max_requests=2, + period=QuotaPeriod.DAY, + ) + + +@pytest.fixture +def auth_app(tmp_path, request): + """ + FastAPI app with InjectClientIDMiddleware and QuotaMiddleware for authenticated testing. + Each test gets its own DB file. + """ + inner_app = FastAPI() + + @inner_app.get("/test") + async def test_endpoint(): + return {"message": "ok"} + + db_path = tmp_path / f"quota_{request.node.name}.db" + quota = build_quota_config(db_path) + + app = InjectClientIDMiddleware( + QuotaMiddleware( + inner_app, + kv_config=quota.kvstore, + anonymous_max_requests=quota.anonymous_max_requests, + authenticated_max_requests=quota.authenticated_max_requests, + window_seconds=86400, + ), + client_id=f"client_{request.node.name}", + ) + return app + + +def test_authenticated_quota_allows_up_to_limit(auth_app): + client = TestClient(auth_app) + assert client.get("/test").status_code == 200 + assert client.get("/test").status_code == 200 + + +def test_authenticated_quota_blocks_after_limit(auth_app): + client = TestClient(auth_app) + client.get("/test") + client.get("/test") + resp = client.get("/test") + assert resp.status_code == 429 + assert resp.json()["error"]["message"] == "Quota exceeded" + + +def test_anonymous_quota_allows_up_to_limit(tmp_path, request): + inner_app = FastAPI() + + @inner_app.get("/test") + async def test_endpoint(): + return {"message": "ok"} + + db_path = tmp_path / f"quota_anon_{request.node.name}.db" + quota = build_quota_config(db_path) + + app = QuotaMiddleware( + inner_app, + kv_config=quota.kvstore, + 
anonymous_max_requests=quota.anonymous_max_requests,
+        authenticated_max_requests=quota.authenticated_max_requests,
+        window_seconds=86400,
+    )
+
+    client = TestClient(app)
+    assert client.get("/test").status_code == 200
+
+
+def test_anonymous_quota_blocks_after_limit(tmp_path, request):
+    inner_app = FastAPI()
+
+    @inner_app.get("/test")
+    async def test_endpoint():
+        return {"message": "ok"}
+
+    db_path = tmp_path / f"quota_anon_{request.node.name}.db"
+    quota = build_quota_config(db_path)
+
+    app = QuotaMiddleware(
+        inner_app,
+        kv_config=quota.kvstore,
+        anonymous_max_requests=quota.anonymous_max_requests,
+        authenticated_max_requests=quota.authenticated_max_requests,
+        window_seconds=86400,
+    )
+
+    client = TestClient(app)
+    client.get("/test")
+    resp = client.get("/test")
+    assert resp.status_code == 429
+    assert resp.json()["error"]["message"] == "Quota exceeded"

From c25acedbcd910c9643269f655b058906ac53a0b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Han?=
Date: Wed, 21 May 2025 16:23:54 +0200
Subject: [PATCH 4/7] chore: remove k8s auth in favor of k8s jwks endpoint (#2216)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# What does this PR do?

Kubernetes since 1.20 exposes a JWKS endpoint that we can use with our
recent oauth2 implementation.

The CI test has been kept intact for validation.
Signed-off-by: Sébastien Han --- .github/workflows/integration-auth-tests.yml | 39 ++++- docs/source/distributions/configuration.md | 68 ++++++-- llama_stack/distribution/datatypes.py | 4 +- llama_stack/distribution/server/auth.py | 5 +- .../distribution/server/auth_providers.py | 162 +++++------------- pyproject.toml | 1 - requirements.txt | 8 - tests/unit/server/test_auth.py | 121 +------------ uv.lock | 98 +---------- 9 files changed, 147 insertions(+), 359 deletions(-) diff --git a/.github/workflows/integration-auth-tests.yml b/.github/workflows/integration-auth-tests.yml index 82a76ad32..994bd1dec 100644 --- a/.github/workflows/integration-auth-tests.yml +++ b/.github/workflows/integration-auth-tests.yml @@ -23,7 +23,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - auth-provider: [kubernetes] + auth-provider: [oauth2_token] fail-fast: false # we want to run all tests regardless of failure steps: @@ -47,29 +47,53 @@ jobs: uses: medyagh/setup-minikube@cea33675329b799adccc9526aa5daccc26cd5052 # v0.0.19 - name: Start minikube - if: ${{ matrix.auth-provider == 'kubernetes' }} + if: ${{ matrix.auth-provider == 'oauth2_token' }} run: | minikube start kubectl get pods -A - name: Configure Kube Auth - if: ${{ matrix.auth-provider == 'kubernetes' }} + if: ${{ matrix.auth-provider == 'oauth2_token' }} run: | kubectl create namespace llama-stack kubectl create serviceaccount llama-stack-auth -n llama-stack kubectl create rolebinding llama-stack-auth-rolebinding --clusterrole=admin --serviceaccount=llama-stack:llama-stack-auth -n llama-stack kubectl create token llama-stack-auth -n llama-stack > llama-stack-auth-token + cat <> $GITHUB_ENV + echo "KUBERNETES_API_SERVER_URL=$(kubectl get --raw /.well-known/openid-configuration| jq -r .jwks_uri)" >> $GITHUB_ENV echo "KUBERNETES_CA_CERT_PATH=$(kubectl config view --minify -o jsonpath='{.clusters[0].cluster.certificate-authority}')" >> $GITHUB_ENV + echo "KUBERNETES_ISSUER=$(kubectl get --raw 
/.well-known/openid-configuration| jq -r .issuer)" >> $GITHUB_ENV + echo "KUBERNETES_AUDIENCE=$(kubectl create token default --duration=1h | cut -d. -f2 | base64 -d | jq -r '.aud[0]')" >> $GITHUB_ENV - name: Set Kube Auth Config and run server env: INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" - if: ${{ matrix.auth-provider == 'kubernetes' }} + if: ${{ matrix.auth-provider == 'oauth2_token' }} run: | run_dir=$(mktemp -d) cat <<'EOF' > $run_dir/run.yaml @@ -81,7 +105,8 @@ jobs: port: 8321 EOF yq eval '.server.auth = {"provider_type": "${{ matrix.auth-provider }}"}' -i $run_dir/run.yaml - yq eval '.server.auth.config = {"api_server_url": "${{ env.KUBERNETES_API_SERVER_URL }}", "ca_cert_path": "${{ env.KUBERNETES_CA_CERT_PATH }}"}' -i $run_dir/run.yaml + yq eval '.server.auth.config = {"tls_cafile": "${{ env.KUBERNETES_CA_CERT_PATH }}", "issuer": "${{ env.KUBERNETES_ISSUER }}", "audience": "${{ env.KUBERNETES_AUDIENCE }}"}' -i $run_dir/run.yaml + yq eval '.server.auth.config.jwks = {"uri": "${{ env.KUBERNETES_API_SERVER_URL }}"}' -i $run_dir/run.yaml cat $run_dir/run.yaml source .venv/bin/activate diff --git a/docs/source/distributions/configuration.md b/docs/source/distributions/configuration.md index 7a42f503a..77b52a621 100644 --- a/docs/source/distributions/configuration.md +++ b/docs/source/distributions/configuration.md @@ -118,11 +118,6 @@ server: port: 8321 # Port to listen on (default: 8321) tls_certfile: "/path/to/cert.pem" # Optional: Path to TLS certificate for HTTPS tls_keyfile: "/path/to/key.pem" # Optional: Path to TLS key for HTTPS - auth: # Optional: Authentication configuration - provider_type: "kubernetes" # Type of auth provider - config: # Provider-specific configuration - api_server_url: "https://kubernetes.default.svc" - ca_cert_path: "/path/to/ca.crt" # Optional: Path to CA certificate ``` ### Authentication Configuration @@ -135,7 +130,7 @@ Authorization: Bearer The server supports multiple authentication providers: -#### Kubernetes 
Provider +#### OAuth 2.0/OpenID Connect Provider with Kubernetes The Kubernetes cluster must be configured to use a service account for authentication. @@ -146,14 +141,67 @@ kubectl create rolebinding llama-stack-auth-rolebinding --clusterrole=admin --se kubectl create token llama-stack-auth -n llama-stack > llama-stack-auth-token ``` -Validates tokens against the Kubernetes API server: +Make sure the `kube-apiserver` runs with `--anonymous-auth=true` to allow unauthenticated requests +and that the correct RoleBinding is created to allow the service account to access the necessary +resources. If that is not the case, you can create a RoleBinding for the service account to access +the necessary resources: + +```yaml +# allow-anonymous-openid.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: allow-anonymous-openid +rules: +- nonResourceURLs: ["/openid/v1/jwks"] + verbs: ["get"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: allow-anonymous-openid +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: allow-anonymous-openid +subjects: +- kind: User + name: system:anonymous + apiGroup: rbac.authorization.k8s.io +``` + +And then apply the configuration: +```bash +kubectl apply -f allow-anonymous-openid.yaml +``` + +Validates tokens against the Kubernetes API server through the OIDC provider: ```yaml server: auth: - provider_type: "kubernetes" + provider_type: "oauth2_token" config: - api_server_url: "https://kubernetes.default.svc" # URL of the Kubernetes API server - ca_cert_path: "/path/to/ca.crt" # Optional: Path to CA certificate + jwks: + uri: "https://kubernetes.default.svc" + cache_ttl: 3600 + tls_cafile: "/path/to/ca.crt" + issuer: "https://kubernetes.default.svc" + audience: "https://kubernetes.default.svc" +``` + +To find your cluster's audience, run: +```bash +kubectl create token default --duration=1h | cut -d. 
-f2 | base64 -d | jq .aud +``` + +For the issuer, you can use the OIDC provider's URL: +```bash +kubectl get --raw /.well-known/openid-configuration| jq .issuer +``` + +For the tls_cafile, you can use the CA certificate of the OIDC provider: +```bash +kubectl config view --minify -o jsonpath='{.clusters[0].cluster.certificate-authority}' ``` The provider extracts user information from the JWT token: diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py index ca3664828..eb790ad93 100644 --- a/llama_stack/distribution/datatypes.py +++ b/llama_stack/distribution/datatypes.py @@ -220,14 +220,14 @@ class LoggingConfig(BaseModel): class AuthProviderType(str, Enum): """Supported authentication provider types.""" - KUBERNETES = "kubernetes" + OAUTH2_TOKEN = "oauth2_token" CUSTOM = "custom" class AuthenticationConfig(BaseModel): provider_type: AuthProviderType = Field( ..., - description="Type of authentication provider (e.g., 'kubernetes', 'custom')", + description="Type of authentication provider", ) config: dict[str, Any] = Field( ..., diff --git a/llama_stack/distribution/server/auth.py b/llama_stack/distribution/server/auth.py index 67acffe3e..fb26b49a7 100644 --- a/llama_stack/distribution/server/auth.py +++ b/llama_stack/distribution/server/auth.py @@ -8,7 +8,8 @@ import json import httpx -from llama_stack.distribution.server.auth_providers import AuthProviderConfig, create_auth_provider +from llama_stack.distribution.datatypes import AuthenticationConfig +from llama_stack.distribution.server.auth_providers import create_auth_provider from llama_stack.log import get_logger logger = get_logger(name=__name__, category="auth") @@ -77,7 +78,7 @@ class AuthenticationMiddleware: access resources that don't have access_attributes defined. 
""" - def __init__(self, app, auth_config: AuthProviderConfig): + def __init__(self, app, auth_config: AuthenticationConfig): self.app = app self.auth_provider = create_auth_provider(auth_config) diff --git a/llama_stack/distribution/server/auth_providers.py b/llama_stack/distribution/server/auth_providers.py index baab75eca..39f258c3b 100644 --- a/llama_stack/distribution/server/auth_providers.py +++ b/llama_stack/distribution/server/auth_providers.py @@ -4,13 +4,11 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -import json import ssl import time from abc import ABC, abstractmethod from asyncio import Lock -from enum import Enum -from typing import Any +from pathlib import Path from urllib.parse import parse_qs import httpx @@ -18,7 +16,7 @@ from jose import jwt from pydantic import BaseModel, Field, field_validator, model_validator from typing_extensions import Self -from llama_stack.distribution.datatypes import AccessAttributes +from llama_stack.distribution.datatypes import AccessAttributes, AuthenticationConfig, AuthProviderType from llama_stack.log import get_logger logger = get_logger(name=__name__, category="auth") @@ -76,21 +74,6 @@ class AuthRequest(BaseModel): request: AuthRequestContext = Field(description="Context information about the request being authenticated") -class AuthProviderType(str, Enum): - """Supported authentication provider types.""" - - KUBERNETES = "kubernetes" - CUSTOM = "custom" - OAUTH2_TOKEN = "oauth2_token" - - -class AuthProviderConfig(BaseModel): - """Base configuration for authentication providers.""" - - provider_type: AuthProviderType = Field(..., description="Type of authentication provider") - config: dict[str, Any] = Field(..., description="Provider-specific configuration") - - class AuthProvider(ABC): """Abstract base class for authentication providers.""" @@ -105,83 +88,6 @@ class AuthProvider(ABC): pass -class 
KubernetesAuthProviderConfig(BaseModel): - api_server_url: str - ca_cert_path: str | None = None - - -class KubernetesAuthProvider(AuthProvider): - """Kubernetes authentication provider that validates tokens against the Kubernetes API server.""" - - def __init__(self, config: KubernetesAuthProviderConfig): - self.config = config - self._client = None - - async def _get_client(self): - """Get or create a Kubernetes client.""" - if self._client is None: - # kubernetes-client has not async support, see: - # https://github.com/kubernetes-client/python/issues/323 - from kubernetes import client - from kubernetes.client import ApiClient - - # Configure the client - configuration = client.Configuration() - configuration.host = self.config.api_server_url - if self.config.ca_cert_path: - configuration.ssl_ca_cert = self.config.ca_cert_path - configuration.verify_ssl = bool(self.config.ca_cert_path) - - # Create API client - self._client = ApiClient(configuration) - return self._client - - async def validate_token(self, token: str, scope: dict | None = None) -> TokenValidationResult: - """Validate a Kubernetes token and return access attributes.""" - try: - client = await self._get_client() - - # Set the token in the client - client.set_default_header("Authorization", f"Bearer {token}") - - # Make a request to validate the token - # We use the /api endpoint which requires authentication - from kubernetes.client import CoreV1Api - - api = CoreV1Api(client) - api.get_api_resources(_request_timeout=3.0) # Set timeout for this specific request - - # If we get here, the token is valid - # Extract user info from the token claims - import base64 - - # Decode the token (without verification since we've already validated it) - token_parts = token.split(".") - payload = json.loads(base64.b64decode(token_parts[1] + "=" * (-len(token_parts[1]) % 4))) - - # Extract user information from the token - username = payload.get("sub", "") - groups = payload.get("groups", []) - - return 
TokenValidationResult( - principal=username, - access_attributes=AccessAttributes( - roles=[username], # Use username as a role - teams=groups, # Use Kubernetes groups as teams - ), - ) - - except Exception as e: - logger.exception("Failed to validate Kubernetes token") - raise ValueError("Invalid or expired token") from e - - async def close(self): - """Close the HTTP client.""" - if self._client: - self._client.close() - self._client = None - - def get_attributes_from_claims(claims: dict[str, str], mapping: dict[str, str]) -> AccessAttributes: attributes = AccessAttributes() for claim_key, attribute_key in mapping.items(): @@ -212,11 +118,13 @@ class OAuth2IntrospectionConfig(BaseModel): client_id: str client_secret: str send_secret_in_body: bool = False - tls_cafile: str | None = None class OAuth2TokenAuthProviderConfig(BaseModel): audience: str = "llama-stack" + verify_tls: bool = True + tls_cafile: Path | None = None + issuer: str | None = Field(default=None, description="The OIDC issuer URL.") claims_mapping: dict[str, str] = Field( default_factory=lambda: { "sub": "roles", @@ -265,16 +173,14 @@ class OAuth2TokenAuthProvider(AuthProvider): async def validate_token(self, token: str, scope: dict | None = None) -> TokenValidationResult: if self.config.jwks: - return await self.validate_jwt_token(token, self.config.jwks, scope) + return await self.validate_jwt_token(token, scope) if self.config.introspection: - return await self.introspect_token(token, self.config.introspection, scope) + return await self.introspect_token(token, scope) raise ValueError("One of jwks or introspection must be configured") - async def validate_jwt_token( - self, token: str, config: OAuth2JWKSConfig, scope: dict | None = None - ) -> TokenValidationResult: + async def validate_jwt_token(self, token: str, scope: dict | None = None) -> TokenValidationResult: """Validate a token using the JWT token.""" - await self._refresh_jwks(config) + await self._refresh_jwks() try: header = 
jwt.get_unverified_header(token) @@ -288,7 +194,7 @@ class OAuth2TokenAuthProvider(AuthProvider): key_data, algorithms=[algorithm], audience=self.config.audience, - options={"verify_exp": True}, + issuer=self.config.issuer, ) except Exception as exc: raise ValueError(f"Invalid JWT token: {token}") from exc @@ -302,26 +208,27 @@ class OAuth2TokenAuthProvider(AuthProvider): access_attributes=access_attributes, ) - async def introspect_token( - self, token: str, config: OAuth2IntrospectionConfig, scope: dict | None = None - ) -> TokenValidationResult: + async def introspect_token(self, token: str, scope: dict | None = None) -> TokenValidationResult: """Validate a token using token introspection as defined by RFC 7662.""" form = { "token": token, } - if config.send_secret_in_body: - form["client_id"] = config.client_id - form["client_secret"] = config.client_secret + if self.config.introspection is None: + raise ValueError("Introspection is not configured") + + if self.config.introspection.send_secret_in_body: + form["client_id"] = self.config.introspection.client_id + form["client_secret"] = self.config.introspection.client_secret auth = None else: - auth = (config.client_id, config.client_secret) + auth = (self.config.introspection.client_id, self.config.introspection.client_secret) ssl_ctxt = None - if config.tls_cafile: - ssl_ctxt = ssl.create_default_context(cafile=config.tls_cafile) + if self.config.tls_cafile: + ssl_ctxt = ssl.create_default_context(cafile=self.config.tls_cafile.as_posix()) try: async with httpx.AsyncClient(verify=ssl_ctxt) as client: response = await client.post( - config.url, + self.config.introspection.url, data=form, auth=auth, timeout=10.0, # Add a reasonable timeout @@ -352,11 +259,24 @@ class OAuth2TokenAuthProvider(AuthProvider): async def close(self): pass - async def _refresh_jwks(self, config: OAuth2JWKSConfig) -> None: + async def _refresh_jwks(self) -> None: + """ + Refresh the JWKS cache. 
+ + This is a simple cache that expires after a certain amount of time (defined by `cache_ttl`). + If the cache is expired, we refresh the JWKS from the JWKS URI. + + Notes: for Kubernetes which doesn't fully implement the OIDC protocol: + * It doesn't have user authentication flows + * It doesn't have refresh tokens + """ async with self._jwks_lock: - if time.time() - self._jwks_at > config.cache_ttl: - async with httpx.AsyncClient() as client: - res = await client.get(config.uri, timeout=5) + if self.config.jwks is None: + raise ValueError("JWKS is not configured") + if time.time() - self._jwks_at > self.config.jwks.cache_ttl: + verify = self.config.tls_cafile.as_posix() if self.config.tls_cafile else self.config.verify_tls + async with httpx.AsyncClient(verify=verify) as client: + res = await client.get(self.config.jwks.uri, timeout=5) res.raise_for_status() jwks_data = res.json()["keys"] updated = {} @@ -443,13 +363,11 @@ class CustomAuthProvider(AuthProvider): self._client = None -def create_auth_provider(config: AuthProviderConfig) -> AuthProvider: +def create_auth_provider(config: AuthenticationConfig) -> AuthProvider: """Factory function to create the appropriate auth provider.""" provider_type = config.provider_type.lower() - if provider_type == "kubernetes": - return KubernetesAuthProvider(KubernetesAuthProviderConfig.model_validate(config.config)) - elif provider_type == "custom": + if provider_type == "custom": return CustomAuthProvider(CustomAuthProviderConfig.model_validate(config.config)) elif provider_type == "oauth2_token": return OAuth2TokenAuthProvider(OAuth2TokenAuthProviderConfig.model_validate(config.config)) diff --git a/pyproject.toml b/pyproject.toml index a41830e64..8b922bafb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,6 @@ dependencies = [ "tiktoken", "pillow", "h11>=0.16.0", - "kubernetes", ] [project.optional-dependencies] diff --git a/requirements.txt b/requirements.txt index 6dfcc1024..2fe72c803 100644 --- 
a/requirements.txt +++ b/requirements.txt @@ -4,19 +4,16 @@ annotated-types==0.7.0 anyio==4.8.0 attrs==25.1.0 blobfile==3.0.0 -cachetools==5.5.2 certifi==2025.1.31 charset-normalizer==3.4.1 click==8.1.8 colorama==0.4.6 ; sys_platform == 'win32' distro==1.9.0 -durationpy==0.9 ecdsa==0.19.1 exceptiongroup==1.2.2 ; python_full_version < '3.11' filelock==3.17.0 fire==0.7.0 fsspec==2024.12.0 -google-auth==2.38.0 h11==0.16.0 httpcore==1.0.9 httpx==0.28.1 @@ -26,14 +23,12 @@ jinja2==3.1.6 jiter==0.8.2 jsonschema==4.23.0 jsonschema-specifications==2024.10.1 -kubernetes==32.0.1 llama-stack-client==0.2.7 lxml==5.3.1 markdown-it-py==3.0.0 markupsafe==3.0.2 mdurl==0.1.2 numpy==2.2.3 -oauthlib==3.2.2 openai==1.71.0 packaging==24.2 pandas==2.2.3 @@ -41,7 +36,6 @@ pillow==11.1.0 prompt-toolkit==3.0.50 pyaml==25.1.0 pyasn1==0.4.8 -pyasn1-modules==0.4.1 pycryptodomex==3.21.0 pydantic==2.10.6 pydantic-core==2.27.2 @@ -54,7 +48,6 @@ pyyaml==6.0.2 referencing==0.36.2 regex==2024.11.6 requests==2.32.3 -requests-oauthlib==2.0.0 rich==13.9.4 rpds-py==0.22.3 rsa==4.9 @@ -68,4 +61,3 @@ typing-extensions==4.12.2 tzdata==2025.1 urllib3==2.3.0 wcwidth==0.2.13 -websocket-client==1.8.0 diff --git a/tests/unit/server/test_auth.py b/tests/unit/server/test_auth.py index 56458c0e7..94c486f18 100644 --- a/tests/unit/server/test_auth.py +++ b/tests/unit/server/test_auth.py @@ -11,12 +11,10 @@ import pytest from fastapi import FastAPI from fastapi.testclient import TestClient -from llama_stack.distribution.datatypes import AccessAttributes +from llama_stack.distribution.datatypes import AuthenticationConfig from llama_stack.distribution.server.auth import AuthenticationMiddleware from llama_stack.distribution.server.auth_providers import ( - AuthProviderConfig, AuthProviderType, - TokenValidationResult, get_attributes_from_claims, ) @@ -62,7 +60,7 @@ def invalid_token(): @pytest.fixture def http_app(mock_auth_endpoint): app = FastAPI() - auth_config = AuthProviderConfig( + auth_config = 
AuthenticationConfig( provider_type=AuthProviderType.CUSTOM, config={"endpoint": mock_auth_endpoint}, ) @@ -78,7 +76,7 @@ def http_app(mock_auth_endpoint): @pytest.fixture def k8s_app(): app = FastAPI() - auth_config = AuthProviderConfig( + auth_config = AuthenticationConfig( provider_type=AuthProviderType.KUBERNETES, config={"api_server_url": "https://kubernetes.default.svc"}, ) @@ -118,7 +116,7 @@ def mock_scope(): @pytest.fixture def mock_http_middleware(mock_auth_endpoint): mock_app = AsyncMock() - auth_config = AuthProviderConfig( + auth_config = AuthenticationConfig( provider_type=AuthProviderType.CUSTOM, config={"endpoint": mock_auth_endpoint}, ) @@ -128,7 +126,7 @@ def mock_http_middleware(mock_auth_endpoint): @pytest.fixture def mock_k8s_middleware(): mock_app = AsyncMock() - auth_config = AuthProviderConfig( + auth_config = AuthenticationConfig( provider_type=AuthProviderType.KUBERNETES, config={"api_server_url": "https://kubernetes.default.svc"}, ) @@ -284,116 +282,13 @@ async def test_http_middleware_no_attributes(mock_http_middleware, mock_scope): assert attributes["roles"] == ["test.jwt.token"] -# Kubernetes Tests -def test_missing_auth_header_k8s(k8s_client): - response = k8s_client.get("/test") - assert response.status_code == 401 - assert "Missing or invalid Authorization header" in response.json()["error"]["message"] - - -def test_invalid_auth_header_format_k8s(k8s_client): - response = k8s_client.get("/test", headers={"Authorization": "InvalidFormat token123"}) - assert response.status_code == 401 - assert "Missing or invalid Authorization header" in response.json()["error"]["message"] - - -@patch("kubernetes.client.ApiClient") -def test_valid_k8s_authentication(mock_api_client, k8s_client, valid_token): - # Mock the Kubernetes client - mock_client = AsyncMock() - mock_api_client.return_value = mock_client - - # Mock successful token validation - mock_client.set_default_header = AsyncMock() - - # Mock the token validation to return valid access 
attributes - with patch("llama_stack.distribution.server.auth_providers.KubernetesAuthProvider.validate_token") as mock_validate: - mock_validate.return_value = TokenValidationResult( - principal="test-principal", - access_attributes=AccessAttributes( - roles=["admin"], teams=["ml-team"], projects=["llama-3"], namespaces=["research"] - ), - ) - response = k8s_client.get("/test", headers={"Authorization": f"Bearer {valid_token}"}) - assert response.status_code == 200 - assert response.json() == {"message": "Authentication successful"} - - -@patch("kubernetes.client.ApiClient") -def test_invalid_k8s_authentication(mock_api_client, k8s_client, invalid_token): - # Mock the Kubernetes client - mock_client = AsyncMock() - mock_api_client.return_value = mock_client - - # Mock failed token validation by raising an exception - with patch("llama_stack.distribution.server.auth_providers.KubernetesAuthProvider.validate_token") as mock_validate: - mock_validate.side_effect = ValueError("Invalid or expired token") - response = k8s_client.get("/test", headers={"Authorization": f"Bearer {invalid_token}"}) - assert response.status_code == 401 - assert "Invalid or expired token" in response.json()["error"]["message"] - - -@pytest.mark.asyncio -async def test_k8s_middleware_with_access_attributes(mock_k8s_middleware, mock_scope): - middleware, mock_app = mock_k8s_middleware - mock_receive = AsyncMock() - mock_send = AsyncMock() - - with patch("kubernetes.client.ApiClient") as mock_api_client: - mock_client = AsyncMock() - mock_api_client.return_value = mock_client - - # Mock successful token validation - mock_client.set_default_header = AsyncMock() - - # Mock token payload with access attributes - mock_token_parts = ["header", "eyJzdWIiOiJhZG1pbiIsImdyb3VwcyI6WyJtbC10ZWFtIl19", "signature"] - mock_scope["headers"][1] = (b"authorization", f"Bearer {'.'.join(mock_token_parts)}".encode()) - - await middleware(mock_scope, mock_receive, mock_send) - - assert "user_attributes" in 
mock_scope - assert mock_scope["user_attributes"]["roles"] == ["admin"] - assert mock_scope["user_attributes"]["teams"] == ["ml-team"] - - mock_app.assert_called_once_with(mock_scope, mock_receive, mock_send) - - -@pytest.mark.asyncio -async def test_k8s_middleware_no_attributes(mock_k8s_middleware, mock_scope): - """Test middleware behavior with no access attributes""" - middleware, mock_app = mock_k8s_middleware - mock_receive = AsyncMock() - mock_send = AsyncMock() - - with patch("kubernetes.client.ApiClient") as mock_api_client: - mock_client = AsyncMock() - mock_api_client.return_value = mock_client - - # Mock successful token validation - mock_client.set_default_header = AsyncMock() - - # Mock token payload without access attributes - mock_token_parts = ["header", "eyJzdWIiOiJhZG1pbiJ9", "signature"] - mock_scope["headers"][1] = (b"authorization", f"Bearer {'.'.join(mock_token_parts)}".encode()) - - await middleware(mock_scope, mock_receive, mock_send) - - assert "user_attributes" in mock_scope - attributes = mock_scope["user_attributes"] - assert "roles" in attributes - assert attributes["roles"] == ["admin"] - - mock_app.assert_called_once_with(mock_scope, mock_receive, mock_send) - - # oauth2 token provider tests @pytest.fixture def oauth2_app(): app = FastAPI() - auth_config = AuthProviderConfig( + auth_config = AuthenticationConfig( provider_type=AuthProviderType.OAUTH2_TOKEN, config={ "jwks": { @@ -530,7 +425,7 @@ def mock_introspection_endpoint(): @pytest.fixture def introspection_app(mock_introspection_endpoint): app = FastAPI() - auth_config = AuthProviderConfig( + auth_config = AuthenticationConfig( provider_type=AuthProviderType.OAUTH2_TOKEN, config={ "jwks": None, @@ -549,7 +444,7 @@ def introspection_app(mock_introspection_endpoint): @pytest.fixture def introspection_app_with_custom_mapping(mock_introspection_endpoint): app = FastAPI() - auth_config = AuthProviderConfig( + auth_config = AuthenticationConfig( 
provider_type=AuthProviderType.OAUTH2_TOKEN, config={ "jwks": None, diff --git a/uv.lock b/uv.lock index c30e2c4c1..a987ddc9e 100644 --- a/uv.lock +++ b/uv.lock @@ -676,15 +676,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8f/d7/9322c609343d929e75e7e5e6255e614fcc67572cfd083959cdef3b7aad79/docutils-0.21.2-py3-none-any.whl", hash = "sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2", size = 587408 }, ] -[[package]] -name = "durationpy" -version = "0.9" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/31/e9/f49c4e7fccb77fa5c43c2480e09a857a78b41e7331a75e128ed5df45c56b/durationpy-0.9.tar.gz", hash = "sha256:fd3feb0a69a0057d582ef643c355c40d2fa1c942191f914d12203b1a01ac722a", size = 3186 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4c/a3/ac312faeceffd2d8f86bc6dcb5c401188ba5a01bc88e69bed97578a0dfcd/durationpy-0.9-py3-none-any.whl", hash = "sha256:e65359a7af5cedad07fb77a2dd3f390f8eb0b74cb845589fa6c057086834dd38", size = 3461 }, -] - [[package]] name = "ecdsa" version = "0.19.1" @@ -863,20 +854,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1d/9a/4114a9057db2f1462d5c8f8390ab7383925fe1ac012eaa42402ad65c2963/GitPython-3.1.44-py3-none-any.whl", hash = "sha256:9e0e10cda9bed1ee64bc9a6de50e7e38a9c9943241cd7f585f6df3ed28011110", size = 207599 }, ] -[[package]] -name = "google-auth" -version = "2.38.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "cachetools" }, - { name = "pyasn1-modules" }, - { name = "rsa" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/c6/eb/d504ba1daf190af6b204a9d4714d457462b486043744901a6eeea711f913/google_auth-2.38.0.tar.gz", hash = "sha256:8285113607d3b80a3f1543b75962447ba8a09fe85783432a784fdeef6ac094c4", size = 270866 } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/9d/47/603554949a37bca5b7f894d51896a9c534b9eab808e2520a748e081669d0/google_auth-2.38.0-py2.py3-none-any.whl", hash = "sha256:e7dae6694313f434a2727bf2906f27ad259bae090d7aa896590d86feec3d9d4a", size = 210770 }, -] - [[package]] name = "googleapis-common-protos" version = "1.67.0" @@ -1324,28 +1301,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c9/fb/108ecd1fe961941959ad0ee4e12ee7b8b1477247f30b1fdfd83ceaf017f0/jupyter_core-5.7.2-py3-none-any.whl", hash = "sha256:4f7315d2f6b4bcf2e3e7cb6e46772eba760ae459cd1f59d29eb57b0a01bd7409", size = 28965 }, ] -[[package]] -name = "kubernetes" -version = "32.0.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "certifi" }, - { name = "durationpy" }, - { name = "google-auth" }, - { name = "oauthlib" }, - { name = "python-dateutil" }, - { name = "pyyaml" }, - { name = "requests" }, - { name = "requests-oauthlib" }, - { name = "six" }, - { name = "urllib3" }, - { name = "websocket-client" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b7/e8/0598f0e8b4af37cd9b10d8b87386cf3173cb8045d834ab5f6ec347a758b3/kubernetes-32.0.1.tar.gz", hash = "sha256:42f43d49abd437ada79a79a16bd48a604d3471a117a8347e87db693f2ba0ba28", size = 946691 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/08/10/9f8af3e6f569685ce3af7faab51c8dd9d93b9c38eba339ca31c746119447/kubernetes-32.0.1-py2.py3-none-any.whl", hash = "sha256:35282ab8493b938b08ab5526c7ce66588232df00ef5e1dbe88a419107dc10998", size = 1988070 }, -] - [[package]] name = "levenshtein" version = "0.27.1" @@ -1441,7 +1396,6 @@ dependencies = [ { name = "huggingface-hub" }, { name = "jinja2" }, { name = "jsonschema" }, - { name = "kubernetes" }, { name = "llama-stack-client" }, { name = "openai" }, { name = "pillow" }, @@ -1546,7 +1500,6 @@ requires-dist = [ { name = "jinja2", specifier = ">=3.1.6" }, { name = "jinja2", marker = "extra == 'codegen'", specifier = ">=3.1.6" }, { name = 
"jsonschema" }, - { name = "kubernetes" }, { name = "llama-stack-client", specifier = ">=0.2.7" }, { name = "llama-stack-client", marker = "extra == 'ui'", specifier = ">=0.2.7" }, { name = "mcp", marker = "extra == 'test'" }, @@ -1624,9 +1577,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/cd/6b/31c07396c5b3010668e4eb38061a96ffacb47ec4b14d8aeb64c13856c485/llama_stack_client-0.2.7.tar.gz", hash = "sha256:11aee11fdd5e0e8caad07c0cce9c4d88640938844372e7e3453a91ea0757fcb3", size = 259273, upload-time = "2025-05-16T20:31:39.221Z" } +sdist = { url = "https://files.pythonhosted.org/packages/cd/6b/31c07396c5b3010668e4eb38061a96ffacb47ec4b14d8aeb64c13856c485/llama_stack_client-0.2.7.tar.gz", hash = "sha256:11aee11fdd5e0e8caad07c0cce9c4d88640938844372e7e3453a91ea0757fcb3", size = 259273 } wheels = [ - { url = "https://files.pythonhosted.org/packages/ac/69/6a5f4683afe355500df4376fdcbfb2fc1e6a0c3bcea5ff8f6114773a9acf/llama_stack_client-0.2.7-py3-none-any.whl", hash = "sha256:78b3f2abdb1770c7b1270a9c0ef58402a988401c564d2e6c83588779ac6fc38d", size = 292727, upload-time = "2025-05-16T20:31:37.587Z" }, + { url = "https://files.pythonhosted.org/packages/ac/69/6a5f4683afe355500df4376fdcbfb2fc1e6a0c3bcea5ff8f6114773a9acf/llama_stack_client-0.2.7-py3-none-any.whl", hash = "sha256:78b3f2abdb1770c7b1270a9c0ef58402a988401c564d2e6c83588779ac6fc38d", size = 292727 }, ] [[package]] @@ -2087,15 +2040,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/17/7f/d322a4125405920401450118dbdc52e0384026bd669939484670ce8b2ab9/numpy-2.2.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:783145835458e60fa97afac25d511d00a1eca94d4a8f3ace9fe2043003c678e4", size = 12839607 }, ] -[[package]] -name = "oauthlib" -version = "3.2.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/6d/fa/fbf4001037904031639e6bfbfc02badfc7e12f137a8afa254df6c4c8a670/oauthlib-3.2.2.tar.gz", hash = "sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918", size = 177352 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7e/80/cab10959dc1faead58dc8384a781dfbf93cb4d33d50988f7a69f1b7c9bbe/oauthlib-3.2.2-py3-none-any.whl", hash = "sha256:8139f29aac13e25d502680e9e19963e83f16838d48a0d71c287fe40e7067fbca", size = 151688 }, -] - [[package]] name = "openai" version = "1.71.0" @@ -2608,18 +2552,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/62/1e/a94a8d635fa3ce4cfc7f506003548d0a2447ae76fd5ca53932970fe3053f/pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d", size = 77145 }, ] -[[package]] -name = "pyasn1-modules" -version = "0.4.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pyasn1" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/1d/67/6afbf0d507f73c32d21084a79946bfcfca5fbc62a72057e9c23797a737c9/pyasn1_modules-0.4.1.tar.gz", hash = "sha256:c28e2dbf9c06ad61c71a075c7e0f9fd0f1b0bb2d2ad4377f240d33ac2ab60a7c", size = 310028 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/77/89/bc88a6711935ba795a679ea6ebee07e128050d6382eaa35a0a47c8032bdc/pyasn1_modules-0.4.1-py3-none-any.whl", hash = "sha256:49bfa96b45a292b711e986f222502c1c9a5e1f4e568fc30e2574a6c7d07838fd", size = 181537 }, -] - [[package]] name = "pycparser" version = "2.22" @@ -2875,9 +2807,9 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "pytest" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ac/82/4c9ecabab13363e72d880f2fb504c5f750433b2b6f16e99f4ec21ada284c/pytest_timeout-2.4.0.tar.gz", hash = "sha256:7e68e90b01f9eff71332b25001f85c75495fc4e3a836701876183c4bcfd0540a", size = 17973, upload-time = "2025-05-05T19:44:34.99Z" } +sdist = { url = 
"https://files.pythonhosted.org/packages/ac/82/4c9ecabab13363e72d880f2fb504c5f750433b2b6f16e99f4ec21ada284c/pytest_timeout-2.4.0.tar.gz", hash = "sha256:7e68e90b01f9eff71332b25001f85c75495fc4e3a836701876183c4bcfd0540a", size = 17973 } wheels = [ - { url = "https://files.pythonhosted.org/packages/fa/b6/3127540ecdf1464a00e5a01ee60a1b09175f6913f0644ac748494d9c4b21/pytest_timeout-2.4.0-py3-none-any.whl", hash = "sha256:c42667e5cdadb151aeb5b26d114aff6bdf5a907f176a007a30b940d3d865b5c2", size = 14382, upload-time = "2025-05-05T19:44:33.502Z" }, + { url = "https://files.pythonhosted.org/packages/fa/b6/3127540ecdf1464a00e5a01ee60a1b09175f6913f0644ac748494d9c4b21/pytest_timeout-2.4.0-py3-none-any.whl", hash = "sha256:c42667e5cdadb151aeb5b26d114aff6bdf5a907f176a007a30b940d3d865b5c2", size = 14382 }, ] [[package]] @@ -3256,19 +3188,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928 }, ] -[[package]] -name = "requests-oauthlib" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "oauthlib" }, - { name = "requests" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/42/f2/05f29bc3913aea15eb670be136045bf5c5bbf4b99ecb839da9b422bb2c85/requests-oauthlib-2.0.0.tar.gz", hash = "sha256:b3dffaebd884d8cd778494369603a9e7b58d29111bf6b41bdc2dcd87203af4e9", size = 55650 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3b/5d/63d4ae3b9daea098d5d6f5da83984853c1bbacd5dc826764b249fe119d24/requests_oauthlib-2.0.0-py2.py3-none-any.whl", hash = "sha256:7dd8a5c40426b779b0868c404bdef9768deccf22749cde15852df527e6269b36", size = 24179 }, -] - [[package]] name = "rich" version = "13.9.4" @@ -4323,15 +4242,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859", size = 34166 }, ] -[[package]] -name = "websocket-client" -version = "1.8.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e6/30/fba0d96b4b5fbf5948ed3f4681f7da2f9f64512e1d303f94b4cc174c24a5/websocket_client-1.8.0.tar.gz", hash = "sha256:3239df9f44da632f96012472805d40a23281a991027ce11d2f45a6f24ac4c3da", size = 54648 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5a/84/44687a29792a70e111c5c477230a72c4b957d88d16141199bf9acb7537a3/websocket_client-1.8.0-py3-none-any.whl", hash = "sha256:17b44cc997f5c498e809b22cdf2d9c7a9e71c02c8cc2b6c56e7c2d1239bfa526", size = 58826 }, -] - [[package]] name = "websockets" version = "15.0" From 1862de4be51fa3697d54525c65aebe9edc6c8514 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Wed, 21 May 2025 17:30:23 +0200 Subject: [PATCH 5/7] chore: clarify cache_ttl to be key_recheck_period (#2220) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? The cache_ttl config value is not in fact tied to the lifetime of any of the keys, it represents the time interval between for our key cache refresher. 
Signed-off-by: Sébastien Han --- docs/source/distributions/configuration.md | 2 +- llama_stack/distribution/server/auth_providers.py | 6 +++--- tests/unit/server/test_auth.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/distributions/configuration.md b/docs/source/distributions/configuration.md index 77b52a621..de99b6576 100644 --- a/docs/source/distributions/configuration.md +++ b/docs/source/distributions/configuration.md @@ -183,7 +183,7 @@ server: config: jwks: uri: "https://kubernetes.default.svc" - cache_ttl: 3600 + key_recheck_period: 3600 tls_cafile: "/path/to/ca.crt" issuer: "https://kubernetes.default.svc" audience: "https://kubernetes.default.svc" diff --git a/llama_stack/distribution/server/auth_providers.py b/llama_stack/distribution/server/auth_providers.py index 39f258c3b..723a65b77 100644 --- a/llama_stack/distribution/server/auth_providers.py +++ b/llama_stack/distribution/server/auth_providers.py @@ -110,7 +110,7 @@ def get_attributes_from_claims(claims: dict[str, str], mapping: dict[str, str]) class OAuth2JWKSConfig(BaseModel): # The JWKS URI for collecting public keys uri: str - cache_ttl: int = 3600 + key_recheck_period: int = Field(default=3600, description="The period to recheck the JWKS URI for key updates") class OAuth2IntrospectionConfig(BaseModel): @@ -263,7 +263,7 @@ class OAuth2TokenAuthProvider(AuthProvider): """ Refresh the JWKS cache. - This is a simple cache that expires after a certain amount of time (defined by `cache_ttl`). + This is a simple cache that expires after a certain amount of time (defined by `key_recheck_period`). If the cache is expired, we refresh the JWKS from the JWKS URI. 
Notes: for Kubernetes which doesn't fully implement the OIDC protocol: @@ -273,7 +273,7 @@ class OAuth2TokenAuthProvider(AuthProvider): async with self._jwks_lock: if self.config.jwks is None: raise ValueError("JWKS is not configured") - if time.time() - self._jwks_at > self.config.jwks.cache_ttl: + if time.time() - self._jwks_at > self.config.jwks.key_recheck_period: verify = self.config.tls_cafile.as_posix() if self.config.tls_cafile else self.config.verify_tls async with httpx.AsyncClient(verify=verify) as client: res = await client.get(self.config.jwks.uri, timeout=5) diff --git a/tests/unit/server/test_auth.py b/tests/unit/server/test_auth.py index 94c486f18..408acb88a 100644 --- a/tests/unit/server/test_auth.py +++ b/tests/unit/server/test_auth.py @@ -293,7 +293,7 @@ def oauth2_app(): config={ "jwks": { "uri": "http://mock-authz-service/token/introspect", - "cache_ttl": "3600", + "key_recheck_period": "3600", }, "audience": "llama-stack", }, From 6a62e783b905e57c15be351ade856c33752c0dd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Wed, 21 May 2025 17:31:14 +0200 Subject: [PATCH 6/7] chore: refactor workflow writting (#2225) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? Use a composite action to avoid similar steps repetitions and centralization of the defaults. 
Signed-off-by: Sébastien Han --- .github/actions/setup-runner/action.yml | 22 ++++++ .github/workflows/integration-auth-tests.yml | 12 +--- .github/workflows/integration-tests.yml | 18 ++--- .github/workflows/providers-build.yml | 69 +++---------------- .github/workflows/test-external-providers.yml | 12 +--- .github/workflows/unit-tests.yml | 14 ++-- .github/workflows/update-readthedocs.yml | 12 +--- 7 files changed, 45 insertions(+), 114 deletions(-) create mode 100644 .github/actions/setup-runner/action.yml diff --git a/.github/actions/setup-runner/action.yml b/.github/actions/setup-runner/action.yml new file mode 100644 index 000000000..972dcbdae --- /dev/null +++ b/.github/actions/setup-runner/action.yml @@ -0,0 +1,22 @@ +name: Setup runner +description: Prepare a runner for the tests (install uv, python, project dependencies, etc.) +runs: + using: "composite" + steps: + - name: Install uv + uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1 + with: + python-version: "3.10" + activate-environment: true + version: 0.7.6 + + - name: Install dependencies + shell: bash + run: | + uv sync --all-extras + uv pip install ollama faiss-cpu + # always test against the latest version of the client + # TODO: this is not necessarily a good idea. we need to test against both published and latest + # to find out backwards compatibility issues. + uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main + uv pip install -e . 
diff --git a/.github/workflows/integration-auth-tests.yml b/.github/workflows/integration-auth-tests.yml index 994bd1dec..25f696c9e 100644 --- a/.github/workflows/integration-auth-tests.yml +++ b/.github/workflows/integration-auth-tests.yml @@ -30,16 +30,11 @@ jobs: - name: Checkout repository uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Install uv - uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1 - with: - python-version: "3.10" - activate-environment: true + - name: Install dependencies + uses: ./.github/actions/setup-runner - - name: Set Up Environment and Install Dependencies + - name: Build Llama Stack run: | - uv sync --extra dev --extra test - uv pip install -e . llama stack build --template ollama --image-type venv - name: Install minikube @@ -109,7 +104,6 @@ jobs: yq eval '.server.auth.config.jwks = {"uri": "${{ env.KUBERNETES_API_SERVER_URL }}"}' -i $run_dir/run.yaml cat $run_dir/run.yaml - source .venv/bin/activate nohup uv run llama stack run $run_dir/run.yaml --image-type venv > server.log 2>&1 & - name: Wait for Llama Stack server to be ready diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index da41e2185..2414522a7 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -32,24 +32,14 @@ jobs: - name: Checkout repository uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Install uv - uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1 - with: - python-version: "3.10" - activate-environment: true + - name: Install dependencies + uses: ./.github/actions/setup-runner - name: Setup ollama uses: ./.github/actions/setup-ollama - - name: Set Up Environment and Install Dependencies + - name: Build Llama Stack run: | - uv sync --extra dev --extra test - uv pip install ollama faiss-cpu - # always test against the latest version of the client - # TODO: 
this is not necessarily a good idea. we need to test against both published and latest - # to find out backwards compatibility issues. - uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main - uv pip install -e . llama stack build --template ollama --image-type venv - name: Start Llama Stack server in background @@ -57,7 +47,6 @@ jobs: env: INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" run: | - source .venv/bin/activate LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv & - name: Wait for Llama Stack server to be ready @@ -85,6 +74,7 @@ jobs: echo "Ollama health check failed" exit 1 fi + - name: Check Storage and Memory Available Before Tests if: ${{ always() }} run: | diff --git a/.github/workflows/providers-build.yml b/.github/workflows/providers-build.yml index 3c1682833..cf53459b9 100644 --- a/.github/workflows/providers-build.yml +++ b/.github/workflows/providers-build.yml @@ -50,21 +50,8 @@ jobs: - name: Checkout repository uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 - with: - python-version: '3.10' - - - name: Install uv - uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1 - with: - python-version: "3.10" - - - name: Install LlamaStack - run: | - uv venv - source .venv/bin/activate - uv pip install -e . 
+ - name: Install dependencies + uses: ./.github/actions/setup-runner - name: Print build dependencies run: | @@ -79,7 +66,6 @@ jobs: - name: Print dependencies in the image if: matrix.image-type == 'venv' run: | - source test/bin/activate uv pip list build-single-provider: @@ -88,21 +74,8 @@ jobs: - name: Checkout repository uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 - with: - python-version: '3.10' - - - name: Install uv - uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1 - with: - python-version: "3.10" - - - name: Install LlamaStack - run: | - uv venv - source .venv/bin/activate - uv pip install -e . + - name: Install dependencies + uses: ./.github/actions/setup-runner - name: Build a single provider run: | @@ -114,21 +87,8 @@ jobs: - name: Checkout repository uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 - with: - python-version: '3.10' - - - name: Install uv - uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1 - with: - python-version: "3.10" - - - name: Install LlamaStack - run: | - uv venv - source .venv/bin/activate - uv pip install -e . + - name: Install dependencies + uses: ./.github/actions/setup-runner - name: Build a single provider run: | @@ -152,21 +112,8 @@ jobs: - name: Checkout repository uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 - with: - python-version: '3.10' - - - name: Install uv - uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1 - with: - python-version: "3.10" - - - name: Install LlamaStack - run: | - uv venv - source .venv/bin/activate - uv pip install -e . 
+ - name: Install dependencies + uses: ./.github/actions/setup-runner - name: Pin template to UBI9 base run: | diff --git a/.github/workflows/test-external-providers.yml b/.github/workflows/test-external-providers.yml index 2e18fc5eb..06ab7cf3c 100644 --- a/.github/workflows/test-external-providers.yml +++ b/.github/workflows/test-external-providers.yml @@ -25,15 +25,8 @@ jobs: - name: Checkout repository uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Install uv - uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1 - with: - python-version: "3.10" - - - name: Set Up Environment and Install Dependencies - run: | - uv sync --extra dev --extra test - uv pip install -e . + - name: Install dependencies + uses: ./.github/actions/setup-runner - name: Apply image type to config file run: | @@ -59,7 +52,6 @@ jobs: env: INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" run: | - source ci-test/bin/activate uv run pip list nohup uv run --active llama stack run tests/external-provider/llama-stack-provider-ollama/run.yaml --image-type ${{ matrix.image-type }} > server.log 2>&1 & diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index d2dd34e05..fc0459f0f 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -30,17 +30,11 @@ jobs: - "3.12" - "3.13" steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Checkout repository + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python }} - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 - with: - python-version: ${{ matrix.python }} - - - uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1 - with: - python-version: ${{ matrix.python }} - enable-cache: false + - name: Install dependencies + uses: ./.github/actions/setup-runner - name: Run unit tests run: | diff 
--git a/.github/workflows/update-readthedocs.yml b/.github/workflows/update-readthedocs.yml index 04e23bca9..981332a77 100644 --- a/.github/workflows/update-readthedocs.yml +++ b/.github/workflows/update-readthedocs.yml @@ -37,16 +37,8 @@ jobs: - name: Checkout repository uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 - with: - python-version: '3.11' - - - name: Install the latest version of uv - uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1 - - - name: Sync with uv - run: uv sync --extra docs + - name: Install dependencies + uses: ./.github/actions/setup-runner - name: Build HTML run: | From 85b5f3172b0cf3eb7febcd20cd4df4a60c3c39ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Wed, 21 May 2025 17:35:27 +0200 Subject: [PATCH 7/7] docs: misc cleanup (#2223) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? * remove requirements.txt to use pyproject.toml as the source of truth * update relevant docs Signed-off-by: Sébastien Han --- CONTRIBUTING.md | 7 +--- docs/readme.md | 6 +-- docs/requirements.txt | 16 -------- docs/source/conf.py | 8 ---- pyproject.toml | 3 ++ uv.lock | 88 +++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 96 insertions(+), 32 deletions(-) delete mode 100644 docs/requirements.txt diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d7c3e3e2f..8f71a6ba1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -167,14 +167,11 @@ If you have made changes to a provider's configuration in any form (introducing If you are making changes to the documentation at [https://llama-stack.readthedocs.io/en/latest/](https://llama-stack.readthedocs.io/en/latest/), you can use the following command to build the documentation and preview your changes. 
You will need [Sphinx](https://www.sphinx-doc.org/en/master/) and the readthedocs theme. ```bash -cd docs -uv sync --extra docs - # This rebuilds the documentation pages. -uv run make html +uv run --with ".[docs]" make -C docs/ html # This will start a local server (usually at http://127.0.0.1:8000) that automatically rebuilds and refreshes when you make changes to the documentation. -uv run sphinx-autobuild source build/html --write-all +uv run --with ".[docs]" sphinx-autobuild docs/source docs/build/html --write-all ``` ### Update API Documentation diff --git a/docs/readme.md b/docs/readme.md index b88a4738d..d84dbe6eb 100644 --- a/docs/readme.md +++ b/docs/readme.md @@ -3,10 +3,10 @@ Here's a collection of comprehensive guides, examples, and resources for building AI applications with Llama Stack. For the complete documentation, visit our [ReadTheDocs page](https://llama-stack.readthedocs.io/en/latest/index.html). ## Render locally + +From the llama-stack root directory, run the following command to render the docs locally: ```bash -pip install -r requirements.txt -cd docs -python -m sphinx_autobuild source _build +uv run --with ".[docs]" sphinx-autobuild docs/source docs/build/html --write-all ``` You can open up the docs in your browser at http://localhost:8000 diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 6cd45c33b..000000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,16 +0,0 @@ -linkify -myst-parser --e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme -sphinx==8.1.3 -sphinx-copybutton -sphinx-design -sphinx-pdj-theme -sphinx-rtd-theme>=1.0.0 -sphinx-tabs -sphinx_autobuild -sphinx_rtd_dark_mode -sphinxcontrib-mermaid -sphinxcontrib-openapi -sphinxcontrib-redoc -sphinxcontrib-video -tomli diff --git a/docs/source/conf.py b/docs/source/conf.py index 501a923dd..43e8dbdd5 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -53,14 +53,6 @@ myst_enable_extensions = 
["colon_fence"] html_theme = "sphinx_rtd_theme" html_use_relative_paths = True - -# html_theme = "sphinx_pdj_theme" -# html_theme_path = [sphinx_pdj_theme.get_html_theme_path()] - -# html_theme = "pytorch_sphinx_theme" -# html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()] - - templates_path = ["_templates"] exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] diff --git a/pyproject.toml b/pyproject.toml index 8b922bafb..ce44479ca 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -93,6 +93,7 @@ test = [ docs = [ "sphinx-autobuild", "myst-parser", + "sphinx", "sphinx-rtd-theme", "sphinx_rtd_dark_mode", "sphinx-copybutton", @@ -102,6 +103,8 @@ docs = [ "sphinxcontrib.video", "sphinxcontrib.mermaid", "tomli", + "linkify", + "sphinxcontrib.openapi", ] codegen = ["rich", "pydantic", "jinja2>=3.1.6"] ui = [ diff --git a/uv.lock b/uv.lock index a987ddc9e..6d091193b 100644 --- a/uv.lock +++ b/uv.lock @@ -628,6 +628,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186", size = 9073 }, ] +[[package]] +name = "deepmerge" +version = "2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a8/3a/b0ba594708f1ad0bc735884b3ad854d3ca3bdc1d741e56e40bbda6263499/deepmerge-2.0.tar.gz", hash = "sha256:5c3d86081fbebd04dd5de03626a0607b809a98fb6ccba5770b62466fe940ff20", size = 19890 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2d/82/e5d2c1c67d19841e9edc74954c827444ae826978499bde3dfc1d007c8c11/deepmerge-2.0-py3-none-any.whl", hash = "sha256:6de9ce507115cff0bed95ff0ce9ecc31088ef50cbdf09bc90a09349a318b3d00", size = 13475 }, +] + [[package]] name = "deprecated" version = "1.2.18" @@ -1384,6 +1393,12 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/dc/1e/408fd10217eac0e43aea0604be22b4851a09e03d761d44d4ea12089dd70e/levenshtein-0.27.1-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:7987ef006a3cf56a4532bd4c90c2d3b7b4ca9ad3bf8ae1ee5713c4a3bdfda913", size = 98045 }, ] +[[package]] +name = "linkify" +version = "1.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/65/c6/246100fa3967074d9725b3716913bd495823547bde5047050d4c3462f994/linkify-1.4.tar.gz", hash = "sha256:9ba276ba179525f7262820d90f009604e51cd4f1466c1112b882ef7eda243d5e", size = 1749 } + [[package]] name = "llama-stack" version = "0.2.7" @@ -1434,7 +1449,9 @@ dev = [ { name = "uvicorn" }, ] docs = [ + { name = "linkify" }, { name = "myst-parser" }, + { name = "sphinx" }, { name = "sphinx-autobuild" }, { name = "sphinx-copybutton" }, { name = "sphinx-design" }, @@ -1442,6 +1459,7 @@ docs = [ { name = "sphinx-rtd-theme" }, { name = "sphinx-tabs" }, { name = "sphinxcontrib-mermaid" }, + { name = "sphinxcontrib-openapi" }, { name = "sphinxcontrib-redoc" }, { name = "sphinxcontrib-video" }, { name = "tomli" }, @@ -1500,6 +1518,7 @@ requires-dist = [ { name = "jinja2", specifier = ">=3.1.6" }, { name = "jinja2", marker = "extra == 'codegen'", specifier = ">=3.1.6" }, { name = "jsonschema" }, + { name = "linkify", marker = "extra == 'docs'" }, { name = "llama-stack-client", specifier = ">=0.2.7" }, { name = "llama-stack-client", marker = "extra == 'ui'", specifier = ">=0.2.7" }, { name = "mcp", marker = "extra == 'test'" }, @@ -1534,6 +1553,7 @@ requires-dist = [ { name = "ruamel-yaml", marker = "extra == 'dev'" }, { name = "ruff", marker = "extra == 'dev'" }, { name = "setuptools" }, + { name = "sphinx", marker = "extra == 'docs'" }, { name = "sphinx-autobuild", marker = "extra == 'docs'" }, { name = "sphinx-copybutton", marker = "extra == 'docs'" }, { name = "sphinx-design", marker = "extra == 'docs'" }, @@ -1541,6 +1561,7 @@ requires-dist = [ { name = 
"sphinx-rtd-theme", marker = "extra == 'docs'" }, { name = "sphinx-tabs", marker = "extra == 'docs'" }, { name = "sphinxcontrib-mermaid", marker = "extra == 'docs'" }, + { name = "sphinxcontrib-openapi", marker = "extra == 'docs'" }, { name = "sphinxcontrib-redoc", marker = "extra == 'docs'" }, { name = "sphinxcontrib-video", marker = "extra == 'docs'" }, { name = "sqlite-vec", marker = "extra == 'unit'" }, @@ -1786,6 +1807,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979 }, ] +[[package]] +name = "mistune" +version = "3.1.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c4/79/bda47f7dd7c3c55770478d6d02c9960c430b0cf1773b72366ff89126ea31/mistune-3.1.3.tar.gz", hash = "sha256:a7035c21782b2becb6be62f8f25d3df81ccb4d6fa477a6525b15af06539f02a0", size = 94347 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/01/4d/23c4e4f09da849e127e9f123241946c23c1e30f45a88366879e064211815/mistune-3.1.3-py3-none-any.whl", hash = "sha256:1a32314113cff28aa6432e99e522677c8587fd83e3d51c29b82a52409c842bd9", size = 53410 }, +] + [[package]] name = "mpmath" version = "1.3.0" @@ -2228,6 +2261,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523", size = 63772 }, ] +[[package]] +name = "picobox" +version = "4.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0f/b1/830714dd6778c1cb45826722b4e9bd21c94b33cca5df9ef2cc0b80c81b25/picobox-4.0.0.tar.gz", hash = 
"sha256:114da1b5606b2f615e8b0eb68d04198ad9de75af5adbcf5b36fe4f664ab927b6", size = 22666 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2d/c6/fd64ffd75d47c4fcf6c65808cc5c5c75e5d4357c197d3741ee1339e91257/picobox-4.0.0-py3-none-any.whl", hash = "sha256:4c27eb689fe45dabd9e64c382e04418147d0b746d155b4e80057dbb7ff82027e", size = 11641 }, +] + [[package]] name = "pillow" version = "11.1.0" @@ -3516,6 +3558,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c6/43/65c0acbd8cc6f50195a3a1fc195c404988b15c67090e73c7a41a9f57d6bd/sphinx_design-0.6.1-py3-none-any.whl", hash = "sha256:b11f37db1a802a183d61b159d9a202314d4d2fe29c163437001324fe2f19549c", size = 2215338 }, ] +[[package]] +name = "sphinx-mdinclude" +version = "0.6.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "docutils" }, + { name = "mistune" }, + { name = "pygments" }, + { name = "sphinx" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b6/a7/c9a7888bb2187fdb06955d71e75f6f266b7e179b356ac76138d160a5b7eb/sphinx_mdinclude-0.6.2.tar.gz", hash = "sha256:447462e82cb8be61404a2204227f920769eb923d2f57608e3325f3bb88286b4c", size = 65257 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/42/3d/6b41fe1637cd53c4b10d56e0e6f396546f837973dabf9c4b2a1de44620ac/sphinx_mdinclude-0.6.2-py3-none-any.whl", hash = "sha256:648e78edb067c0e4bffc22943278d49d54a0714494743592032fa3ad82a86984", size = 16911 }, +] + [[package]] name = "sphinx-rtd-dark-mode" version = "1.3.0" @@ -3583,6 +3640,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0a/7b/18a8c0bcec9182c05a0b3ec2a776bba4ead82750a55ff798e8d406dae604/sphinxcontrib_htmlhelp-2.1.0-py3-none-any.whl", hash = "sha256:166759820b47002d22914d64a075ce08f4c46818e17cfc9470a9786b759b19f8", size = 98705 }, ] +[[package]] +name = "sphinxcontrib-httpdomain" +version = "1.8.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "six" }, + { name = "sphinx" }, +] +sdist = 
{ url = "https://files.pythonhosted.org/packages/be/ef/82d3cfafb7febce4f7df8dcf3cde9d072350b41066e05a4f559b4e9105d0/sphinxcontrib-httpdomain-1.8.1.tar.gz", hash = "sha256:6c2dfe6ca282d75f66df333869bb0ce7331c01b475db6809ff9d107b7cdfe04b", size = 19266 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/02/49/aad47b8cf27a0d7703f1311aad8c368bb22866ddee1a2d2cd3f69bc45e0c/sphinxcontrib_httpdomain-1.8.1-py2.py3-none-any.whl", hash = "sha256:21eefe1270e4d9de8d717cc89ee92cc4871b8736774393bafc5e38a6bb77b1d5", size = 25513 }, +] + [[package]] name = "sphinxcontrib-jquery" version = "4.1" @@ -3617,6 +3687,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cd/c8/784b9ac6ea08aa594c1a4becbd0dbe77186785362e31fd633b8c6ae0197a/sphinxcontrib_mermaid-1.0.0-py3-none-any.whl", hash = "sha256:60b72710ea02087f212028feb09711225fbc2e343a10d34822fe787510e1caa3", size = 9597 }, ] +[[package]] +name = "sphinxcontrib-openapi" +version = "0.8.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "deepmerge" }, + { name = "jsonschema" }, + { name = "picobox" }, + { name = "pyyaml" }, + { name = "sphinx" }, + { name = "sphinx-mdinclude" }, + { name = "sphinxcontrib-httpdomain" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c0/a7/66a5c9aba7dbbb0c2b050f60e71402818cbf5f127ace13ed971029cc745e/sphinxcontrib-openapi-0.8.4.tar.gz", hash = "sha256:df883808a5b5e4b4113ad697185c43a3f42df3dce70453af78ba7076907e9a20", size = 71848 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d5/c3/ee00486f38d78309a60ee0d6031b2545b22ac5f0007d841dd174abc68774/sphinxcontrib_openapi-0.8.4-py3-none-any.whl", hash = "sha256:50911c18d452d9390ee3a384ef8dc8bde6135f542ba55691f81e1fbc0b71014e", size = 34510 }, +] + [[package]] name = "sphinxcontrib-qthelp" version = "2.0.0"