Merge branch 'main' into feat/sambanova-safety

Jorge Piedrahita Ortiz, 2025-05-21 11:32:42 -05:00, committed by GitHub
commit e12df4293b (GPG key ID: B5690EEEBB952194; no known key found for this signature in database)
26 changed files with 1094 additions and 494 deletions

.github/actions/setup-runner/action.yml (new file, 22 additions)

@@ -0,0 +1,22 @@
name: Setup runner
description: Prepare a runner for the tests (install uv, python, project dependencies, etc.)
runs:
using: "composite"
steps:
- name: Install uv
uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
with:
python-version: "3.10"
activate-environment: true
version: 0.7.6
- name: Install dependencies
shell: bash
run: |
uv sync --all-extras
uv pip install ollama faiss-cpu
# always test against the latest version of the client
# TODO: this is not necessarily a good idea. we need to test against both published and latest
# to find out backwards compatibility issues.
uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
uv pip install -e .


@@ -23,23 +23,18 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
auth-provider: [kubernetes]
auth-provider: [oauth2_token]
fail-fast: false # we want to run all tests regardless of failure
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install uv
uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
with:
python-version: "3.10"
activate-environment: true
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Set Up Environment and Install Dependencies
- name: Build Llama Stack
run: |
uv sync --extra dev --extra test
uv pip install -e .
llama stack build --template ollama --image-type venv
- name: Install minikube
@@ -47,29 +42,53 @@ jobs:
uses: medyagh/setup-minikube@cea33675329b799adccc9526aa5daccc26cd5052 # v0.0.19
- name: Start minikube
if: ${{ matrix.auth-provider == 'kubernetes' }}
if: ${{ matrix.auth-provider == 'oauth2_token' }}
run: |
minikube start
kubectl get pods -A
- name: Configure Kube Auth
if: ${{ matrix.auth-provider == 'kubernetes' }}
if: ${{ matrix.auth-provider == 'oauth2_token' }}
run: |
kubectl create namespace llama-stack
kubectl create serviceaccount llama-stack-auth -n llama-stack
kubectl create rolebinding llama-stack-auth-rolebinding --clusterrole=admin --serviceaccount=llama-stack:llama-stack-auth -n llama-stack
kubectl create token llama-stack-auth -n llama-stack > llama-stack-auth-token
cat <<EOF | kubectl apply -f -
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: allow-anonymous-openid
rules:
- nonResourceURLs: ["/openid/v1/jwks"]
verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: allow-anonymous-openid
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: allow-anonymous-openid
subjects:
- kind: User
name: system:anonymous
apiGroup: rbac.authorization.k8s.io
EOF
- name: Set Kubernetes Config
if: ${{ matrix.auth-provider == 'kubernetes' }}
if: ${{ matrix.auth-provider == 'oauth2_token' }}
run: |
echo "KUBERNETES_API_SERVER_URL=$(kubectl config view --minify -o jsonpath='{.clusters[0].cluster.server}')" >> $GITHUB_ENV
echo "KUBERNETES_API_SERVER_URL=$(kubectl get --raw /.well-known/openid-configuration| jq -r .jwks_uri)" >> $GITHUB_ENV
echo "KUBERNETES_CA_CERT_PATH=$(kubectl config view --minify -o jsonpath='{.clusters[0].cluster.certificate-authority}')" >> $GITHUB_ENV
echo "KUBERNETES_ISSUER=$(kubectl get --raw /.well-known/openid-configuration| jq -r .issuer)" >> $GITHUB_ENV
echo "KUBERNETES_AUDIENCE=$(kubectl create token default --duration=1h | cut -d. -f2 | base64 -d | jq -r '.aud[0]')" >> $GITHUB_ENV
- name: Set Kube Auth Config and run server
env:
INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
if: ${{ matrix.auth-provider == 'kubernetes' }}
if: ${{ matrix.auth-provider == 'oauth2_token' }}
run: |
run_dir=$(mktemp -d)
cat <<'EOF' > $run_dir/run.yaml
@@ -81,10 +100,10 @@ jobs:
port: 8321
EOF
yq eval '.server.auth = {"provider_type": "${{ matrix.auth-provider }}"}' -i $run_dir/run.yaml
yq eval '.server.auth.config = {"api_server_url": "${{ env.KUBERNETES_API_SERVER_URL }}", "ca_cert_path": "${{ env.KUBERNETES_CA_CERT_PATH }}"}' -i $run_dir/run.yaml
yq eval '.server.auth.config = {"tls_cafile": "${{ env.KUBERNETES_CA_CERT_PATH }}", "issuer": "${{ env.KUBERNETES_ISSUER }}", "audience": "${{ env.KUBERNETES_AUDIENCE }}"}' -i $run_dir/run.yaml
yq eval '.server.auth.config.jwks = {"uri": "${{ env.KUBERNETES_API_SERVER_URL }}"}' -i $run_dir/run.yaml
cat $run_dir/run.yaml
source .venv/bin/activate
nohup uv run llama stack run $run_dir/run.yaml --image-type venv > server.log 2>&1 &
- name: Wait for Llama Stack server to be ready


@@ -32,24 +32,14 @@ jobs:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install uv
uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
with:
python-version: "3.10"
activate-environment: true
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Setup ollama
uses: ./.github/actions/setup-ollama
- name: Set Up Environment and Install Dependencies
- name: Build Llama Stack
run: |
uv sync --extra dev --extra test
uv pip install ollama faiss-cpu
# always test against the latest version of the client
# TODO: this is not necessarily a good idea. we need to test against both published and latest
# to find out backwards compatibility issues.
uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
uv pip install -e .
llama stack build --template ollama --image-type venv
- name: Start Llama Stack server in background
@@ -57,7 +47,6 @@ jobs:
env:
INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
run: |
source .venv/bin/activate
LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv &
- name: Wait for Llama Stack server to be ready
@@ -85,6 +74,7 @@ jobs:
echo "Ollama health check failed"
exit 1
fi
- name: Check Storage and Memory Available Before Tests
if: ${{ always() }}
run: |


@@ -50,21 +50,8 @@ jobs:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.10'
- name: Install uv
uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
with:
python-version: "3.10"
- name: Install LlamaStack
run: |
uv venv
source .venv/bin/activate
uv pip install -e .
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Print build dependencies
run: |
@@ -79,7 +66,6 @@ jobs:
- name: Print dependencies in the image
if: matrix.image-type == 'venv'
run: |
source test/bin/activate
uv pip list
build-single-provider:
@@ -88,21 +74,8 @@ jobs:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.10'
- name: Install uv
uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
with:
python-version: "3.10"
- name: Install LlamaStack
run: |
uv venv
source .venv/bin/activate
uv pip install -e .
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Build a single provider
run: |
@@ -114,21 +87,8 @@ jobs:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.10'
- name: Install uv
uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
with:
python-version: "3.10"
- name: Install LlamaStack
run: |
uv venv
source .venv/bin/activate
uv pip install -e .
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Build a single provider
run: |
@@ -152,21 +112,8 @@ jobs:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.10'
- name: Install uv
uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
with:
python-version: "3.10"
- name: Install LlamaStack
run: |
uv venv
source .venv/bin/activate
uv pip install -e .
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Pin template to UBI9 base
run: |


@@ -25,15 +25,8 @@ jobs:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install uv
uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
with:
python-version: "3.10"
- name: Set Up Environment and Install Dependencies
run: |
uv sync --extra dev --extra test
uv pip install -e .
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Apply image type to config file
run: |
@@ -59,7 +52,6 @@ jobs:
env:
INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
run: |
source ci-test/bin/activate
uv run pip list
nohup uv run --active llama stack run tests/external-provider/llama-stack-provider-ollama/run.yaml --image-type ${{ matrix.image-type }} > server.log 2>&1 &


@@ -30,17 +30,11 @@ jobs:
- "3.12"
- "3.13"
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python }}
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: ${{ matrix.python }}
- uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
with:
python-version: ${{ matrix.python }}
enable-cache: false
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Run unit tests
run: |


@@ -37,16 +37,8 @@ jobs:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.11'
- name: Install the latest version of uv
uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
- name: Sync with uv
run: uv sync --extra docs
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Build HTML
run: |


@@ -167,14 +167,11 @@ If you have made changes to a provider's configuration in any form (introducing
If you are making changes to the documentation at [https://llama-stack.readthedocs.io/en/latest/](https://llama-stack.readthedocs.io/en/latest/), you can use the following command to build the documentation and preview your changes. You will need [Sphinx](https://www.sphinx-doc.org/en/master/) and the readthedocs theme.
```bash
cd docs
uv sync --extra docs
# This rebuilds the documentation pages.
uv run make html
uv run --with ".[docs]" make -C docs/ html
# This will start a local server (usually at http://127.0.0.1:8000) that automatically rebuilds and refreshes when you make changes to the documentation.
uv run sphinx-autobuild source build/html --write-all
uv run --with ".[docs]" sphinx-autobuild docs/source docs/build/html --write-all
```
### Update API Documentation


@@ -3,10 +3,10 @@
Here's a collection of comprehensive guides, examples, and resources for building AI applications with Llama Stack. For the complete documentation, visit our [ReadTheDocs page](https://llama-stack.readthedocs.io/en/latest/index.html).
## Render locally
From the llama-stack root directory, run the following command to render the docs locally:
```bash
pip install -r requirements.txt
cd docs
python -m sphinx_autobuild source _build
uv run --with ".[docs]" sphinx-autobuild docs/source docs/build/html --write-all
```
You can open up the docs in your browser at http://localhost:8000


@@ -1,16 +0,0 @@
linkify
myst-parser
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
sphinx==8.1.3
sphinx-copybutton
sphinx-design
sphinx-pdj-theme
sphinx-rtd-theme>=1.0.0
sphinx-tabs
sphinx_autobuild
sphinx_rtd_dark_mode
sphinxcontrib-mermaid
sphinxcontrib-openapi
sphinxcontrib-redoc
sphinxcontrib-video
tomli


@@ -53,14 +53,6 @@ myst_enable_extensions = ["colon_fence"]
html_theme = "sphinx_rtd_theme"
html_use_relative_paths = True
# html_theme = "sphinx_pdj_theme"
# html_theme_path = [sphinx_pdj_theme.get_html_theme_path()]
# html_theme = "pytorch_sphinx_theme"
# html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()]
templates_path = ["_templates"]
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]


@@ -338,6 +338,48 @@ INFO: Application startup complete.
INFO: Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit)
INFO: 2401:db00:35c:2d2b:face:0:c9:0:54678 - "GET /models/list HTTP/1.1" 200 OK
```
### Listing Distributions
Using the list command, you can view all existing Llama Stack distributions, including stacks built from templates, from scratch, or using custom configuration files.
```
llama stack list -h
usage: llama stack list [-h]
list the build stacks
options:
-h, --help show this help message and exit
```
Example Usage
```
llama stack list
```
### Removing a Distribution
Use the remove command to delete a distribution you've previously built.
```
llama stack rm -h
usage: llama stack rm [-h] [--all] [name]
Remove the build stack
positional arguments:
name Name of the stack to delete (default: None)
options:
-h, --help show this help message and exit
--all, -a Delete all stacks (use with caution) (default: False)
```
Example
```
llama stack rm llamastack-test
```
To keep your environment organized and avoid clutter, consider using `llama stack list` to review old or unused distributions and `llama stack rm <name>` to delete them when they're no longer needed.
### Troubleshooting


@@ -118,11 +118,6 @@ server:
port: 8321 # Port to listen on (default: 8321)
tls_certfile: "/path/to/cert.pem" # Optional: Path to TLS certificate for HTTPS
tls_keyfile: "/path/to/key.pem" # Optional: Path to TLS key for HTTPS
auth: # Optional: Authentication configuration
provider_type: "kubernetes" # Type of auth provider
config: # Provider-specific configuration
api_server_url: "https://kubernetes.default.svc"
ca_cert_path: "/path/to/ca.crt" # Optional: Path to CA certificate
```
### Authentication Configuration
@@ -135,7 +130,7 @@ Authorization: Bearer <token>
The server supports multiple authentication providers:
#### Kubernetes Provider
#### OAuth 2.0/OpenID Connect Provider with Kubernetes
The Kubernetes cluster must be configured to use a service account for authentication.
@@ -146,14 +141,67 @@ kubectl create rolebinding llama-stack-auth-rolebinding --clusterrole=admin --se
kubectl create token llama-stack-auth -n llama-stack > llama-stack-auth-token
```
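Clients then send this token on every request. A minimal sketch of such a call, assuming a local server on the default port 8321 and reusing the `/models/list` route from the server log excerpt above (both illustrative):
```python
# Minimal sketch: call the server with the service-account token created above.
# The address and route are assumptions, not taken from this commit.
import httpx

with open("llama-stack-auth-token") as f:
    token = f.read().strip()

response = httpx.get(
    "http://localhost:8321/models/list",
    headers={"Authorization": f"Bearer {token}"},  # bearer token on every request
)
response.raise_for_status()
print(response.json())
```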
Validates tokens against the Kubernetes API server:
Make sure the `kube-apiserver` runs with `--anonymous-auth=true` to allow unauthenticated requests,
and that anonymous users are allowed to read the JWKS endpoint. If that is not already the case, you
can create a ClusterRole and ClusterRoleBinding that grant `system:anonymous` access to it:
```yaml
# allow-anonymous-openid.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: allow-anonymous-openid
rules:
- nonResourceURLs: ["/openid/v1/jwks"]
verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: allow-anonymous-openid
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: allow-anonymous-openid
subjects:
- kind: User
name: system:anonymous
apiGroup: rbac.authorization.k8s.io
```
And then apply the configuration:
```bash
kubectl apply -f allow-anonymous-openid.yaml
```
Validates tokens against the Kubernetes API server through the OIDC provider:
```yaml
server:
auth:
provider_type: "kubernetes"
provider_type: "oauth2_token"
config:
api_server_url: "https://kubernetes.default.svc" # URL of the Kubernetes API server
ca_cert_path: "/path/to/ca.crt" # Optional: Path to CA certificate
jwks:
uri: "https://kubernetes.default.svc"
key_recheck_period: 3600
tls_cafile: "/path/to/ca.crt"
issuer: "https://kubernetes.default.svc"
audience: "https://kubernetes.default.svc"
```
To find your cluster's audience, run:
```bash
kubectl create token default --duration=1h | cut -d. -f2 | base64 -d | jq .aud
```
For the issuer, you can use the OIDC provider's URL:
```bash
kubectl get --raw /.well-known/openid-configuration| jq .issuer
```
For the tls_cafile, you can use the CA certificate of the OIDC provider:
```bash
kubectl config view --minify -o jsonpath='{.clusters[0].cluster.certificate-authority}'
```
The provider extracts user information from the JWT token:
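As a rough sketch of what that extraction does, based on the default `claims_mapping` added in this PR (e.g. `sub` -> `roles`, `groups` -> `teams`); the helper below is illustrative, not the provider's exact code:
```python
# Illustrative mapping of JWT claims to access attributes.
# Assumption: string claims are split on whitespace, list claims are used as-is.
def attributes_from_claims(claims: dict, mapping: dict[str, str]) -> dict[str, list[str]]:
    attributes: dict[str, list[str]] = {}
    for claim_key, attribute_key in mapping.items():
        if claim_key not in claims:
            continue
        value = claims[claim_key]
        values = value.split() if isinstance(value, str) else list(value)
        attributes.setdefault(attribute_key, []).extend(values)
    return attributes

# attributes_from_claims({"sub": "alice", "groups": ["ml-team"]},
#                        {"sub": "roles", "groups": "teams"})
# -> {"roles": ["alice"], "teams": ["ml-team"]}
```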
@@ -208,6 +256,80 @@ And must respond with:
If no access attributes are returned, the token is used as a namespace.
### Quota Configuration
The `quota` section allows you to enable server-side request throttling for both
authenticated and anonymous clients. This is useful for preventing abuse, enforcing
fairness across tenants, and controlling infrastructure costs without requiring
client-side rate limiting or external proxies.
Quotas are disabled by default. When enabled, each client is tracked using either:
* Their authenticated `client_id` (derived from the Bearer token), or
* Their IP address (fallback for anonymous requests)
Quota state is stored in a SQLite-backed key-value store, and rate limits are applied
within a configurable time window (currently only `day` is supported).
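Under the hood, the `QuotaMiddleware` added later in this diff buckets requests into fixed windows and keeps one counter per client per window; roughly:
```python
# Sketch of the per-window counter key, mirroring the QuotaMiddleware in this PR.
import time

window_seconds = 86400  # "day", the only supported period
key_id = "client1"      # authenticated client_id, or the client IP when anonymous

current_window = int(time.time() // window_seconds)
key = f"quota:{key_id}:{current_window}"  # KV store value at this key is the request count
```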
#### Example
```yaml
server:
quota:
kvstore:
type: sqlite
db_path: ./quotas.db
anonymous_max_requests: 100
authenticated_max_requests: 1000
period: day
```
#### Configuration Options
| Field | Description |
| ---------------------------- | -------------------------------------------------------------------------- |
| `kvstore` | Required. Backend storage config for tracking request counts. |
| `kvstore.type` | Must be `"sqlite"` for now. Other backends may be supported in the future. |
| `kvstore.db_path` | File path to the SQLite database. |
| `anonymous_max_requests` | Max requests per period for unauthenticated clients. |
| `authenticated_max_requests` | Max requests per period for authenticated clients. |
| `period` | Time window for quota enforcement. Only `"day"` is supported. |
> Note: if `authenticated_max_requests` is set but no authentication provider is
configured, the server will fall back to applying `anonymous_max_requests` to all
clients.
#### Example with Authentication Enabled
```yaml
server:
port: 8321
auth:
provider_type: custom
config:
endpoint: https://auth.example.com/validate
quota:
kvstore:
type: sqlite
db_path: ./quotas.db
anonymous_max_requests: 100
authenticated_max_requests: 1000
period: day
```
If a client exceeds their limit, the server responds with:
```http
HTTP/1.1 429 Too Many Requests
Content-Type: application/json
{
"error": {
"message": "Quota exceeded"
}
}
```
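A client can watch for this status and back off; a minimal sketch (the endpoint is illustrative):
```python
# Illustrative client-side handling of the 429 quota response.
import httpx

resp = httpx.get("http://localhost:8321/models/list")  # hypothetical endpoint
if resp.status_code == 429:
    print("Rate limited:", resp.json()["error"]["message"])  # "Quota exceeded"
```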
## Extending to handle Safety
Configuring Safety can be a little involved so it is instructive to go through an example.


@@ -0,0 +1,56 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
from pathlib import Path
from llama_stack.cli.subcommand import Subcommand
from llama_stack.cli.table import print_table
class StackListBuilds(Subcommand):
"""List built stacks in .llama/distributions directory"""
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"list",
prog="llama stack list",
description="list the build stacks",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
self._add_arguments()
self.parser.set_defaults(func=self._list_stack_command)
def _get_distribution_dirs(self) -> dict[str, Path]:
"""Return a dictionary of distribution names and their paths"""
distributions = {}
dist_dir = Path.home() / ".llama" / "distributions"
if dist_dir.exists():
for stack_dir in dist_dir.iterdir():
if stack_dir.is_dir():
distributions[stack_dir.name] = stack_dir
return distributions
def _list_stack_command(self, args: argparse.Namespace) -> None:
distributions = self._get_distribution_dirs()
if not distributions:
print("No stacks found in ~/.llama/distributions")
return
headers = ["Stack Name", "Path"]
headers.extend(["Build Config", "Run Config"])
rows = []
for name, path in distributions.items():
row = [name, str(path)]
# Check for build and run config files
build_config = "Yes" if (path / f"{name}-build.yaml").exists() else "No"
run_config = "Yes" if (path / f"{name}-run.yaml").exists() else "No"
row.extend([build_config, run_config])
rows.append(row)
print_table(rows, headers, separate_rows=True)


@@ -0,0 +1,116 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
import shutil
import sys
from pathlib import Path
from termcolor import cprint
from llama_stack.cli.subcommand import Subcommand
from llama_stack.cli.table import print_table
class StackRemove(Subcommand):
"""Remove the build stack"""
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"rm",
prog="llama stack rm",
description="Remove the build stack",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
self._add_arguments()
self.parser.set_defaults(func=self._remove_stack_build_command)
def _add_arguments(self) -> None:
self.parser.add_argument(
"name",
type=str,
nargs="?",
help="Name of the stack to delete",
)
self.parser.add_argument(
"--all",
"-a",
action="store_true",
help="Delete all stacks (use with caution)",
)
def _get_distribution_dirs(self) -> dict[str, Path]:
"""Return a dictionary of distribution names and their paths"""
distributions = {}
dist_dir = Path.home() / ".llama" / "distributions"
if dist_dir.exists():
for stack_dir in dist_dir.iterdir():
if stack_dir.is_dir():
distributions[stack_dir.name] = stack_dir
return distributions
def _list_stacks(self) -> None:
"""Display available stacks in a table"""
distributions = self._get_distribution_dirs()
if not distributions:
print("No stacks found in ~/.llama/distributions")
return
headers = ["Stack Name", "Path"]
rows = [[name, str(path)] for name, path in distributions.items()]
print_table(rows, headers, separate_rows=True)
def _remove_stack_build_command(self, args: argparse.Namespace) -> None:
distributions = self._get_distribution_dirs()
if args.all:
confirm = input("Are you sure you want to delete ALL stacks? [yes-i-really-want/N] ").lower()
if confirm != "yes-i-really-want":
print("Deletion cancelled.")
return
for name, path in distributions.items():
try:
shutil.rmtree(path)
print(f"Deleted stack: {name}")
except Exception as e:
cprint(
f"Failed to delete stack {name}: {e}",
color="red",
)
sys.exit(2)
if not args.name:
self._list_stacks()
if not args.name:
return
if args.name not in distributions:
self._list_stacks()
cprint(
f"Stack not found: {args.name}",
color="red",
)
return
stack_path = distributions[args.name]
confirm = input(f"Are you sure you want to delete stack '{args.name}'? [y/N] ").lower()
if confirm != "y":
print("Deletion cancelled.")
return
try:
shutil.rmtree(stack_path)
print(f"Successfully deleted stack: {args.name}")
except Exception as e:
cprint(
f"Failed to delete stack {args.name}: {e}",
color="red",
)
sys.exit(2)


@@ -7,12 +7,14 @@
import argparse
from importlib.metadata import version
from llama_stack.cli.stack.list_stacks import StackListBuilds
from llama_stack.cli.stack.utils import print_subcommand_description
from llama_stack.cli.subcommand import Subcommand
from .build import StackBuild
from .list_apis import StackListApis
from .list_providers import StackListProviders
from .remove import StackRemove
from .run import StackRun
@@ -41,5 +43,6 @@ class StackParser(Subcommand):
StackListApis.create(subparsers)
StackListProviders.create(subparsers)
StackRun.create(subparsers)
StackRemove.create(subparsers)
StackListBuilds.create(subparsers)
print_subcommand_description(self.parser, subparsers)


@@ -25,7 +25,7 @@ from llama_stack.apis.tools import Tool, ToolGroup, ToolGroupInput, ToolRuntime
from llama_stack.apis.vector_dbs import VectorDB, VectorDBInput
from llama_stack.apis.vector_io import VectorIO
from llama_stack.providers.datatypes import Api, ProviderSpec
from llama_stack.providers.utils.kvstore.config import KVStoreConfig
from llama_stack.providers.utils.kvstore.config import KVStoreConfig, SqliteKVStoreConfig
LLAMA_STACK_BUILD_CONFIG_VERSION = "2"
LLAMA_STACK_RUN_CONFIG_VERSION = "2"
@@ -220,21 +220,34 @@ class LoggingConfig(BaseModel):
class AuthProviderType(str, Enum):
"""Supported authentication provider types."""
KUBERNETES = "kubernetes"
OAUTH2_TOKEN = "oauth2_token"
CUSTOM = "custom"
class AuthenticationConfig(BaseModel):
provider_type: AuthProviderType = Field(
...,
description="Type of authentication provider (e.g., 'kubernetes', 'custom')",
description="Type of authentication provider",
)
config: dict[str, str] = Field(
config: dict[str, Any] = Field(
...,
description="Provider-specific configuration",
)
class QuotaPeriod(str, Enum):
DAY = "day"
class QuotaConfig(BaseModel):
kvstore: SqliteKVStoreConfig = Field(description="Config for KV store backend (SQLite only for now)")
anonymous_max_requests: int = Field(default=100, description="Max requests for unauthenticated clients per period")
authenticated_max_requests: int = Field(
default=1000, description="Max requests for authenticated clients per period"
)
period: QuotaPeriod = Field(default=QuotaPeriod.DAY, description="Quota period to set")
class ServerConfig(BaseModel):
port: int = Field(
default=8321,
@@ -262,6 +275,10 @@ class ServerConfig(BaseModel):
default=None,
description="The host the server should listen on",
)
quota: QuotaConfig | None = Field(
default=None,
description="Per client quota request configuration",
)
class StackRunConfig(BaseModel):


@@ -8,7 +8,8 @@ import json
import httpx
from llama_stack.distribution.server.auth_providers import AuthProviderConfig, create_auth_provider
from llama_stack.distribution.datatypes import AuthenticationConfig
from llama_stack.distribution.server.auth_providers import create_auth_provider
from llama_stack.log import get_logger
logger = get_logger(name=__name__, category="auth")
@@ -77,7 +78,7 @@ class AuthenticationMiddleware:
access resources that don't have access_attributes defined.
"""
def __init__(self, app, auth_config: AuthProviderConfig):
def __init__(self, app, auth_config: AuthenticationConfig):
self.app = app
self.auth_provider = create_auth_provider(auth_config)
@@ -113,6 +114,10 @@ class AuthenticationMiddleware:
"roles": [token],
}
# Store the client ID in the request scope so that downstream middleware (like QuotaMiddleware)
# can identify the requester and enforce per-client rate limits.
scope["authenticated_client_id"] = token
# Store attributes in request scope
scope["user_attributes"] = user_attributes
scope["principal"] = validation_result.principal


@@ -4,18 +4,19 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json
import ssl
import time
from abc import ABC, abstractmethod
from asyncio import Lock
from enum import Enum
from pathlib import Path
from urllib.parse import parse_qs
import httpx
from jose import jwt
from pydantic import BaseModel, Field, field_validator
from pydantic import BaseModel, Field, field_validator, model_validator
from typing_extensions import Self
from llama_stack.distribution.datatypes import AccessAttributes
from llama_stack.distribution.datatypes import AccessAttributes, AuthenticationConfig, AuthProviderType
from llama_stack.log import get_logger
logger = get_logger(name=__name__, category="auth")
@@ -73,21 +74,6 @@ class AuthRequest(BaseModel):
request: AuthRequestContext = Field(description="Context information about the request being authenticated")
class AuthProviderType(str, Enum):
"""Supported authentication provider types."""
KUBERNETES = "kubernetes"
CUSTOM = "custom"
OAUTH2_TOKEN = "oauth2_token"
class AuthProviderConfig(BaseModel):
"""Base configuration for authentication providers."""
provider_type: AuthProviderType = Field(..., description="Type of authentication provider")
config: dict[str, str] = Field(..., description="Provider-specific configuration")
class AuthProvider(ABC):
"""Abstract base class for authentication providers."""
@@ -102,83 +88,6 @@ class AuthProvider(ABC):
pass
class KubernetesAuthProviderConfig(BaseModel):
api_server_url: str
ca_cert_path: str | None = None
class KubernetesAuthProvider(AuthProvider):
"""Kubernetes authentication provider that validates tokens against the Kubernetes API server."""
def __init__(self, config: KubernetesAuthProviderConfig):
self.config = config
self._client = None
async def _get_client(self):
"""Get or create a Kubernetes client."""
if self._client is None:
# kubernetes-client has no async support, see:
# https://github.com/kubernetes-client/python/issues/323
from kubernetes import client
from kubernetes.client import ApiClient
# Configure the client
configuration = client.Configuration()
configuration.host = self.config.api_server_url
if self.config.ca_cert_path:
configuration.ssl_ca_cert = self.config.ca_cert_path
configuration.verify_ssl = bool(self.config.ca_cert_path)
# Create API client
self._client = ApiClient(configuration)
return self._client
async def validate_token(self, token: str, scope: dict | None = None) -> TokenValidationResult:
"""Validate a Kubernetes token and return access attributes."""
try:
client = await self._get_client()
# Set the token in the client
client.set_default_header("Authorization", f"Bearer {token}")
# Make a request to validate the token
# We use the /api endpoint which requires authentication
from kubernetes.client import CoreV1Api
api = CoreV1Api(client)
api.get_api_resources(_request_timeout=3.0) # Set timeout for this specific request
# If we get here, the token is valid
# Extract user info from the token claims
import base64
# Decode the token (without verification since we've already validated it)
token_parts = token.split(".")
payload = json.loads(base64.b64decode(token_parts[1] + "=" * (-len(token_parts[1]) % 4)))
# Extract user information from the token
username = payload.get("sub", "")
groups = payload.get("groups", [])
return TokenValidationResult(
principal=username,
access_attributes=AccessAttributes(
roles=[username], # Use username as a role
teams=groups, # Use Kubernetes groups as teams
),
)
except Exception as e:
logger.exception("Failed to validate Kubernetes token")
raise ValueError("Invalid or expired token") from e
async def close(self):
"""Close the HTTP client."""
if self._client:
self._client.close()
self._client = None
def get_attributes_from_claims(claims: dict[str, str], mapping: dict[str, str]) -> AccessAttributes:
attributes = AccessAttributes()
for claim_key, attribute_key in mapping.items():
@@ -198,11 +107,24 @@ def get_attributes_from_claims(claims: dict[str, str], mapping: dict[str, str])
return attributes
class OAuth2TokenAuthProviderConfig(BaseModel):
class OAuth2JWKSConfig(BaseModel):
# The JWKS URI for collecting public keys
jwks_uri: str
cache_ttl: int = 3600
uri: str
key_recheck_period: int = Field(default=3600, description="The period to recheck the JWKS URI for key updates")
class OAuth2IntrospectionConfig(BaseModel):
url: str
client_id: str
client_secret: str
send_secret_in_body: bool = False
class OAuth2TokenAuthProviderConfig(BaseModel):
audience: str = "llama-stack"
verify_tls: bool = True
tls_cafile: Path | None = None
issuer: str | None = Field(default=None, description="The OIDC issuer URL.")
claims_mapping: dict[str, str] = Field(
default_factory=lambda: {
"sub": "roles",
@@ -214,6 +136,8 @@ class OAuth2TokenAuthProviderConfig(BaseModel):
"namespace": "namespaces",
},
)
jwks: OAuth2JWKSConfig | None
introspection: OAuth2IntrospectionConfig | None = None
@classmethod
@field_validator("claims_mapping")
@@ -225,6 +149,14 @@ class OAuth2TokenAuthProviderConfig(BaseModel):
raise ValueError(f"claims_mapping value is not a valid attribute: {value}")
return v
@model_validator(mode="after")
def validate_mode(self) -> Self:
if not self.jwks and not self.introspection:
raise ValueError("One of jwks or introspection must be configured")
if self.jwks and self.introspection:
raise ValueError("At present only one of jwks or introspection should be configured")
return self
class OAuth2TokenAuthProvider(AuthProvider):
"""
@@ -240,6 +172,13 @@ class OAuth2TokenAuthProvider(AuthProvider):
self._jwks_lock = Lock()
async def validate_token(self, token: str, scope: dict | None = None) -> TokenValidationResult:
if self.config.jwks:
return await self.validate_jwt_token(token, scope)
if self.config.introspection:
return await self.introspect_token(token, scope)
raise ValueError("One of jwks or introspection must be configured")
async def validate_jwt_token(self, token: str, scope: dict | None = None) -> TokenValidationResult:
"""Validate a token using the JWT token."""
await self._refresh_jwks()
@@ -255,7 +194,7 @@ class OAuth2TokenAuthProvider(AuthProvider):
key_data,
algorithms=[algorithm],
audience=self.config.audience,
options={"verify_exp": True},
issuer=self.config.issuer,
)
except Exception as exc:
raise ValueError(f"Invalid JWT token: {token}") from exc
@@ -269,14 +208,75 @@ class OAuth2TokenAuthProvider(AuthProvider):
access_attributes=access_attributes,
)
async def introspect_token(self, token: str, scope: dict | None = None) -> TokenValidationResult:
"""Validate a token using token introspection as defined by RFC 7662."""
form = {
"token": token,
}
if self.config.introspection is None:
raise ValueError("Introspection is not configured")
if self.config.introspection.send_secret_in_body:
form["client_id"] = self.config.introspection.client_id
form["client_secret"] = self.config.introspection.client_secret
auth = None
else:
auth = (self.config.introspection.client_id, self.config.introspection.client_secret)
ssl_ctxt = None
if self.config.tls_cafile:
ssl_ctxt = ssl.create_default_context(cafile=self.config.tls_cafile.as_posix())
try:
async with httpx.AsyncClient(verify=ssl_ctxt) as client:
response = await client.post(
self.config.introspection.url,
data=form,
auth=auth,
timeout=10.0, # Add a reasonable timeout
)
if response.status_code != 200:
logger.warning(f"Token introspection failed with status code: {response.status_code}")
raise ValueError(f"Token introspection failed: {response.status_code}")
fields = response.json()
if not fields["active"]:
raise ValueError("Token not active")
principal = fields["sub"] or fields["username"]
access_attributes = get_attributes_from_claims(fields, self.config.claims_mapping)
return TokenValidationResult(
principal=principal,
access_attributes=access_attributes,
)
except httpx.TimeoutException:
logger.exception("Token introspection request timed out")
raise
except ValueError:
# Re-raise ValueError exceptions to preserve their message
raise
except Exception as e:
logger.exception("Error during token introspection")
raise ValueError("Token introspection error") from e
async def close(self):
"""Close the HTTP client."""
pass
async def _refresh_jwks(self) -> None:
"""
Refresh the JWKS cache.
This is a simple cache that expires after a certain amount of time (defined by `key_recheck_period`).
If the cache is expired, we refresh the JWKS from the JWKS URI.
Notes for Kubernetes, which doesn't fully implement the OIDC protocol:
* It doesn't have user authentication flows
* It doesn't have refresh tokens
"""
async with self._jwks_lock:
if time.time() - self._jwks_at > self.config.cache_ttl:
async with httpx.AsyncClient() as client:
res = await client.get(self.config.jwks_uri, timeout=5)
if self.config.jwks is None:
raise ValueError("JWKS is not configured")
if time.time() - self._jwks_at > self.config.jwks.key_recheck_period:
verify = self.config.tls_cafile.as_posix() if self.config.tls_cafile else self.config.verify_tls
async with httpx.AsyncClient(verify=verify) as client:
res = await client.get(self.config.jwks.uri, timeout=5)
res.raise_for_status()
jwks_data = res.json()["keys"]
updated = {}
@@ -363,13 +363,11 @@ class CustomAuthProvider(AuthProvider):
self._client = None
def create_auth_provider(config: AuthProviderConfig) -> AuthProvider:
def create_auth_provider(config: AuthenticationConfig) -> AuthProvider:
"""Factory function to create the appropriate auth provider."""
provider_type = config.provider_type.lower()
if provider_type == "kubernetes":
return KubernetesAuthProvider(KubernetesAuthProviderConfig.model_validate(config.config))
elif provider_type == "custom":
if provider_type == "custom":
return CustomAuthProvider(CustomAuthProviderConfig.model_validate(config.config))
elif provider_type == "oauth2_token":
return OAuth2TokenAuthProvider(OAuth2TokenAuthProviderConfig.model_validate(config.config))


@@ -0,0 +1,110 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json
import time
from datetime import datetime, timedelta, timezone
from starlette.types import ASGIApp, Receive, Scope, Send
from llama_stack.log import get_logger
from llama_stack.providers.utils.kvstore.api import KVStore
from llama_stack.providers.utils.kvstore.config import KVStoreConfig, SqliteKVStoreConfig
from llama_stack.providers.utils.kvstore.kvstore import kvstore_impl
logger = get_logger(name=__name__, category="quota")
class QuotaMiddleware:
"""
ASGI middleware that enforces separate quotas for authenticated and anonymous clients
within a configurable time window.
- For authenticated requests, it reads the client ID from the
`Authorization: Bearer <client_id>` header.
- For anonymous requests, it falls back to the IP address of the client.
Requests are counted in a KV store (e.g., SQLite), and HTTP 429 is returned
once a client exceeds its quota.
"""
def __init__(
self,
app: ASGIApp,
kv_config: KVStoreConfig,
anonymous_max_requests: int,
authenticated_max_requests: int,
window_seconds: int = 86400,
):
self.app = app
self.kv_config = kv_config
self.kv: KVStore | None = None
self.anonymous_max_requests = anonymous_max_requests
self.authenticated_max_requests = authenticated_max_requests
self.window_seconds = window_seconds
if isinstance(self.kv_config, SqliteKVStoreConfig):
logger.warning(
"QuotaMiddleware: Using SQLite backend. Expiry/TTL is not enforced; cleanup is manual. "
f"window_seconds={self.window_seconds}"
)
async def _get_kv(self) -> KVStore:
if self.kv is None:
self.kv = await kvstore_impl(self.kv_config)
return self.kv
async def __call__(self, scope: Scope, receive: Receive, send: Send):
if scope["type"] == "http":
# pick key & limit based on auth
auth_id = scope.get("authenticated_client_id")
if auth_id:
key_id = auth_id
limit = self.authenticated_max_requests
else:
# fallback to IP
client = scope.get("client")
key_id = client[0] if client else "anonymous"
limit = self.anonymous_max_requests
current_window = int(time.time() // self.window_seconds)
key = f"quota:{key_id}:{current_window}"
try:
kv = await self._get_kv()
prev = await kv.get(key) or "0"
count = int(prev) + 1
if int(prev) == 0:
# Set with expiration datetime when it is the first request in the window.
expiration = datetime.now(timezone.utc) + timedelta(seconds=self.window_seconds)
await kv.set(key, str(count), expiration=expiration)
else:
await kv.set(key, str(count))
except Exception:
logger.exception("Failed to access KV store for quota")
return await self._send_error(send, 500, "Quota service error")
if count > limit:
logger.warning(
"Quota exceeded for client %s: %d/%d",
key_id,
count,
limit,
)
return await self._send_error(send, 429, "Quota exceeded")
return await self.app(scope, receive, send)
async def _send_error(self, send: Send, status: int, message: str):
await send(
{
"type": "http.response.start",
"status": status,
"headers": [[b"content-type", b"application/json"]],
}
)
body = json.dumps({"error": {"message": message}}).encode()
await send({"type": "http.response.body", "body": body})


@@ -60,6 +60,7 @@ from llama_stack.providers.utils.telemetry.tracing import (
from .auth import AuthenticationMiddleware
from .endpoints import get_all_api_endpoints
from .quota import QuotaMiddleware
REPO_ROOT = Path(__file__).parent.parent.parent.parent
@@ -434,6 +435,35 @@ def main(args: argparse.Namespace | None = None):
if config.server.auth:
logger.info(f"Enabling authentication with provider: {config.server.auth.provider_type.value}")
app.add_middleware(AuthenticationMiddleware, auth_config=config.server.auth)
else:
if config.server.quota:
quota = config.server.quota
logger.warning(
"Configured authenticated_max_requests (%d) but no auth is enabled; "
"falling back to anonymous_max_requests (%d) for all the requests",
quota.authenticated_max_requests,
quota.anonymous_max_requests,
)
if config.server.quota:
logger.info("Enabling quota middleware for authenticated and anonymous clients")
quota = config.server.quota
anonymous_max_requests = quota.anonymous_max_requests
# if auth is disabled, use the anonymous max requests
authenticated_max_requests = quota.authenticated_max_requests if config.server.auth else anonymous_max_requests
kv_config = quota.kvstore
window_map = {"day": 86400}
window_seconds = window_map[quota.period.value]
app.add_middleware(
QuotaMiddleware,
kv_config=kv_config,
anonymous_max_requests=anonymous_max_requests,
authenticated_max_requests=authenticated_max_requests,
window_seconds=window_seconds,
)
try:
impls = asyncio.run(construct_stack(config))


@@ -40,7 +40,6 @@ dependencies = [
"tiktoken",
"pillow",
"h11>=0.16.0",
"kubernetes",
]
[project.optional-dependencies]
@@ -94,6 +93,7 @@ test = [
docs = [
"sphinx-autobuild",
"myst-parser",
"sphinx",
"sphinx-rtd-theme",
"sphinx_rtd_dark_mode",
"sphinx-copybutton",
@@ -103,6 +103,8 @@ docs = [
"sphinxcontrib.video",
"sphinxcontrib.mermaid",
"tomli",
"linkify",
"sphinxcontrib.openapi",
]
codegen = ["rich", "pydantic", "jinja2>=3.1.6"]
ui = [


@@ -4,19 +4,16 @@ annotated-types==0.7.0
anyio==4.8.0
attrs==25.1.0
blobfile==3.0.0
cachetools==5.5.2
certifi==2025.1.31
charset-normalizer==3.4.1
click==8.1.8
colorama==0.4.6 ; sys_platform == 'win32'
distro==1.9.0
durationpy==0.9
ecdsa==0.19.1
exceptiongroup==1.2.2 ; python_full_version < '3.11'
filelock==3.17.0
fire==0.7.0
fsspec==2024.12.0
google-auth==2.38.0
h11==0.16.0
httpcore==1.0.9
httpx==0.28.1
@@ -26,14 +23,12 @@ jinja2==3.1.6
jiter==0.8.2
jsonschema==4.23.0
jsonschema-specifications==2024.10.1
kubernetes==32.0.1
llama-stack-client==0.2.7
lxml==5.3.1
markdown-it-py==3.0.0
markupsafe==3.0.2
mdurl==0.1.2
numpy==2.2.3
oauthlib==3.2.2
openai==1.71.0
packaging==24.2
pandas==2.2.3
@@ -41,7 +36,6 @@ pillow==11.1.0
prompt-toolkit==3.0.50
pyaml==25.1.0
pyasn1==0.4.8
pyasn1-modules==0.4.1
pycryptodomex==3.21.0
pydantic==2.10.6
pydantic-core==2.27.2
@@ -54,7 +48,6 @@ pyyaml==6.0.2
referencing==0.36.2
regex==2024.11.6
requests==2.32.3
requests-oauthlib==2.0.0
rich==13.9.4
rpds-py==0.22.3
rsa==4.9
@@ -68,4 +61,3 @@ typing-extensions==4.12.2
tzdata==2025.1
urllib3==2.3.0
wcwidth==0.2.13
websocket-client==1.8.0


@@ -11,12 +11,10 @@ import pytest
from fastapi import FastAPI
from fastapi.testclient import TestClient
from llama_stack.distribution.datatypes import AccessAttributes
from llama_stack.distribution.datatypes import AuthenticationConfig
from llama_stack.distribution.server.auth import AuthenticationMiddleware
from llama_stack.distribution.server.auth_providers import (
AuthProviderConfig,
AuthProviderType,
TokenValidationResult,
get_attributes_from_claims,
)
@@ -62,7 +60,7 @@ def invalid_token():
@pytest.fixture
def http_app(mock_auth_endpoint):
app = FastAPI()
auth_config = AuthProviderConfig(
auth_config = AuthenticationConfig(
provider_type=AuthProviderType.CUSTOM,
config={"endpoint": mock_auth_endpoint},
)
@@ -78,7 +76,7 @@ def http_app(mock_auth_endpoint):
@pytest.fixture
def k8s_app():
app = FastAPI()
auth_config = AuthProviderConfig(
auth_config = AuthenticationConfig(
provider_type=AuthProviderType.KUBERNETES,
config={"api_server_url": "https://kubernetes.default.svc"},
)
@@ -118,7 +116,7 @@ def mock_scope():
@pytest.fixture
def mock_http_middleware(mock_auth_endpoint):
mock_app = AsyncMock()
auth_config = AuthProviderConfig(
auth_config = AuthenticationConfig(
provider_type=AuthProviderType.CUSTOM,
config={"endpoint": mock_auth_endpoint},
)
@@ -128,7 +126,7 @@ def mock_http_middleware(mock_auth_endpoint):
@pytest.fixture
def mock_k8s_middleware():
mock_app = AsyncMock()
auth_config = AuthProviderConfig(
auth_config = AuthenticationConfig(
provider_type=AuthProviderType.KUBERNETES,
config={"api_server_url": "https://kubernetes.default.svc"},
)
@@ -284,120 +282,19 @@ async def test_http_middleware_no_attributes(mock_http_middleware, mock_scope):
assert attributes["roles"] == ["test.jwt.token"]
# Kubernetes Tests
def test_missing_auth_header_k8s(k8s_client):
response = k8s_client.get("/test")
assert response.status_code == 401
assert "Missing or invalid Authorization header" in response.json()["error"]["message"]
def test_invalid_auth_header_format_k8s(k8s_client):
response = k8s_client.get("/test", headers={"Authorization": "InvalidFormat token123"})
assert response.status_code == 401
assert "Missing or invalid Authorization header" in response.json()["error"]["message"]
@patch("kubernetes.client.ApiClient")
def test_valid_k8s_authentication(mock_api_client, k8s_client, valid_token):
# Mock the Kubernetes client
mock_client = AsyncMock()
mock_api_client.return_value = mock_client
# Mock successful token validation
mock_client.set_default_header = AsyncMock()
# Mock the token validation to return valid access attributes
with patch("llama_stack.distribution.server.auth_providers.KubernetesAuthProvider.validate_token") as mock_validate:
mock_validate.return_value = TokenValidationResult(
principal="test-principal",
access_attributes=AccessAttributes(
roles=["admin"], teams=["ml-team"], projects=["llama-3"], namespaces=["research"]
),
)
response = k8s_client.get("/test", headers={"Authorization": f"Bearer {valid_token}"})
assert response.status_code == 200
assert response.json() == {"message": "Authentication successful"}
@patch("kubernetes.client.ApiClient")
def test_invalid_k8s_authentication(mock_api_client, k8s_client, invalid_token):
# Mock the Kubernetes client
mock_client = AsyncMock()
mock_api_client.return_value = mock_client
# Mock failed token validation by raising an exception
with patch("llama_stack.distribution.server.auth_providers.KubernetesAuthProvider.validate_token") as mock_validate:
mock_validate.side_effect = ValueError("Invalid or expired token")
response = k8s_client.get("/test", headers={"Authorization": f"Bearer {invalid_token}"})
assert response.status_code == 401
assert "Invalid or expired token" in response.json()["error"]["message"]
@pytest.mark.asyncio
async def test_k8s_middleware_with_access_attributes(mock_k8s_middleware, mock_scope):
middleware, mock_app = mock_k8s_middleware
mock_receive = AsyncMock()
mock_send = AsyncMock()
with patch("kubernetes.client.ApiClient") as mock_api_client:
mock_client = AsyncMock()
mock_api_client.return_value = mock_client
# Mock successful token validation
mock_client.set_default_header = AsyncMock()
# Mock token payload with access attributes
mock_token_parts = ["header", "eyJzdWIiOiJhZG1pbiIsImdyb3VwcyI6WyJtbC10ZWFtIl19", "signature"]
mock_scope["headers"][1] = (b"authorization", f"Bearer {'.'.join(mock_token_parts)}".encode())
await middleware(mock_scope, mock_receive, mock_send)
assert "user_attributes" in mock_scope
assert mock_scope["user_attributes"]["roles"] == ["admin"]
assert mock_scope["user_attributes"]["teams"] == ["ml-team"]
mock_app.assert_called_once_with(mock_scope, mock_receive, mock_send)
@pytest.mark.asyncio
async def test_k8s_middleware_no_attributes(mock_k8s_middleware, mock_scope):
"""Test middleware behavior with no access attributes"""
middleware, mock_app = mock_k8s_middleware
mock_receive = AsyncMock()
mock_send = AsyncMock()
with patch("kubernetes.client.ApiClient") as mock_api_client:
mock_client = AsyncMock()
mock_api_client.return_value = mock_client
# Mock successful token validation
mock_client.set_default_header = AsyncMock()
# Mock token payload without access attributes
mock_token_parts = ["header", "eyJzdWIiOiJhZG1pbiJ9", "signature"]
mock_scope["headers"][1] = (b"authorization", f"Bearer {'.'.join(mock_token_parts)}".encode())
await middleware(mock_scope, mock_receive, mock_send)
assert "user_attributes" in mock_scope
attributes = mock_scope["user_attributes"]
assert "roles" in attributes
assert attributes["roles"] == ["admin"]
mock_app.assert_called_once_with(mock_scope, mock_receive, mock_send)
# oauth2 token provider tests
@pytest.fixture
def oauth2_app():
app = FastAPI()
auth_config = AuthProviderConfig(
auth_config = AuthenticationConfig(
provider_type=AuthProviderType.OAUTH2_TOKEN,
config={
"jwks_uri": "http://mock-authz-service/token/introspect",
"cache_ttl": "3600",
"jwks": {
"uri": "http://mock-authz-service/token/introspect",
"key_recheck_period": "3600",
},
"audience": "llama-stack",
},
)
@@ -517,3 +414,159 @@ def test_get_attributes_from_claims():
# TODO: add more tests for oauth2 token provider
# oauth token introspection tests
@pytest.fixture
def mock_introspection_endpoint():
return "http://mock-authz-service/token/introspect"
@pytest.fixture
def introspection_app(mock_introspection_endpoint):
app = FastAPI()
auth_config = AuthenticationConfig(
provider_type=AuthProviderType.OAUTH2_TOKEN,
config={
"jwks": None,
"introspection": {"url": mock_introspection_endpoint, "client_id": "myclient", "client_secret": "abcdefg"},
},
)
app.add_middleware(AuthenticationMiddleware, auth_config=auth_config)
@app.get("/test")
def test_endpoint():
return {"message": "Authentication successful"}
return app
@pytest.fixture
def introspection_app_with_custom_mapping(mock_introspection_endpoint):
app = FastAPI()
auth_config = AuthenticationConfig(
provider_type=AuthProviderType.OAUTH2_TOKEN,
config={
"jwks": None,
"introspection": {
"url": mock_introspection_endpoint,
"client_id": "myclient",
"client_secret": "abcdefg",
"send_secret_in_body": "true",
},
"claims_mapping": {
"sub": "roles",
"scope": "roles",
"groups": "teams",
"aud": "namespaces",
},
},
)
app.add_middleware(AuthenticationMiddleware, auth_config=auth_config)
@app.get("/test")
def test_endpoint():
return {"message": "Authentication successful"}
return app
@pytest.fixture
def introspection_client(introspection_app):
return TestClient(introspection_app)
@pytest.fixture
def introspection_client_with_custom_mapping(introspection_app_with_custom_mapping):
return TestClient(introspection_app_with_custom_mapping)
def test_missing_auth_header_introspection(introspection_client):
response = introspection_client.get("/test")
assert response.status_code == 401
assert "Missing or invalid Authorization header" in response.json()["error"]["message"]
def test_invalid_auth_header_format_introspection(introspection_client):
response = introspection_client.get("/test", headers={"Authorization": "InvalidFormat token123"})
assert response.status_code == 401
assert "Missing or invalid Authorization header" in response.json()["error"]["message"]
async def mock_introspection_active(*args, **kwargs):
return MockResponse(
200,
{
"active": True,
"sub": "my-user",
"groups": ["group1", "group2"],
"scope": "foo bar",
"aud": ["set1", "set2"],
},
)
async def mock_introspection_inactive(*args, **kwargs):
return MockResponse(
200,
{
"active": False,
},
)
async def mock_introspection_invalid(*args, **kwargs):
class InvalidResponse:
def __init__(self, status_code):
self.status_code = status_code
def json(self):
raise ValueError("Not JSON")
return InvalidResponse(200)
async def mock_introspection_failed(*args, **kwargs):
return MockResponse(
500,
{},
)
@patch("httpx.AsyncClient.post", new=mock_introspection_active)
def test_valid_introspection_authentication(introspection_client, valid_api_key):
response = introspection_client.get("/test", headers={"Authorization": f"Bearer {valid_api_key}"})
assert response.status_code == 200
assert response.json() == {"message": "Authentication successful"}
@patch("httpx.AsyncClient.post", new=mock_introspection_inactive)
def test_inactive_introspection_authentication(introspection_client, invalid_api_key):
response = introspection_client.get("/test", headers={"Authorization": f"Bearer {invalid_api_key}"})
assert response.status_code == 401
assert "Token not active" in response.json()["error"]["message"]
@patch("httpx.AsyncClient.post", new=mock_introspection_invalid)
def test_invalid_introspection_authentication(introspection_client, invalid_api_key):
response = introspection_client.get("/test", headers={"Authorization": f"Bearer {invalid_api_key}"})
assert response.status_code == 401
assert "Not JSON" in response.json()["error"]["message"]
@patch("httpx.AsyncClient.post", new=mock_introspection_failed)
def test_failed_introspection_authentication(introspection_client, invalid_api_key):
response = introspection_client.get("/test", headers={"Authorization": f"Bearer {invalid_api_key}"})
assert response.status_code == 401
assert "Token introspection failed: 500" in response.json()["error"]["message"]
@patch("httpx.AsyncClient.post", new=mock_introspection_active)
def test_valid_introspection_with_custom_mapping_authentication(
introspection_client_with_custom_mapping, valid_api_key
):
response = introspection_client_with_custom_mapping.get(
"/test", headers={"Authorization": f"Bearer {valid_api_key}"}
)
assert response.status_code == 200
assert response.json() == {"message": "Authentication successful"}


@@ -0,0 +1,127 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import pytest
from fastapi import FastAPI, Request
from fastapi.testclient import TestClient
from starlette.middleware.base import BaseHTTPMiddleware

from llama_stack.distribution.datatypes import QuotaConfig, QuotaPeriod
from llama_stack.distribution.server.quota import QuotaMiddleware
from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig


class InjectClientIDMiddleware(BaseHTTPMiddleware):
    """
    Middleware that injects 'authenticated_client_id' to mimic AuthenticationMiddleware.
    """

    def __init__(self, app, client_id="client1"):
        super().__init__(app)
        self.client_id = client_id

    async def dispatch(self, request: Request, call_next):
        request.scope["authenticated_client_id"] = self.client_id
        return await call_next(request)


def build_quota_config(db_path) -> QuotaConfig:
    return QuotaConfig(
        kvstore=SqliteKVStoreConfig(db_path=str(db_path)),
        anonymous_max_requests=1,
        authenticated_max_requests=2,
        period=QuotaPeriod.DAY,
    )


@pytest.fixture
def auth_app(tmp_path, request):
    """
    FastAPI app with InjectClientIDMiddleware and QuotaMiddleware for authenticated testing.
    Each test gets its own DB file.
    """
    inner_app = FastAPI()

    @inner_app.get("/test")
    async def test_endpoint():
        return {"message": "ok"}

    db_path = tmp_path / f"quota_{request.node.name}.db"
    quota = build_quota_config(db_path)

    app = InjectClientIDMiddleware(
        QuotaMiddleware(
            inner_app,
            kv_config=quota.kvstore,
            anonymous_max_requests=quota.anonymous_max_requests,
            authenticated_max_requests=quota.authenticated_max_requests,
            window_seconds=86400,
        ),
        client_id=f"client_{request.node.name}",
    )
    return app


def test_authenticated_quota_allows_up_to_limit(auth_app):
    client = TestClient(auth_app)

    assert client.get("/test").status_code == 200
    assert client.get("/test").status_code == 200


def test_authenticated_quota_blocks_after_limit(auth_app):
    client = TestClient(auth_app)

    client.get("/test")
    client.get("/test")
    resp = client.get("/test")

    assert resp.status_code == 429
    assert resp.json()["error"]["message"] == "Quota exceeded"


def test_anonymous_quota_allows_up_to_limit(tmp_path, request):
    inner_app = FastAPI()

    @inner_app.get("/test")
    async def test_endpoint():
        return {"message": "ok"}

    db_path = tmp_path / f"quota_anon_{request.node.name}.db"
    quota = build_quota_config(db_path)

    app = QuotaMiddleware(
        inner_app,
        kv_config=quota.kvstore,
        anonymous_max_requests=quota.anonymous_max_requests,
        authenticated_max_requests=quota.authenticated_max_requests,
        window_seconds=86400,
    )

    client = TestClient(app)
    assert client.get("/test").status_code == 200


def test_anonymous_quota_blocks_after_limit(tmp_path, request):
    inner_app = FastAPI()

    @inner_app.get("/test")
    async def test_endpoint():
        return {"message": "ok"}

    db_path = tmp_path / f"quota_anon_{request.node.name}.db"
    quota = build_quota_config(db_path)

    app = QuotaMiddleware(
        inner_app,
        kv_config=quota.kvstore,
        anonymous_max_requests=quota.anonymous_max_requests,
        authenticated_max_requests=quota.authenticated_max_requests,
        window_seconds=86400,
    )

    client = TestClient(app)
    client.get("/test")
    resp = client.get("/test")

    assert resp.status_code == 429
    assert resp.json()["error"]["message"] == "Quota exceeded"
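
These tests exercise a fixed-window quota: each caller gets a counter that resets every window_seconds, with separate limits for anonymous and authenticated traffic. A minimal sketch of that pattern, assuming a plain dict-like key-value store (illustrative only, not the shipped QuotaMiddleware, whose KV store and key layout may differ):

import time


class FixedWindowQuota:
    """Illustrative ASGI middleware: one counter per caller per time window."""

    def __init__(self, app, kv, anonymous_max_requests, authenticated_max_requests, window_seconds):
        self.app = app
        self.kv = kv  # assumed: any dict-like store; the real middleware takes a KVStore config
        self.anon_limit = anonymous_max_requests
        self.auth_limit = authenticated_max_requests
        self.window = window_seconds

    async def __call__(self, scope, receive, send):
        if scope["type"] != "http":
            return await self.app(scope, receive, send)

        client_id = scope.get("authenticated_client_id")
        limit = self.auth_limit if client_id else self.anon_limit
        caller = client_id or (scope.get("client") or ("anonymous",))[0]

        # Fixed window: requests inside the same window share one counter key.
        window_start = int(time.time() // self.window)
        key = f"quota:{caller}:{window_start}"
        count = int(self.kv.get(key, 0)) + 1
        self.kv[key] = count

        if count > limit:
            await send({"type": "http.response.start", "status": 429,
                        "headers": [(b"content-type", b"application/json")]})
            await send({"type": "http.response.body",
                        "body": b'{"error": {"message": "Quota exceeded"}}'})
            return

        await self.app(scope, receive, send)

With kv = {} and window_seconds = 86400 this reproduces the behavior asserted above: an anonymous caller's second request in the same day gets a 429 with the "Quota exceeded" message, while an authenticated caller is allowed two.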

186
uv.lock generated
View file

@ -628,6 +628,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186", size = 9073 },
]
[[package]]
name = "deepmerge"
version = "2.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/a8/3a/b0ba594708f1ad0bc735884b3ad854d3ca3bdc1d741e56e40bbda6263499/deepmerge-2.0.tar.gz", hash = "sha256:5c3d86081fbebd04dd5de03626a0607b809a98fb6ccba5770b62466fe940ff20", size = 19890 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/2d/82/e5d2c1c67d19841e9edc74954c827444ae826978499bde3dfc1d007c8c11/deepmerge-2.0-py3-none-any.whl", hash = "sha256:6de9ce507115cff0bed95ff0ce9ecc31088ef50cbdf09bc90a09349a318b3d00", size = 13475 },
]
[[package]]
name = "deprecated"
version = "1.2.18"
@ -676,15 +685,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/8f/d7/9322c609343d929e75e7e5e6255e614fcc67572cfd083959cdef3b7aad79/docutils-0.21.2-py3-none-any.whl", hash = "sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2", size = 587408 },
]
[[package]]
name = "durationpy"
version = "0.9"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/31/e9/f49c4e7fccb77fa5c43c2480e09a857a78b41e7331a75e128ed5df45c56b/durationpy-0.9.tar.gz", hash = "sha256:fd3feb0a69a0057d582ef643c355c40d2fa1c942191f914d12203b1a01ac722a", size = 3186 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/4c/a3/ac312faeceffd2d8f86bc6dcb5c401188ba5a01bc88e69bed97578a0dfcd/durationpy-0.9-py3-none-any.whl", hash = "sha256:e65359a7af5cedad07fb77a2dd3f390f8eb0b74cb845589fa6c057086834dd38", size = 3461 },
]
[[package]]
name = "ecdsa"
version = "0.19.1"
@ -863,20 +863,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/1d/9a/4114a9057db2f1462d5c8f8390ab7383925fe1ac012eaa42402ad65c2963/GitPython-3.1.44-py3-none-any.whl", hash = "sha256:9e0e10cda9bed1ee64bc9a6de50e7e38a9c9943241cd7f585f6df3ed28011110", size = 207599 },
]
[[package]]
name = "google-auth"
version = "2.38.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "cachetools" },
{ name = "pyasn1-modules" },
{ name = "rsa" },
]
sdist = { url = "https://files.pythonhosted.org/packages/c6/eb/d504ba1daf190af6b204a9d4714d457462b486043744901a6eeea711f913/google_auth-2.38.0.tar.gz", hash = "sha256:8285113607d3b80a3f1543b75962447ba8a09fe85783432a784fdeef6ac094c4", size = 270866 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/9d/47/603554949a37bca5b7f894d51896a9c534b9eab808e2520a748e081669d0/google_auth-2.38.0-py2.py3-none-any.whl", hash = "sha256:e7dae6694313f434a2727bf2906f27ad259bae090d7aa896590d86feec3d9d4a", size = 210770 },
]
[[package]]
name = "googleapis-common-protos"
version = "1.67.0"
@ -1324,28 +1310,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/c9/fb/108ecd1fe961941959ad0ee4e12ee7b8b1477247f30b1fdfd83ceaf017f0/jupyter_core-5.7.2-py3-none-any.whl", hash = "sha256:4f7315d2f6b4bcf2e3e7cb6e46772eba760ae459cd1f59d29eb57b0a01bd7409", size = 28965 },
]
[[package]]
name = "kubernetes"
version = "32.0.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "certifi" },
{ name = "durationpy" },
{ name = "google-auth" },
{ name = "oauthlib" },
{ name = "python-dateutil" },
{ name = "pyyaml" },
{ name = "requests" },
{ name = "requests-oauthlib" },
{ name = "six" },
{ name = "urllib3" },
{ name = "websocket-client" },
]
sdist = { url = "https://files.pythonhosted.org/packages/b7/e8/0598f0e8b4af37cd9b10d8b87386cf3173cb8045d834ab5f6ec347a758b3/kubernetes-32.0.1.tar.gz", hash = "sha256:42f43d49abd437ada79a79a16bd48a604d3471a117a8347e87db693f2ba0ba28", size = 946691 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/08/10/9f8af3e6f569685ce3af7faab51c8dd9d93b9c38eba339ca31c746119447/kubernetes-32.0.1-py2.py3-none-any.whl", hash = "sha256:35282ab8493b938b08ab5526c7ce66588232df00ef5e1dbe88a419107dc10998", size = 1988070 },
]
[[package]]
name = "levenshtein"
version = "0.27.1"
@ -1429,6 +1393,12 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/dc/1e/408fd10217eac0e43aea0604be22b4851a09e03d761d44d4ea12089dd70e/levenshtein-0.27.1-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:7987ef006a3cf56a4532bd4c90c2d3b7b4ca9ad3bf8ae1ee5713c4a3bdfda913", size = 98045 },
]
[[package]]
name = "linkify"
version = "1.4"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/65/c6/246100fa3967074d9725b3716913bd495823547bde5047050d4c3462f994/linkify-1.4.tar.gz", hash = "sha256:9ba276ba179525f7262820d90f009604e51cd4f1466c1112b882ef7eda243d5e", size = 1749 }
[[package]]
name = "llama-stack"
version = "0.2.7"
@ -1441,7 +1411,6 @@ dependencies = [
{ name = "huggingface-hub" },
{ name = "jinja2" },
{ name = "jsonschema" },
{ name = "kubernetes" },
{ name = "llama-stack-client" },
{ name = "openai" },
{ name = "pillow" },
@ -1480,7 +1449,9 @@ dev = [
{ name = "uvicorn" },
]
docs = [
{ name = "linkify" },
{ name = "myst-parser" },
{ name = "sphinx" },
{ name = "sphinx-autobuild" },
{ name = "sphinx-copybutton" },
{ name = "sphinx-design" },
@ -1488,6 +1459,7 @@ docs = [
{ name = "sphinx-rtd-theme" },
{ name = "sphinx-tabs" },
{ name = "sphinxcontrib-mermaid" },
{ name = "sphinxcontrib-openapi" },
{ name = "sphinxcontrib-redoc" },
{ name = "sphinxcontrib-video" },
{ name = "tomli" },
@ -1546,7 +1518,7 @@ requires-dist = [
{ name = "jinja2", specifier = ">=3.1.6" },
{ name = "jinja2", marker = "extra == 'codegen'", specifier = ">=3.1.6" },
{ name = "jsonschema" },
{ name = "kubernetes" },
{ name = "linkify", marker = "extra == 'docs'" },
{ name = "llama-stack-client", specifier = ">=0.2.7" },
{ name = "llama-stack-client", marker = "extra == 'ui'", specifier = ">=0.2.7" },
{ name = "mcp", marker = "extra == 'test'" },
@ -1581,6 +1553,7 @@ requires-dist = [
{ name = "ruamel-yaml", marker = "extra == 'dev'" },
{ name = "ruff", marker = "extra == 'dev'" },
{ name = "setuptools" },
{ name = "sphinx", marker = "extra == 'docs'" },
{ name = "sphinx-autobuild", marker = "extra == 'docs'" },
{ name = "sphinx-copybutton", marker = "extra == 'docs'" },
{ name = "sphinx-design", marker = "extra == 'docs'" },
@ -1588,6 +1561,7 @@ requires-dist = [
{ name = "sphinx-rtd-theme", marker = "extra == 'docs'" },
{ name = "sphinx-tabs", marker = "extra == 'docs'" },
{ name = "sphinxcontrib-mermaid", marker = "extra == 'docs'" },
{ name = "sphinxcontrib-openapi", marker = "extra == 'docs'" },
{ name = "sphinxcontrib-redoc", marker = "extra == 'docs'" },
{ name = "sphinxcontrib-video", marker = "extra == 'docs'" },
{ name = "sqlite-vec", marker = "extra == 'unit'" },
@ -1624,9 +1598,9 @@ dependencies = [
{ name = "tqdm" },
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/cd/6b/31c07396c5b3010668e4eb38061a96ffacb47ec4b14d8aeb64c13856c485/llama_stack_client-0.2.7.tar.gz", hash = "sha256:11aee11fdd5e0e8caad07c0cce9c4d88640938844372e7e3453a91ea0757fcb3", size = 259273, upload-time = "2025-05-16T20:31:39.221Z" }
sdist = { url = "https://files.pythonhosted.org/packages/cd/6b/31c07396c5b3010668e4eb38061a96ffacb47ec4b14d8aeb64c13856c485/llama_stack_client-0.2.7.tar.gz", hash = "sha256:11aee11fdd5e0e8caad07c0cce9c4d88640938844372e7e3453a91ea0757fcb3", size = 259273 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ac/69/6a5f4683afe355500df4376fdcbfb2fc1e6a0c3bcea5ff8f6114773a9acf/llama_stack_client-0.2.7-py3-none-any.whl", hash = "sha256:78b3f2abdb1770c7b1270a9c0ef58402a988401c564d2e6c83588779ac6fc38d", size = 292727, upload-time = "2025-05-16T20:31:37.587Z" },
{ url = "https://files.pythonhosted.org/packages/ac/69/6a5f4683afe355500df4376fdcbfb2fc1e6a0c3bcea5ff8f6114773a9acf/llama_stack_client-0.2.7-py3-none-any.whl", hash = "sha256:78b3f2abdb1770c7b1270a9c0ef58402a988401c564d2e6c83588779ac6fc38d", size = 292727 },
]
[[package]]
@ -1833,6 +1807,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979 },
]
[[package]]
name = "mistune"
version = "3.1.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "typing-extensions", marker = "python_full_version < '3.11'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/c4/79/bda47f7dd7c3c55770478d6d02c9960c430b0cf1773b72366ff89126ea31/mistune-3.1.3.tar.gz", hash = "sha256:a7035c21782b2becb6be62f8f25d3df81ccb4d6fa477a6525b15af06539f02a0", size = 94347 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/01/4d/23c4e4f09da849e127e9f123241946c23c1e30f45a88366879e064211815/mistune-3.1.3-py3-none-any.whl", hash = "sha256:1a32314113cff28aa6432e99e522677c8587fd83e3d51c29b82a52409c842bd9", size = 53410 },
]
[[package]]
name = "mpmath"
version = "1.3.0"
@ -2087,15 +2073,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/17/7f/d322a4125405920401450118dbdc52e0384026bd669939484670ce8b2ab9/numpy-2.2.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:783145835458e60fa97afac25d511d00a1eca94d4a8f3ace9fe2043003c678e4", size = 12839607 },
]
[[package]]
name = "oauthlib"
version = "3.2.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/6d/fa/fbf4001037904031639e6bfbfc02badfc7e12f137a8afa254df6c4c8a670/oauthlib-3.2.2.tar.gz", hash = "sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918", size = 177352 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/7e/80/cab10959dc1faead58dc8384a781dfbf93cb4d33d50988f7a69f1b7c9bbe/oauthlib-3.2.2-py3-none-any.whl", hash = "sha256:8139f29aac13e25d502680e9e19963e83f16838d48a0d71c287fe40e7067fbca", size = 151688 },
]
[[package]]
name = "openai"
version = "1.71.0"
@ -2284,6 +2261,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523", size = 63772 },
]
[[package]]
name = "picobox"
version = "4.0.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/0f/b1/830714dd6778c1cb45826722b4e9bd21c94b33cca5df9ef2cc0b80c81b25/picobox-4.0.0.tar.gz", hash = "sha256:114da1b5606b2f615e8b0eb68d04198ad9de75af5adbcf5b36fe4f664ab927b6", size = 22666 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/2d/c6/fd64ffd75d47c4fcf6c65808cc5c5c75e5d4357c197d3741ee1339e91257/picobox-4.0.0-py3-none-any.whl", hash = "sha256:4c27eb689fe45dabd9e64c382e04418147d0b746d155b4e80057dbb7ff82027e", size = 11641 },
]
[[package]]
name = "pillow"
version = "11.1.0"
@ -2608,18 +2594,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/62/1e/a94a8d635fa3ce4cfc7f506003548d0a2447ae76fd5ca53932970fe3053f/pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d", size = 77145 },
]
[[package]]
name = "pyasn1-modules"
version = "0.4.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pyasn1" },
]
sdist = { url = "https://files.pythonhosted.org/packages/1d/67/6afbf0d507f73c32d21084a79946bfcfca5fbc62a72057e9c23797a737c9/pyasn1_modules-0.4.1.tar.gz", hash = "sha256:c28e2dbf9c06ad61c71a075c7e0f9fd0f1b0bb2d2ad4377f240d33ac2ab60a7c", size = 310028 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/77/89/bc88a6711935ba795a679ea6ebee07e128050d6382eaa35a0a47c8032bdc/pyasn1_modules-0.4.1-py3-none-any.whl", hash = "sha256:49bfa96b45a292b711e986f222502c1c9a5e1f4e568fc30e2574a6c7d07838fd", size = 181537 },
]
[[package]]
name = "pycparser"
version = "2.22"
@ -2875,9 +2849,9 @@ source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pytest" },
]
sdist = { url = "https://files.pythonhosted.org/packages/ac/82/4c9ecabab13363e72d880f2fb504c5f750433b2b6f16e99f4ec21ada284c/pytest_timeout-2.4.0.tar.gz", hash = "sha256:7e68e90b01f9eff71332b25001f85c75495fc4e3a836701876183c4bcfd0540a", size = 17973, upload-time = "2025-05-05T19:44:34.99Z" }
sdist = { url = "https://files.pythonhosted.org/packages/ac/82/4c9ecabab13363e72d880f2fb504c5f750433b2b6f16e99f4ec21ada284c/pytest_timeout-2.4.0.tar.gz", hash = "sha256:7e68e90b01f9eff71332b25001f85c75495fc4e3a836701876183c4bcfd0540a", size = 17973 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/fa/b6/3127540ecdf1464a00e5a01ee60a1b09175f6913f0644ac748494d9c4b21/pytest_timeout-2.4.0-py3-none-any.whl", hash = "sha256:c42667e5cdadb151aeb5b26d114aff6bdf5a907f176a007a30b940d3d865b5c2", size = 14382, upload-time = "2025-05-05T19:44:33.502Z" },
{ url = "https://files.pythonhosted.org/packages/fa/b6/3127540ecdf1464a00e5a01ee60a1b09175f6913f0644ac748494d9c4b21/pytest_timeout-2.4.0-py3-none-any.whl", hash = "sha256:c42667e5cdadb151aeb5b26d114aff6bdf5a907f176a007a30b940d3d865b5c2", size = 14382 },
]
[[package]]
@ -3256,19 +3230,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928 },
]
[[package]]
name = "requests-oauthlib"
version = "2.0.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "oauthlib" },
{ name = "requests" },
]
sdist = { url = "https://files.pythonhosted.org/packages/42/f2/05f29bc3913aea15eb670be136045bf5c5bbf4b99ecb839da9b422bb2c85/requests-oauthlib-2.0.0.tar.gz", hash = "sha256:b3dffaebd884d8cd778494369603a9e7b58d29111bf6b41bdc2dcd87203af4e9", size = 55650 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/3b/5d/63d4ae3b9daea098d5d6f5da83984853c1bbacd5dc826764b249fe119d24/requests_oauthlib-2.0.0-py2.py3-none-any.whl", hash = "sha256:7dd8a5c40426b779b0868c404bdef9768deccf22749cde15852df527e6269b36", size = 24179 },
]
[[package]]
name = "rich"
version = "13.9.4"
@ -3597,6 +3558,21 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/c6/43/65c0acbd8cc6f50195a3a1fc195c404988b15c67090e73c7a41a9f57d6bd/sphinx_design-0.6.1-py3-none-any.whl", hash = "sha256:b11f37db1a802a183d61b159d9a202314d4d2fe29c163437001324fe2f19549c", size = 2215338 },
]
[[package]]
name = "sphinx-mdinclude"
version = "0.6.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "docutils" },
{ name = "mistune" },
{ name = "pygments" },
{ name = "sphinx" },
]
sdist = { url = "https://files.pythonhosted.org/packages/b6/a7/c9a7888bb2187fdb06955d71e75f6f266b7e179b356ac76138d160a5b7eb/sphinx_mdinclude-0.6.2.tar.gz", hash = "sha256:447462e82cb8be61404a2204227f920769eb923d2f57608e3325f3bb88286b4c", size = 65257 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/42/3d/6b41fe1637cd53c4b10d56e0e6f396546f837973dabf9c4b2a1de44620ac/sphinx_mdinclude-0.6.2-py3-none-any.whl", hash = "sha256:648e78edb067c0e4bffc22943278d49d54a0714494743592032fa3ad82a86984", size = 16911 },
]
[[package]]
name = "sphinx-rtd-dark-mode"
version = "1.3.0"
@ -3664,6 +3640,19 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/0a/7b/18a8c0bcec9182c05a0b3ec2a776bba4ead82750a55ff798e8d406dae604/sphinxcontrib_htmlhelp-2.1.0-py3-none-any.whl", hash = "sha256:166759820b47002d22914d64a075ce08f4c46818e17cfc9470a9786b759b19f8", size = 98705 },
]
[[package]]
name = "sphinxcontrib-httpdomain"
version = "1.8.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "six" },
{ name = "sphinx" },
]
sdist = { url = "https://files.pythonhosted.org/packages/be/ef/82d3cfafb7febce4f7df8dcf3cde9d072350b41066e05a4f559b4e9105d0/sphinxcontrib-httpdomain-1.8.1.tar.gz", hash = "sha256:6c2dfe6ca282d75f66df333869bb0ce7331c01b475db6809ff9d107b7cdfe04b", size = 19266 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/02/49/aad47b8cf27a0d7703f1311aad8c368bb22866ddee1a2d2cd3f69bc45e0c/sphinxcontrib_httpdomain-1.8.1-py2.py3-none-any.whl", hash = "sha256:21eefe1270e4d9de8d717cc89ee92cc4871b8736774393bafc5e38a6bb77b1d5", size = 25513 },
]
[[package]]
name = "sphinxcontrib-jquery"
version = "4.1"
@ -3698,6 +3687,24 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/cd/c8/784b9ac6ea08aa594c1a4becbd0dbe77186785362e31fd633b8c6ae0197a/sphinxcontrib_mermaid-1.0.0-py3-none-any.whl", hash = "sha256:60b72710ea02087f212028feb09711225fbc2e343a10d34822fe787510e1caa3", size = 9597 },
]
[[package]]
name = "sphinxcontrib-openapi"
version = "0.8.4"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "deepmerge" },
{ name = "jsonschema" },
{ name = "picobox" },
{ name = "pyyaml" },
{ name = "sphinx" },
{ name = "sphinx-mdinclude" },
{ name = "sphinxcontrib-httpdomain" },
]
sdist = { url = "https://files.pythonhosted.org/packages/c0/a7/66a5c9aba7dbbb0c2b050f60e71402818cbf5f127ace13ed971029cc745e/sphinxcontrib-openapi-0.8.4.tar.gz", hash = "sha256:df883808a5b5e4b4113ad697185c43a3f42df3dce70453af78ba7076907e9a20", size = 71848 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d5/c3/ee00486f38d78309a60ee0d6031b2545b22ac5f0007d841dd174abc68774/sphinxcontrib_openapi-0.8.4-py3-none-any.whl", hash = "sha256:50911c18d452d9390ee3a384ef8dc8bde6135f542ba55691f81e1fbc0b71014e", size = 34510 },
]
[[package]]
name = "sphinxcontrib-qthelp"
version = "2.0.0"
@ -4323,15 +4330,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859", size = 34166 },
]
[[package]]
name = "websocket-client"
version = "1.8.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/e6/30/fba0d96b4b5fbf5948ed3f4681f7da2f9f64512e1d303f94b4cc174c24a5/websocket_client-1.8.0.tar.gz", hash = "sha256:3239df9f44da632f96012472805d40a23281a991027ce11d2f45a6f24ac4c3da", size = 54648 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/5a/84/44687a29792a70e111c5c477230a72c4b957d88d16141199bf9acb7537a3/websocket_client-1.8.0-py3-none-any.whl", hash = "sha256:17b44cc997f5c498e809b22cdf2d9c7a9e71c02c8cc2b6c56e7c2d1239bfa526", size = 58826 },
]
[[package]]
name = "websockets"
version = "15.0"