Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-07-20 11:47:00 +00:00)

Merge branch 'main' into nvidia-e2e-notebook

Commit 012dd6891f: 96 changed files with 4675 additions and 426 deletions
136  .github/workflows/integration-auth-tests.yml  (vendored, new file)
@@ -0,0 +1,136 @@
name: Integration Auth Tests

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
    paths:
      - 'distributions/**'
      - 'llama_stack/**'
      - 'tests/integration/**'
      - 'uv.lock'
      - 'pyproject.toml'
      - 'requirements.txt'
      - '.github/workflows/integration-auth-tests.yml' # This workflow

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  test-matrix:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        auth-provider: [kubernetes]
      fail-fast: false # we want to run all tests regardless of failure

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Install uv
        uses: astral-sh/setup-uv@v5
        with:
          python-version: "3.10"

      - name: Set Up Environment and Install Dependencies
        run: |
          uv sync --extra dev --extra test
          uv pip install -e .
          llama stack build --template ollama --image-type venv

      - name: Install minikube
        if: ${{ matrix.auth-provider == 'kubernetes' }}
        uses: medyagh/setup-minikube@latest

      - name: Start minikube
        if: ${{ matrix.auth-provider == 'kubernetes' }}
        run: |
          minikube start
          kubectl get pods -A

      - name: Configure Kube Auth
        if: ${{ matrix.auth-provider == 'kubernetes' }}
        run: |
          kubectl create namespace llama-stack
          kubectl create serviceaccount llama-stack-auth -n llama-stack
          kubectl create rolebinding llama-stack-auth-rolebinding --clusterrole=admin --serviceaccount=llama-stack:llama-stack-auth -n llama-stack
          kubectl create token llama-stack-auth -n llama-stack > llama-stack-auth-token

      - name: Set Kubernetes Config
        if: ${{ matrix.auth-provider == 'kubernetes' }}
        run: |
          echo "KUBERNETES_API_SERVER_URL=$(kubectl config view --minify -o jsonpath='{.clusters[0].cluster.server}')" >> $GITHUB_ENV
          echo "KUBERNETES_CA_CERT_PATH=$(kubectl config view --minify -o jsonpath='{.clusters[0].cluster.certificate-authority}')" >> $GITHUB_ENV

      - name: Set Kube Auth Config and run server
        env:
          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
        if: ${{ matrix.auth-provider == 'kubernetes' }}
        run: |
          run_dir=$(mktemp -d)
          cat <<'EOF' > $run_dir/run.yaml
          version: '2'
          image_name: kube
          apis:
          - agents
          - datasetio
          - eval
          - inference
          - safety
          - scoring
          - telemetry
          - tool_runtime
          - vector_io
          providers:
            agents:
            - provider_id: meta-reference
              provider_type: inline::meta-reference
              config:
                persistence_store:
                  type: sqlite
                  namespace: null
                  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/agents_store.db
            telemetry:
            - provider_id: meta-reference
              provider_type: inline::meta-reference
              config:
                service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
                sinks: ${env.TELEMETRY_SINKS:console,sqlite}
                sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/ollama/trace_store.db}
          server:
            port: 8321
          EOF
          yq eval '.server.auth = {"provider_type": "${{ matrix.auth-provider }}"}' -i $run_dir/run.yaml
          yq eval '.server.auth.config = {"api_server_url": "${{ env.KUBERNETES_API_SERVER_URL }}", "ca_cert_path": "${{ env.KUBERNETES_CA_CERT_PATH }}"}' -i $run_dir/run.yaml
          cat $run_dir/run.yaml

          source .venv/bin/activate
          nohup uv run llama stack run $run_dir/run.yaml --image-type venv > server.log 2>&1 &

      - name: Wait for Llama Stack server to be ready
        run: |
          echo "Waiting for Llama Stack server..."
          for i in {1..30}; do
            if curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://localhost:8321/v1/health | grep -q "OK"; then
              echo "Llama Stack server is up!"
              if grep -q "Enabling authentication with provider: ${{ matrix.auth-provider }}" server.log; then
                echo "Llama Stack server is configured to use ${{ matrix.auth-provider }} auth"
                exit 0
              else
                echo "Llama Stack server is not configured to use ${{ matrix.auth-provider }} auth"
                cat server.log
                exit 1
              fi
            fi
            sleep 1
          done
          echo "Llama Stack server failed to start"
          cat server.log
          exit 1

      - name: Test auth
        run: |
          curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers|jq
24  .github/workflows/integration-tests.yml  (vendored)
@@ -33,19 +33,24 @@ jobs:
       uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

     - name: Install uv
-      uses: astral-sh/setup-uv@0c5e2b8115b80b4c7c5ddf6ffdd634974642d182 # v5.4.1
+      uses: astral-sh/setup-uv@c7f87aa956e4c323abf06d5dec078e358f6b4d04 # v6.0.0
       with:
         python-version: "3.10"
+        activate-environment: true

     - name: Install and start Ollama
       run: |
         # the ollama installer also starts the ollama service
         curl -fsSL https://ollama.com/install.sh | sh

-    - name: Pull Ollama image
+    # Do NOT cache models - pulling the cache is actually slower than just pulling the model.
+    # It takes ~45 seconds to pull the models from the cache and unpack it, but only 30 seconds to
+    # pull them directly.
+    # Maybe this is because the cache is being pulled at the same time by all the matrix jobs?
+    - name: Pull Ollama models (instruct and embed)
       run: |
+        # TODO: cache the model. OLLAMA_MODELS defaults to ~ollama/.ollama/models.
         ollama pull llama3.2:3b-instruct-fp16
+        ollama pull all-minilm:latest

     - name: Set Up Environment and Install Dependencies
       run: |

@@ -105,3 +110,16 @@ jobs:
         -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \
         --text-model="meta-llama/Llama-3.2-3B-Instruct" \
         --embedding-model=all-MiniLM-L6-v2
+
+    - name: Write ollama logs to file
+      run: |
+        sudo journalctl -u ollama.service > ollama.log
+
+    - name: Upload all logs to artifacts
+      if: always()
+      uses: actions/upload-artifact@v4
+      with:
+        name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}
+        path: |
+          *.log
+        retention-days: 1
59  .github/workflows/providers-build.yml  (vendored)
@@ -56,7 +56,7 @@ jobs:
         python-version: '3.10'

     - name: Install uv
-      uses: astral-sh/setup-uv@0c5e2b8115b80b4c7c5ddf6ffdd634974642d182 # v5.4.1
+      uses: astral-sh/setup-uv@c7f87aa956e4c323abf06d5dec078e358f6b4d04 # v6.0.0
       with:
         python-version: "3.10"

@@ -94,7 +94,7 @@ jobs:
         python-version: '3.10'

     - name: Install uv
-      uses: astral-sh/setup-uv@0c5e2b8115b80b4c7c5ddf6ffdd634974642d182 # v5.4.1
+      uses: astral-sh/setup-uv@c7f87aa956e4c323abf06d5dec078e358f6b4d04 # v6.0.0
       with:
         python-version: "3.10"

@@ -120,7 +120,7 @@ jobs:
         python-version: '3.10'

     - name: Install uv
-      uses: astral-sh/setup-uv@0c5e2b8115b80b4c7c5ddf6ffdd634974642d182 # v5.4.1
+      uses: astral-sh/setup-uv@c7f87aa956e4c323abf06d5dec078e358f6b4d04 # v6.0.0
       with:
         python-version: "3.10"

@@ -145,3 +145,56 @@ jobs:
           echo "Entrypoint is not correct"
           exit 1
         fi
+
+  build-ubi9-container-distribution:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Set up Python
+        uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0
+        with:
+          python-version: '3.10'
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@0c5e2b8115b80b4c7c5ddf6ffdd634974642d182 # v5.4.1
+        with:
+          python-version: "3.10"
+
+      - name: Install LlamaStack
+        run: |
+          uv venv
+          source .venv/bin/activate
+          uv pip install -e .
+
+      - name: Pin template to UBI9 base
+        run: |
+          yq -i '
+            .image_type = "container" |
+            .image_name = "ubi9-test" |
+            .distribution_spec.container_image = "registry.access.redhat.com/ubi9:latest"
+          ' llama_stack/templates/dev/build.yaml
+
+      - name: Build dev container (UBI9)
+        env:
+          USE_COPY_NOT_MOUNT: "true"
+          LLAMA_STACK_DIR: "."
+        run: |
+          uv run llama stack build --config llama_stack/templates/dev/build.yaml
+
+      - name: Inspect UBI9 image
+        run: |
+          IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
+          entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
+          echo "Entrypoint: $entrypoint"
+          if [ "$entrypoint" != "[python -m llama_stack.distribution.server.server --config /app/run.yaml]" ]; then
+            echo "Entrypoint is not correct"
+            exit 1
+          fi
+
+          echo "Checking /etc/os-release in $IMAGE_ID"
+          docker run --rm --entrypoint sh "$IMAGE_ID" -c \
+            'source /etc/os-release && echo "$ID"' \
+            | grep -qE '^(rhel|ubi)$' \
+            || { echo "Base image is not UBI 9!"; exit 1; }
53  .github/workflows/test-external-providers.yml  (vendored)
@@ -26,22 +26,10 @@ jobs:
       uses: actions/checkout@v4

     - name: Install uv
-      uses: astral-sh/setup-uv@v5
+      uses: astral-sh/setup-uv@v6
       with:
         python-version: "3.10"

-    - name: Install Ollama
-      run: |
-        curl -fsSL https://ollama.com/install.sh | sh
-
-    - name: Pull Ollama image
-      run: |
-        ollama pull llama3.2:3b-instruct-fp16
-
-    - name: Start Ollama in background
-      run: |
-        nohup ollama run llama3.2:3b-instruct-fp16 --keepalive=30m > ollama.log 2>&1 &
-
     - name: Set Up Environment and Install Dependencies
       run: |
         uv sync --extra dev --extra test

@@ -66,21 +54,6 @@ jobs:
       run: |
         USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external-provider/llama-stack-provider-ollama/custom-distro.yaml

-    - name: Wait for Ollama to start
-      run: |
-        echo "Waiting for Ollama..."
-        for i in {1..30}; do
-          if curl -s http://localhost:11434 | grep -q "Ollama is running"; then
-            echo "Ollama is running!"
-            exit 0
-          fi
-          sleep 1
-        done
-        echo "Ollama failed to start"
-        ollama ps
-        ollama.log
-        exit 1
-
     - name: Start Llama Stack server in background
       if: ${{ matrix.image-type }} == 'venv'
       env:

@@ -92,24 +65,14 @@ jobs:

     - name: Wait for Llama Stack server to be ready
       run: |
         echo "Waiting for Llama Stack server..."
         for i in {1..30}; do
-          if curl -s http://localhost:8321/v1/health | grep -q "OK"; then
-            echo "Llama Stack server is up!"
-            if grep -q "remote::custom_ollama from /tmp/providers.d/remote/inference/custom_ollama.yaml" server.log; then
-              echo "Llama Stack server is using custom Ollama provider"
-              exit 0
-            else
-              echo "Llama Stack server is not using custom Ollama provider"
-              exit 1
-            fi
-          fi
+          if ! grep -q "remote::custom_ollama from /tmp/providers.d/remote/inference/custom_ollama.yaml" server.log; then
+            echo "Waiting for Llama Stack server to load the provider..."
+            sleep 1
+          else
+            echo "Provider loaded"
+            exit 0
+          fi
         done
         echo "Llama Stack server failed to start"
         cat server.log
         echo "Provider failed to load"
         exit 1

     - name: run inference tests
       run: |
         uv run pytest -v tests/integration/inference/test_text_inference.py --stack-config="http://localhost:8321" --text-model="meta-llama/Llama-3.2-3B-Instruct" --embedding-model=all-MiniLM-L6-v2
2  .github/workflows/unit-tests.yml  (vendored)
@@ -37,7 +37,7 @@ jobs:
       with:
         python-version: ${{ matrix.python }}

-    - uses: astral-sh/setup-uv@0c5e2b8115b80b4c7c5ddf6ffdd634974642d182 # v5.4.1
+    - uses: astral-sh/setup-uv@c7f87aa956e4c323abf06d5dec078e358f6b4d04 # v6.0.0
       with:
         python-version: ${{ matrix.python }}
         enable-cache: false
2  .github/workflows/update-readthedocs.yml  (vendored)
@@ -41,7 +41,7 @@ jobs:
         python-version: '3.11'

     - name: Install the latest version of uv
-      uses: astral-sh/setup-uv@0c5e2b8115b80b4c7c5ddf6ffdd634974642d182 # v5.4.1
+      uses: astral-sh/setup-uv@c7f87aa956e4c323abf06d5dec078e358f6b4d04 # v6.0.0

     - name: Sync with uv
       run: uv sync --extra docs
512  docs/_static/llama-stack-spec.html  (vendored)
@@ -497,6 +497,54 @@
        }
      }
    },
    "/v1/openai/v1/responses": {
      "post": {
        "responses": {
          "200": {
            "description": "Runtime representation of an annotated type.",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/OpenAIResponseObject"
                }
              },
              "text/event-stream": {
                "schema": {
                  "$ref": "#/components/schemas/OpenAIResponseObjectStream"
                }
              }
            }
          },
          "400": {
            "$ref": "#/components/responses/BadRequest400"
          },
          "429": {
            "$ref": "#/components/responses/TooManyRequests429"
          },
          "500": {
            "$ref": "#/components/responses/InternalServerError500"
          },
          "default": {
            "$ref": "#/components/responses/DefaultError"
          }
        },
        "tags": [
          "Agents"
        ],
        "description": "Create a new OpenAI response.",
        "parameters": [],
        "requestBody": {
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/CreateOpenaiResponseRequest"
              }
            }
          },
          "required": true
        }
      }
    },
    "/v1/files": {
      "get": {
        "responses": {

@@ -1278,6 +1326,49 @@
        ]
      }
    },
    "/v1/openai/v1/responses/{id}": {
      "get": {
        "responses": {
          "200": {
            "description": "An OpenAIResponseObject.",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/OpenAIResponseObject"
                }
              }
            }
          },
          "400": {
            "$ref": "#/components/responses/BadRequest400"
          },
          "429": {
            "$ref": "#/components/responses/TooManyRequests429"
          },
          "500": {
            "$ref": "#/components/responses/InternalServerError500"
          },
          "default": {
            "$ref": "#/components/responses/DefaultError"
          }
        },
        "tags": [
          "Agents"
        ],
        "description": "Retrieve an OpenAI response by its ID.",
        "parameters": [
          {
            "name": "id",
            "in": "path",
            "description": "The ID of the OpenAI response to retrieve.",
            "required": true,
            "schema": {
              "type": "string"
            }
          }
        ]
      }
    },
    "/v1/scoring-functions/{scoring_fn_id}": {
      "get": {
        "responses": {

@@ -6192,6 +6283,427 @@
      ],
      "title": "AgentTurnResponseTurnStartPayload"
    },
    "OpenAIResponseInputMessage": {
      "type": "object",
      "properties": {
        "content": {
          "oneOf": [
            { "type": "string" },
            {
              "type": "array",
              "items": { "$ref": "#/components/schemas/OpenAIResponseInputMessageContent" }
            }
          ]
        },
        "role": {
          "oneOf": [
            { "type": "string", "const": "system" },
            { "type": "string", "const": "developer" },
            { "type": "string", "const": "user" },
            { "type": "string", "const": "assistant" }
          ]
        },
        "type": { "type": "string", "const": "message", "default": "message" }
      },
      "additionalProperties": false,
      "required": ["content", "role"],
      "title": "OpenAIResponseInputMessage"
    },
    "OpenAIResponseInputMessageContent": {
      "oneOf": [
        { "$ref": "#/components/schemas/OpenAIResponseInputMessageContentText" },
        { "$ref": "#/components/schemas/OpenAIResponseInputMessageContentImage" }
      ],
      "discriminator": {
        "propertyName": "type",
        "mapping": {
          "input_text": "#/components/schemas/OpenAIResponseInputMessageContentText",
          "input_image": "#/components/schemas/OpenAIResponseInputMessageContentImage"
        }
      }
    },
    "OpenAIResponseInputMessageContentImage": {
      "type": "object",
      "properties": {
        "detail": {
          "oneOf": [
            { "type": "string", "const": "low" },
            { "type": "string", "const": "high" },
            { "type": "string", "const": "auto" }
          ],
          "default": "auto"
        },
        "type": { "type": "string", "const": "input_image", "default": "input_image" },
        "image_url": { "type": "string" }
      },
      "additionalProperties": false,
      "required": ["detail", "type"],
      "title": "OpenAIResponseInputMessageContentImage"
    },
    "OpenAIResponseInputMessageContentText": {
      "type": "object",
      "properties": {
        "text": { "type": "string" },
        "type": { "type": "string", "const": "input_text", "default": "input_text" }
      },
      "additionalProperties": false,
      "required": ["text", "type"],
      "title": "OpenAIResponseInputMessageContentText"
    },
    "OpenAIResponseInputTool": {
      "type": "object",
      "properties": {
        "type": {
          "oneOf": [
            { "type": "string", "const": "web_search" },
            { "type": "string", "const": "web_search_preview_2025_03_11" }
          ],
          "default": "web_search"
        },
        "search_context_size": { "type": "string", "default": "medium" }
      },
      "additionalProperties": false,
      "required": ["type"],
      "title": "OpenAIResponseInputToolWebSearch"
    },
    "CreateOpenaiResponseRequest": {
      "type": "object",
      "properties": {
        "input": {
          "oneOf": [
            { "type": "string" },
            {
              "type": "array",
              "items": { "$ref": "#/components/schemas/OpenAIResponseInputMessage" }
            }
          ],
          "description": "Input message(s) to create the response."
        },
        "model": {
          "type": "string",
          "description": "The underlying LLM used for completions."
        },
        "previous_response_id": {
          "type": "string",
          "description": "(Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses."
        },
        "store": { "type": "boolean" },
        "stream": { "type": "boolean" },
        "tools": {
          "type": "array",
          "items": { "$ref": "#/components/schemas/OpenAIResponseInputTool" }
        }
      },
      "additionalProperties": false,
      "required": ["input", "model"],
      "title": "CreateOpenaiResponseRequest"
    },
    "OpenAIResponseError": {
      "type": "object",
      "properties": {
        "code": { "type": "string" },
        "message": { "type": "string" }
      },
      "additionalProperties": false,
      "required": ["code", "message"],
      "title": "OpenAIResponseError"
    },
    "OpenAIResponseObject": {
      "type": "object",
      "properties": {
        "created_at": { "type": "integer" },
        "error": { "$ref": "#/components/schemas/OpenAIResponseError" },
        "id": { "type": "string" },
        "model": { "type": "string" },
        "object": { "type": "string", "const": "response", "default": "response" },
        "output": {
          "type": "array",
          "items": { "$ref": "#/components/schemas/OpenAIResponseOutput" }
        },
        "parallel_tool_calls": { "type": "boolean", "default": false },
        "previous_response_id": { "type": "string" },
        "status": { "type": "string" },
        "temperature": { "type": "number" },
        "top_p": { "type": "number" },
        "truncation": { "type": "string" },
        "user": { "type": "string" }
      },
      "additionalProperties": false,
      "required": ["created_at", "id", "model", "object", "output", "parallel_tool_calls", "status"],
      "title": "OpenAIResponseObject"
    },
    "OpenAIResponseOutput": {
      "oneOf": [
        { "$ref": "#/components/schemas/OpenAIResponseOutputMessage" },
        { "$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall" }
      ],
      "discriminator": {
        "propertyName": "type",
        "mapping": {
          "message": "#/components/schemas/OpenAIResponseOutputMessage",
          "web_search_call": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall"
        }
      }
    },
    "OpenAIResponseOutputMessage": {
      "type": "object",
      "properties": {
        "id": { "type": "string" },
        "content": {
          "type": "array",
          "items": { "$ref": "#/components/schemas/OpenAIResponseOutputMessageContent" }
        },
        "role": { "type": "string", "const": "assistant", "default": "assistant" },
        "status": { "type": "string" },
        "type": { "type": "string", "const": "message", "default": "message" }
      },
      "additionalProperties": false,
      "required": ["id", "content", "role", "status", "type"],
      "title": "OpenAIResponseOutputMessage"
    },
    "OpenAIResponseOutputMessageContent": {
      "type": "object",
      "properties": {
        "text": { "type": "string" },
        "type": { "type": "string", "const": "output_text", "default": "output_text" }
      },
      "additionalProperties": false,
      "required": ["text", "type"],
      "title": "OpenAIResponseOutputMessageContentOutputText"
    },
    "OpenAIResponseOutputMessageWebSearchToolCall": {
      "type": "object",
      "properties": {
        "id": { "type": "string" },
        "status": { "type": "string" },
        "type": { "type": "string", "const": "web_search_call", "default": "web_search_call" }
      },
      "additionalProperties": false,
      "required": ["id", "status", "type"],
      "title": "OpenAIResponseOutputMessageWebSearchToolCall"
    },
    "OpenAIResponseObjectStream": {
      "oneOf": [
        { "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseCreated" },
        { "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted" }
      ],
      "discriminator": {
        "propertyName": "type",
        "mapping": {
          "response.created": "#/components/schemas/OpenAIResponseObjectStreamResponseCreated",
          "response.completed": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted"
        }
      }
    },
    "OpenAIResponseObjectStreamResponseCompleted": {
      "type": "object",
      "properties": {
        "response": { "$ref": "#/components/schemas/OpenAIResponseObject" },
        "type": { "type": "string", "const": "response.completed", "default": "response.completed" }
      },
      "additionalProperties": false,
      "required": ["response", "type"],
      "title": "OpenAIResponseObjectStreamResponseCompleted"
    },
    "OpenAIResponseObjectStreamResponseCreated": {
      "type": "object",
      "properties": {
        "response": { "$ref": "#/components/schemas/OpenAIResponseObject" },
        "type": { "type": "string", "const": "response.created", "default": "response.created" }
      },
      "additionalProperties": false,
      "required": ["response", "type"],
      "title": "OpenAIResponseObjectStreamResponseCreated"
    },
    "CreateUploadSessionRequest": {
      "type": "object",
      "properties": {
350  docs/_static/llama-stack-spec.yaml  (vendored)
@@ -330,6 +330,39 @@ paths:
            schema:
              $ref: '#/components/schemas/CreateAgentTurnRequest'
        required: true
  /v1/openai/v1/responses:
    post:
      responses:
        '200':
          description: >-
            Runtime representation of an annotated type.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/OpenAIResponseObject'
            text/event-stream:
              schema:
                $ref: '#/components/schemas/OpenAIResponseObjectStream'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Agents
      description: Create a new OpenAI response.
      parameters: []
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CreateOpenaiResponseRequest'
        required: true
  /v1/files:
    get:
      responses:

@@ -875,6 +908,36 @@ paths:
          required: true
          schema:
            type: string
  /v1/openai/v1/responses/{id}:
    get:
      responses:
        '200':
          description: An OpenAIResponseObject.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/OpenAIResponseObject'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Agents
      description: Retrieve an OpenAI response by its ID.
      parameters:
        - name: id
          in: path
          description: >-
            The ID of the OpenAI response to retrieve.
          required: true
          schema:
            type: string
  /v1/scoring-functions/{scoring_fn_id}:
    get:
      responses:

@@ -4329,6 +4392,293 @@ components:
        - event_type
        - turn_id
      title: AgentTurnResponseTurnStartPayload
    OpenAIResponseInputMessage:
      type: object
      properties:
        content:
          oneOf:
            - type: string
            - type: array
              items:
                $ref: '#/components/schemas/OpenAIResponseInputMessageContent'
        role:
          oneOf:
            - type: string
              const: system
            - type: string
              const: developer
            - type: string
              const: user
            - type: string
              const: assistant
        type:
          type: string
          const: message
          default: message
      additionalProperties: false
      required:
        - content
        - role
      title: OpenAIResponseInputMessage
    OpenAIResponseInputMessageContent:
      oneOf:
        - $ref: '#/components/schemas/OpenAIResponseInputMessageContentText'
        - $ref: '#/components/schemas/OpenAIResponseInputMessageContentImage'
      discriminator:
        propertyName: type
        mapping:
          input_text: '#/components/schemas/OpenAIResponseInputMessageContentText'
          input_image: '#/components/schemas/OpenAIResponseInputMessageContentImage'
    OpenAIResponseInputMessageContentImage:
      type: object
      properties:
        detail:
          oneOf:
            - type: string
              const: low
            - type: string
              const: high
            - type: string
              const: auto
          default: auto
        type:
          type: string
          const: input_image
          default: input_image
        image_url:
          type: string
      additionalProperties: false
      required:
        - detail
        - type
      title: OpenAIResponseInputMessageContentImage
    OpenAIResponseInputMessageContentText:
      type: object
      properties:
        text:
          type: string
        type:
          type: string
          const: input_text
          default: input_text
      additionalProperties: false
      required:
        - text
        - type
      title: OpenAIResponseInputMessageContentText
    OpenAIResponseInputTool:
      type: object
      properties:
        type:
          oneOf:
            - type: string
              const: web_search
            - type: string
              const: web_search_preview_2025_03_11
          default: web_search
        search_context_size:
          type: string
          default: medium
      additionalProperties: false
      required:
        - type
      title: OpenAIResponseInputToolWebSearch
    CreateOpenaiResponseRequest:
      type: object
      properties:
        input:
          oneOf:
            - type: string
            - type: array
              items:
                $ref: '#/components/schemas/OpenAIResponseInputMessage'
          description: Input message(s) to create the response.
        model:
          type: string
          description: The underlying LLM used for completions.
        previous_response_id:
          type: string
          description: >-
            (Optional) if specified, the new response will be a continuation of the
            previous response. This can be used to easily fork-off new responses from
            existing responses.
        store:
          type: boolean
        stream:
          type: boolean
        tools:
          type: array
          items:
            $ref: '#/components/schemas/OpenAIResponseInputTool'
      additionalProperties: false
      required:
        - input
        - model
      title: CreateOpenaiResponseRequest
    OpenAIResponseError:
      type: object
      properties:
        code:
          type: string
        message:
          type: string
      additionalProperties: false
      required:
        - code
        - message
      title: OpenAIResponseError
    OpenAIResponseObject:
      type: object
      properties:
        created_at:
          type: integer
        error:
          $ref: '#/components/schemas/OpenAIResponseError'
        id:
          type: string
        model:
          type: string
        object:
          type: string
          const: response
          default: response
        output:
          type: array
          items:
            $ref: '#/components/schemas/OpenAIResponseOutput'
        parallel_tool_calls:
          type: boolean
          default: false
        previous_response_id:
          type: string
        status:
          type: string
        temperature:
          type: number
        top_p:
          type: number
        truncation:
          type: string
        user:
          type: string
      additionalProperties: false
      required:
        - created_at
        - id
        - model
        - object
        - output
        - parallel_tool_calls
        - status
      title: OpenAIResponseObject
    OpenAIResponseOutput:
      oneOf:
        - $ref: '#/components/schemas/OpenAIResponseOutputMessage'
        - $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall'
      discriminator:
        propertyName: type
        mapping:
          message: '#/components/schemas/OpenAIResponseOutputMessage'
          web_search_call: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall'
    OpenAIResponseOutputMessage:
      type: object
      properties:
        id:
          type: string
        content:
          type: array
          items:
            $ref: '#/components/schemas/OpenAIResponseOutputMessageContent'
        role:
          type: string
          const: assistant
          default: assistant
        status:
          type: string
        type:
          type: string
          const: message
          default: message
      additionalProperties: false
      required:
        - id
        - content
        - role
        - status
        - type
      title: OpenAIResponseOutputMessage
    OpenAIResponseOutputMessageContent:
      type: object
      properties:
        text:
          type: string
        type:
          type: string
          const: output_text
          default: output_text
      additionalProperties: false
      required:
        - text
        - type
      title: >-
        OpenAIResponseOutputMessageContentOutputText
    "OpenAIResponseOutputMessageWebSearchToolCall":
      type: object
      properties:
        id:
          type: string
        status:
          type: string
        type:
          type: string
          const: web_search_call
          default: web_search_call
      additionalProperties: false
      required:
        - id
        - status
        - type
      title: >-
        OpenAIResponseOutputMessageWebSearchToolCall
    OpenAIResponseObjectStream:
      oneOf:
        - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated'
        - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted'
      discriminator:
        propertyName: type
        mapping:
          response.created: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated'
          response.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted'
    "OpenAIResponseObjectStreamResponseCompleted":
      type: object
      properties:
        response:
          $ref: '#/components/schemas/OpenAIResponseObject'
        type:
          type: string
          const: response.completed
          default: response.completed
      additionalProperties: false
      required:
        - response
        - type
      title: >-
        OpenAIResponseObjectStreamResponseCompleted
    "OpenAIResponseObjectStreamResponseCreated":
      type: object
      properties:
        response:
          $ref: '#/components/schemas/OpenAIResponseObject'
        type:
          type: string
          const: response.created
          default: response.created
      additionalProperties: false
      required:
        - response
        - type
      title: >-
        OpenAIResponseObjectStreamResponseCreated
    CreateUploadSessionRequest:
      type: object
      properties:
907  docs/getting_started_llama_api.ipynb  (new file)

File diff suppressed because one or more lines are too long
@@ -179,7 +179,7 @@ class ContentBuilder:
     "Creates the content subtree for a request or response."

     def is_iterator_type(t):
-        return "StreamChunk" in str(t)
+        return "StreamChunk" in str(t) or "OpenAIResponseObjectStream" in str(t)

     def get_media_type(t):
         if is_generic_list(t):
@@ -53,6 +53,13 @@ models:
   provider_id: ollama
   provider_model_id: null
 shields: []
+server:
+  port: 8321
+  auth:
+    provider_type: "kubernetes"
+    config:
+      api_server_url: "https://kubernetes.default.svc"
+      ca_cert_path: "/path/to/ca.crt"
 ```

 Let's break this down into the different sections. The first section specifies the set of APIs that the stack server will serve:
@@ -102,6 +109,105 @@ A Model is an instance of a "Resource" (see [Concepts](../concepts/index)) and i

What's with the `provider_model_id` field? This is an identifier for the model inside the provider's model catalog. Contrast it with `model_id` which is the identifier for the same model for Llama Stack's purposes. For example, you may want to name "llama3.2:vision-11b" as "image_captioning_model" when you use it in your Stack interactions. When omitted, the server will set `provider_model_id` to be the same as `model_id`.

## Server Configuration

The `server` section configures the HTTP server that serves the Llama Stack APIs:

```yaml
server:
  port: 8321  # Port to listen on (default: 8321)
  tls_certfile: "/path/to/cert.pem"  # Optional: Path to TLS certificate for HTTPS
  tls_keyfile: "/path/to/key.pem"    # Optional: Path to TLS key for HTTPS
  auth:  # Optional: Authentication configuration
    provider_type: "kubernetes"  # Type of auth provider
    config:  # Provider-specific configuration
      api_server_url: "https://kubernetes.default.svc"
      ca_cert_path: "/path/to/ca.crt"  # Optional: Path to CA certificate
```

### Authentication Configuration

The `auth` section configures authentication for the server. When configured, all API requests must include a valid Bearer token in the Authorization header:

```
Authorization: Bearer <token>
```

The server supports multiple authentication providers:

#### Kubernetes Provider

The Kubernetes cluster must be configured to use a service account for authentication.

```bash
kubectl create namespace llama-stack
kubectl create serviceaccount llama-stack-auth -n llama-stack
kubectl create rolebinding llama-stack-auth-rolebinding --clusterrole=admin --serviceaccount=llama-stack:llama-stack-auth -n llama-stack
kubectl create token llama-stack-auth -n llama-stack > llama-stack-auth-token
```

The provider validates tokens against the Kubernetes API server:

```yaml
server:
  auth:
    provider_type: "kubernetes"
    config:
      api_server_url: "https://kubernetes.default.svc"  # URL of the Kubernetes API server
      ca_cert_path: "/path/to/ca.crt"  # Optional: Path to CA certificate
```

The provider extracts user information from the JWT token:
- Username from the `sub` claim becomes a role
- Kubernetes groups become teams

You can easily validate a request by running:

```bash
curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers
```
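Under the hood, this style of validation is typically done with the Kubernetes TokenReview API. The sketch below is illustrative only (the function name, error handling, and return shape are assumptions, not the provider's actual code):

```python
# Illustrative TokenReview-based check; the real provider implementation
# in llama-stack may differ in structure and error handling.
import httpx


async def validate_k8s_token(api_server_url: str, token: str, ca_cert_path: str | None = None) -> dict:
    """POST a TokenReview to the Kubernetes API server and return the user info."""
    review = {
        "apiVersion": "authentication.k8s.io/v1",
        "kind": "TokenReview",
        "spec": {"token": token},
    }
    async with httpx.AsyncClient(verify=ca_cert_path or True) as client:
        response = await client.post(
            f"{api_server_url}/apis/authentication.k8s.io/v1/tokenreviews",
            json=review,
            headers={"Authorization": f"Bearer {token}"},
        )
    response.raise_for_status()
    status = response.json()["status"]
    if not status.get("authenticated"):
        raise ValueError("Token rejected by the Kubernetes API server")
    # "user" contains "username" (the sub claim) and "groups".
    return status["user"]
```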
#### Custom Provider

Validates tokens against a custom authentication endpoint:

```yaml
server:
  auth:
    provider_type: "custom"
    config:
      endpoint: "https://auth.example.com/validate"  # URL of the auth endpoint
```

The custom endpoint receives a POST request with:

```json
{
  "api_key": "<token>",
  "request": {
    "path": "/api/v1/endpoint",
    "headers": {
      "content-type": "application/json",
      "user-agent": "curl/7.64.1"
    },
    "params": {
      "key": ["value"]
    }
  }
}
```

And must respond with:

```json
{
  "access_attributes": {
    "roles": ["admin", "user"],
    "teams": ["ml-team", "nlp-team"],
    "projects": ["llama-3", "project-x"],
    "namespaces": ["research"]
  },
  "message": "Authentication successful"
}
```

If no access attributes are returned, the token is used as a namespace.
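To make the contract concrete, here is a minimal sketch of a custom endpoint that satisfies it. This is not part of Llama Stack; FastAPI, the `/validate` route, and the in-memory token table are illustrative choices:

```python
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

app = FastAPI()

# Stand-in for a real token store or identity-provider lookup.
KNOWN_TOKENS = {
    "secret-token": {"roles": ["admin"], "teams": ["ml-team"]},
}


class AuthRequest(BaseModel):
    api_key: str
    request: dict  # path, headers, and params as described above


@app.post("/validate")
def validate(body: AuthRequest) -> dict:
    attributes = KNOWN_TOKENS.get(body.api_key)
    if attributes is None:
        # Any non-2xx response causes the middleware to reject the request.
        raise HTTPException(status_code=401, detail="Invalid token")
    return {"access_attributes": attributes, "message": "Authentication successful"}
```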
## Extending to handle Safety

Configuring Safety can be a little involved so it is instructive to go through an example.
@@ -38,6 +38,13 @@ from llama_stack.apis.safety import SafetyViolation
 from llama_stack.apis.tools import ToolDef
 from llama_stack.schema_utils import json_schema_type, register_schema, webmethod

+from .openai_responses import (
+    OpenAIResponseInputMessage,
+    OpenAIResponseInputTool,
+    OpenAIResponseObject,
+    OpenAIResponseObjectStream,
+)
+

 class Attachment(BaseModel):
     """An attachment to an agent turn.

@@ -593,3 +600,39 @@ class Agents(Protocol):
         :returns: A ListAgentSessionsResponse.
         """
         ...
+
+    # We situate the OpenAI Responses API in the Agents API just like we did things
+    # for Inference. The Responses API, in its intent, serves the same purpose as
+    # the Agents API above -- it is essentially a lightweight "agentic loop" with
+    # integrated tool calling.
+    #
+    # Both of these APIs are inherently stateful.
+
+    @webmethod(route="/openai/v1/responses/{id}", method="GET")
+    async def get_openai_response(
+        self,
+        id: str,
+    ) -> OpenAIResponseObject:
+        """Retrieve an OpenAI response by its ID.
+
+        :param id: The ID of the OpenAI response to retrieve.
+        :returns: An OpenAIResponseObject.
+        """
+        ...
+
+    @webmethod(route="/openai/v1/responses", method="POST")
+    async def create_openai_response(
+        self,
+        input: Union[str, List[OpenAIResponseInputMessage]],
+        model: str,
+        previous_response_id: Optional[str] = None,
+        store: Optional[bool] = True,
+        stream: Optional[bool] = False,
+        tools: Optional[List[OpenAIResponseInputTool]] = None,
+    ) -> Union[OpenAIResponseObject, AsyncIterator[OpenAIResponseObjectStream]]:
+        """Create a new OpenAI response.
+
+        :param input: Input message(s) to create the response.
+        :param model: The underlying LLM used for completions.
+        :param previous_response_id: (Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses.
+        """
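As a rough sketch of how the two routes compose, a direct HTTP caller could do the following. The host, port, and model name are placeholders taken from the workflows above, and a `stream: true` request would instead require consuming the `text/event-stream` body:

```python
import httpx

BASE = "http://localhost:8321/v1/openai/v1"

# POST /openai/v1/responses with the two required fields, "input" and "model".
created = httpx.post(
    f"{BASE}/responses",
    json={"model": "meta-llama/Llama-3.2-3B-Instruct", "input": "Say hello"},
).json()

# GET /openai/v1/responses/{id} retrieves the stored response by its ID.
fetched = httpx.get(f"{BASE}/responses/{created['id']}").json()
assert fetched["object"] == "response"
for output in fetched["output"]:
    print(output)
```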
140  llama_stack/apis/agents/openai_responses.py  (new file)
@@ -0,0 +1,140 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import List, Literal, Optional, Union

from pydantic import BaseModel, Field
from typing_extensions import Annotated

from llama_stack.schema_utils import json_schema_type, register_schema


@json_schema_type
class OpenAIResponseError(BaseModel):
    code: str
    message: str


@json_schema_type
class OpenAIResponseOutputMessageContentOutputText(BaseModel):
    text: str
    type: Literal["output_text"] = "output_text"


OpenAIResponseOutputMessageContent = Annotated[
    Union[OpenAIResponseOutputMessageContentOutputText,],
    Field(discriminator="type"),
]
register_schema(OpenAIResponseOutputMessageContent, name="OpenAIResponseOutputMessageContent")


@json_schema_type
class OpenAIResponseOutputMessage(BaseModel):
    id: str
    content: List[OpenAIResponseOutputMessageContent]
    role: Literal["assistant"] = "assistant"
    status: str
    type: Literal["message"] = "message"


@json_schema_type
class OpenAIResponseOutputMessageWebSearchToolCall(BaseModel):
    id: str
    status: str
    type: Literal["web_search_call"] = "web_search_call"


OpenAIResponseOutput = Annotated[
    Union[
        OpenAIResponseOutputMessage,
        OpenAIResponseOutputMessageWebSearchToolCall,
    ],
    Field(discriminator="type"),
]
register_schema(OpenAIResponseOutput, name="OpenAIResponseOutput")


@json_schema_type
class OpenAIResponseObject(BaseModel):
    created_at: int
    error: Optional[OpenAIResponseError] = None
    id: str
    model: str
    object: Literal["response"] = "response"
    output: List[OpenAIResponseOutput]
    parallel_tool_calls: bool = False
    previous_response_id: Optional[str] = None
    status: str
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    truncation: Optional[str] = None
    user: Optional[str] = None


@json_schema_type
class OpenAIResponseObjectStreamResponseCreated(BaseModel):
    response: OpenAIResponseObject
    type: Literal["response.created"] = "response.created"


@json_schema_type
class OpenAIResponseObjectStreamResponseCompleted(BaseModel):
    response: OpenAIResponseObject
    type: Literal["response.completed"] = "response.completed"


OpenAIResponseObjectStream = Annotated[
    Union[
        OpenAIResponseObjectStreamResponseCreated,
        OpenAIResponseObjectStreamResponseCompleted,
    ],
    Field(discriminator="type"),
]
register_schema(OpenAIResponseObjectStream, name="OpenAIResponseObjectStream")


@json_schema_type
class OpenAIResponseInputMessageContentText(BaseModel):
    text: str
    type: Literal["input_text"] = "input_text"


@json_schema_type
class OpenAIResponseInputMessageContentImage(BaseModel):
    detail: Literal["low"] | Literal["high"] | Literal["auto"] = "auto"
    type: Literal["input_image"] = "input_image"
    # TODO: handle file_id
    image_url: Optional[str] = None


# TODO: handle file content types
OpenAIResponseInputMessageContent = Annotated[
    Union[OpenAIResponseInputMessageContentText, OpenAIResponseInputMessageContentImage],
    Field(discriminator="type"),
]
register_schema(OpenAIResponseInputMessageContent, name="OpenAIResponseInputMessageContent")


@json_schema_type
class OpenAIResponseInputMessage(BaseModel):
    content: Union[str, List[OpenAIResponseInputMessageContent]]
    role: Literal["system"] | Literal["developer"] | Literal["user"] | Literal["assistant"]
    type: Optional[Literal["message"]] = "message"


@json_schema_type
class OpenAIResponseInputToolWebSearch(BaseModel):
    type: Literal["web_search"] | Literal["web_search_preview_2025_03_11"] = "web_search"
    # TODO: actually use search_context_size somewhere...
    search_context_size: Optional[str] = Field(default="medium", pattern="^low|medium|high$")
    # TODO: add user_location


OpenAIResponseInputTool = Annotated[
    Union[OpenAIResponseInputToolWebSearch,],
    Field(discriminator="type"),
]
register_schema(OpenAIResponseInputTool, name="OpenAIResponseInputTool")
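A quick illustration (not part of the commit) of how these discriminated unions behave in Pydantic v2: the `type` literal selects the concrete class when a dump is re-validated.

```python
from llama_stack.apis.agents.openai_responses import (
    OpenAIResponseObject,
    OpenAIResponseOutputMessage,
    OpenAIResponseOutputMessageContentOutputText,
)

msg = OpenAIResponseOutputMessage(
    id="msg_1",
    content=[OpenAIResponseOutputMessageContentOutputText(text="hello")],
    status="completed",
)
obj = OpenAIResponseObject(
    created_at=0,
    id="resp_1",
    model="example-model",
    output=[msg],
    status="completed",
)

# Round-tripping through a plain dict restores the same concrete output class,
# because Field(discriminator="type") dispatches on the "type" literal.
round_tripped = OpenAIResponseObject.model_validate(obj.model_dump())
assert isinstance(round_tripped.output[0], OpenAIResponseOutputMessage)
```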
@@ -460,15 +460,17 @@ def run_download_cmd(args: argparse.Namespace, parser: argparse.ArgumentParser):
     from llama_stack.models.llama.sku_list import llama_meta_net_info, resolve_model

     from .model.safety_models import (
-        prompt_guard_download_info,
-        prompt_guard_model_sku,
+        prompt_guard_download_info_map,
+        prompt_guard_model_sku_map,
     )

-    prompt_guard = prompt_guard_model_sku()
+    prompt_guard_model_sku_map = prompt_guard_model_sku_map()
+    prompt_guard_download_info_map = prompt_guard_download_info_map()
+
     for model_id in model_ids:
-        if model_id == prompt_guard.model_id:
-            model = prompt_guard
-            info = prompt_guard_download_info()
+        if model_id in prompt_guard_model_sku_map.keys():
+            model = prompt_guard_model_sku_map[model_id]
+            info = prompt_guard_download_info_map[model_id]
         else:
             model = resolve_model(model_id)
             if model is None:
@@ -36,11 +36,11 @@ class ModelDescribe(Subcommand):
         )

     def _run_model_describe_cmd(self, args: argparse.Namespace) -> None:
-        from .safety_models import prompt_guard_model_sku
+        from .safety_models import prompt_guard_model_sku_map

-        prompt_guard = prompt_guard_model_sku()
-        if args.model_id == prompt_guard.model_id:
-            model = prompt_guard
+        prompt_guard_model_map = prompt_guard_model_sku_map()
+        if args.model_id in prompt_guard_model_map.keys():
+            model = prompt_guard_model_map[args.model_id]
         else:
             model = resolve_model(args.model_id)
@@ -84,7 +84,7 @@ class ModelList(Subcommand):
         )

     def _run_model_list_cmd(self, args: argparse.Namespace) -> None:
-        from .safety_models import prompt_guard_model_sku
+        from .safety_models import prompt_guard_model_skus

         if args.downloaded:
             return _run_model_list_downloaded_cmd()

@@ -96,7 +96,7 @@ class ModelList(Subcommand):
         ]

         rows = []
-        for model in all_registered_models() + [prompt_guard_model_sku()]:
+        for model in all_registered_models() + prompt_guard_model_skus():
             if not args.show_all and not model.is_featured:
                 continue
@@ -42,11 +42,12 @@ class ModelRemove(Subcommand):
         )

     def _run_model_remove_cmd(self, args: argparse.Namespace) -> None:
-        from .safety_models import prompt_guard_model_sku
+        from .safety_models import prompt_guard_model_sku_map

-        prompt_guard = prompt_guard_model_sku()
-        if args.model == prompt_guard.model_id:
-            model = prompt_guard
+        prompt_guard_model_map = prompt_guard_model_sku_map()
+
+        if args.model in prompt_guard_model_map.keys():
+            model = prompt_guard_model_map[args.model]
         else:
             model = resolve_model(args.model)
@@ -15,11 +15,11 @@ from llama_stack.models.llama.sku_types import CheckpointQuantizationFormat
 class PromptGuardModel(BaseModel):
     """Make a 'fake' Model-like object for Prompt Guard. Eventually this will be removed."""

-    model_id: str = "Prompt-Guard-86M"
+    model_id: str
+    huggingface_repo: str
     description: str = "Prompt Guard. NOTE: this model will not be provided via `llama` CLI soon."
     is_featured: bool = False
-    huggingface_repo: str = "meta-llama/Prompt-Guard-86M"
-    max_seq_length: int = 2048
+    max_seq_length: int = 512
     is_instruct_model: bool = False
     quantization_format: CheckpointQuantizationFormat = CheckpointQuantizationFormat.bf16
     arch_args: Dict[str, Any] = Field(default_factory=dict)

@@ -30,13 +30,28 @@ class PromptGuardModel(BaseModel):
     model_config = ConfigDict(protected_namespaces=())


-def prompt_guard_model_sku():
-    return PromptGuardModel()
+def prompt_guard_model_skus():
+    return [
+        PromptGuardModel(model_id="Prompt-Guard-86M", huggingface_repo="meta-llama/Prompt-Guard-86M"),
+        PromptGuardModel(
+            model_id="Llama-Prompt-Guard-2-86M",
+            huggingface_repo="meta-llama/Llama-Prompt-Guard-2-86M",
+        ),
+        PromptGuardModel(
+            model_id="Llama-Prompt-Guard-2-22M",
+            huggingface_repo="meta-llama/Llama-Prompt-Guard-2-22M",
+        ),
+    ]


-def prompt_guard_download_info():
-    return LlamaDownloadInfo(
-        folder="Prompt-Guard",
+def prompt_guard_model_sku_map() -> Dict[str, Any]:
+    return {model.model_id: model for model in prompt_guard_model_skus()}
+
+
+def prompt_guard_download_info_map() -> Dict[str, LlamaDownloadInfo]:
+    return {
+        model.model_id: LlamaDownloadInfo(
+            folder="Prompt-Guard" if model.model_id == "Prompt-Guard-86M" else model.model_id,
             files=[
                 "model.safetensors",
                 "special_tokens_map.json",

@@ -45,3 +60,5 @@ def prompt_guard_download_info():
             ],
             pth_size=1,
         )
+        for model in prompt_guard_model_skus()
+    }
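The CLI changes above (download, describe, remove) all follow the same lookup pattern against these maps; schematically, and assuming the module path `llama_stack.cli.model.safety_models`:

```python
from llama_stack.cli.model.safety_models import (
    prompt_guard_download_info_map,
    prompt_guard_model_sku_map,
)

sku_map = prompt_guard_model_sku_map()
info_map = prompt_guard_download_info_map()

# Any of the three Prompt Guard SKUs resolves through the same dict lookup.
if "Llama-Prompt-Guard-2-86M" in sku_map:
    model = sku_map["Llama-Prompt-Guard-2-86M"]
    download_info = info_map[model.model_id]  # folder, files, pth_size
```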
@@ -4,6 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from enum import Enum
 from typing import Annotated, Any, Dict, List, Optional, Union

 from pydantic import BaseModel, Field

@@ -235,10 +236,21 @@ class LoggingConfig(BaseModel):
     )


+class AuthProviderType(str, Enum):
+    """Supported authentication provider types."""
+
+    KUBERNETES = "kubernetes"
+    CUSTOM = "custom"
+
+
 class AuthenticationConfig(BaseModel):
-    endpoint: str = Field(
+    provider_type: AuthProviderType = Field(
         ...,
-        description="Endpoint URL to validate authentication tokens",
+        description="Type of authentication provider (e.g., 'kubernetes', 'custom')",
     )
+    config: Dict[str, str] = Field(
+        ...,
+        description="Provider-specific configuration",
+    )
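For reference, this is the programmatic equivalent of the run.yaml `server.auth` block used in the workflow and docs above (values are placeholders, and the import path is assumed from the auth middleware's own imports):

```python
from llama_stack.distribution.datatypes import AuthenticationConfig, AuthProviderType

auth = AuthenticationConfig(
    provider_type=AuthProviderType.KUBERNETES,
    config={
        "api_server_url": "https://kubernetes.default.svc",
        "ca_cert_path": "/path/to/ca.crt",
    },
)
```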

@@ -5,74 +5,29 @@
# the root directory of this source tree.

import json
from typing import Dict, List, Optional
from urllib.parse import parse_qs

import httpx
from pydantic import BaseModel, Field

from llama_stack.distribution.datatypes import AccessAttributes
from llama_stack.distribution.server.auth_providers import AuthProviderConfig, create_auth_provider
from llama_stack.log import get_logger

logger = get_logger(name=__name__, category="auth")


class AuthRequestContext(BaseModel):
    path: str = Field(description="The path of the request being authenticated")

    headers: Dict[str, str] = Field(description="HTTP headers from the original request (excluding Authorization)")

    params: Dict[str, List[str]] = Field(
        description="Query parameters from the original request, parsed as dictionary of lists"
    )


class AuthRequest(BaseModel):
    api_key: str = Field(description="The API key extracted from the Authorization header")

    request: AuthRequestContext = Field(description="Context information about the request being authenticated")


class AuthResponse(BaseModel):
    """The format of the authentication response from the auth endpoint."""

    access_attributes: Optional[AccessAttributes] = Field(
        default=None,
        description="""
        Structured user attributes for attribute-based access control.

        These attributes determine which resources the user can access.
        The model provides standard categories like "roles", "teams", "projects", and "namespaces".
        Each attribute category contains a list of values that the user has for that category.
        During access control checks, these values are compared against resource requirements.

        Example with standard categories:
        ```json
        {
            "roles": ["admin", "data-scientist"],
            "teams": ["ml-team"],
            "projects": ["llama-3"],
            "namespaces": ["research"]
        }
        ```
        """,
    )

    message: Optional[str] = Field(
        default=None, description="Optional message providing additional context about the authentication result."
    )


class AuthenticationMiddleware:
    """Middleware that authenticates requests using an external auth endpoint.
    """Middleware that authenticates requests using configured authentication provider.

    This middleware:
    1. Extracts the Bearer token from the Authorization header
    2. Sends it to the configured auth endpoint along with request details
    3. Validates the response and extracts user attributes
    2. Uses the configured auth provider to validate the token
    3. Extracts user attributes from the provider's response
    4. Makes these attributes available to the route handlers for access control

    Authentication Request Format:
    The middleware supports multiple authentication providers through the AuthProvider interface:
    - Kubernetes: Validates tokens against the Kubernetes API server
    - Custom: Validates tokens against a custom endpoint

    Authentication Request Format for Custom Auth Provider:
    ```json
    {
        "api_key": "the-api-key-extracted-from-auth-header",

@@ -105,21 +60,26 @@ class AuthenticationMiddleware:
    }
    ```

    Token Validation:
    Each provider implements its own token validation logic:
    - Kubernetes: Uses TokenReview API to validate service account tokens
    - Custom: Sends token to custom endpoint for validation

    Attribute-Based Access Control:
    The attributes returned by the auth endpoint are used to determine which
    The attributes returned by the auth provider are used to determine which
    resources the user can access. Resources can specify required attributes
    using the access_attributes field. For a user to access a resource:

    1. All attribute categories specified in the resource must be present in the user's attributes
    2. For each category, the user must have at least one matching value

    If the auth endpoint doesn't return any attributes, the user will only be able to
    If the auth provider doesn't return any attributes, the user will only be able to
    access resources that don't have access_attributes defined.
    """
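The two rules above can be read as a small predicate. This is an illustrative sketch of the check described in the docstring, not the repo's implementation:

```python
from typing import Dict, List, Optional


def user_can_access(user_attrs: Dict[str, List[str]], required: Optional[Dict[str, List[str]]]) -> bool:
    """Sketch of the ABAC rule: every required category must be present, with an overlap."""
    if not required:
        # resources without access_attributes are accessible to any authenticated user
        return True
    for category, allowed_values in required.items():
        user_values = user_attrs.get(category, [])
        # rule 1: category must exist on the user; rule 2: at least one value must match
        if not any(v in user_values for v in allowed_values):
            return False
    return True


assert user_can_access({"roles": ["admin"]}, {"roles": ["admin", "ops"]})
assert not user_can_access({"roles": ["viewer"]}, {"teams": ["ml-team"]})
```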

    def __init__(self, app, auth_endpoint):
    def __init__(self, app, auth_config: AuthProviderConfig):
        self.app = app
        self.auth_endpoint = auth_endpoint
        self.auth_provider = create_auth_provider(auth_config)

    async def __call__(self, scope, receive, send):
        if scope["type"] == "http":
@@ -129,66 +89,34 @@
            if not auth_header or not auth_header.startswith("Bearer "):
                return await self._send_auth_error(send, "Missing or invalid Authorization header")

            api_key = auth_header.split("Bearer ", 1)[1]
            token = auth_header.split("Bearer ", 1)[1]

            path = scope.get("path", "")
            request_headers = {k.decode(): v.decode() for k, v in headers.items()}

            # Remove sensitive headers
            if "authorization" in request_headers:
                del request_headers["authorization"]

            query_string = scope.get("query_string", b"").decode()
            params = parse_qs(query_string)

            # Build the auth request model
            auth_request = AuthRequest(
                api_key=api_key,
                request=AuthRequestContext(
                    path=path,
                    headers=request_headers,
                    params=params,
                ),
            )

            # Validate with authentication endpoint
            # Validate token and get access attributes
            try:
                async with httpx.AsyncClient() as client:
                    response = await client.post(
                        self.auth_endpoint,
                        json=auth_request.model_dump(),
                        timeout=10.0,  # Add a reasonable timeout
                    )
                    if response.status_code != 200:
                        logger.warning(f"Authentication failed: {response.status_code}")
                        return await self._send_auth_error(send, "Authentication failed")

                    # Parse and validate the auth response
                    try:
                        response_data = response.json()
                        auth_response = AuthResponse(**response_data)

                        # Store attributes in request scope for access control
                        if auth_response.access_attributes:
                            user_attributes = auth_response.access_attributes.model_dump(exclude_none=True)
                        else:
                            logger.warning("No access attributes, setting namespace to api_key by default")
                            user_attributes = {
                                "namespaces": [api_key],
                            }

                        scope["user_attributes"] = user_attributes
                        logger.debug(f"Authentication successful: {len(user_attributes)} attributes")
                    except Exception:
                        logger.exception("Error parsing authentication response")
                        return await self._send_auth_error(send, "Invalid authentication response format")
                access_attributes = await self.auth_provider.validate_token(token, scope)
            except httpx.TimeoutException:
                logger.exception("Authentication request timed out")
                return await self._send_auth_error(send, "Authentication service timeout")
            except ValueError as e:
                logger.exception("Error during authentication")
                return await self._send_auth_error(send, str(e))
            except Exception:
                logger.exception("Error during authentication")
                return await self._send_auth_error(send, "Authentication service error")

            # Store attributes in request scope for access control
            if access_attributes:
                user_attributes = access_attributes.model_dump(exclude_none=True)
            else:
                logger.warning("No access attributes, setting namespace to token by default")
                user_attributes = {
                    "namespaces": [token],
                }

            # Store attributes in request scope
            scope["user_attributes"] = user_attributes
            logger.debug(f"Authentication successful: {len(scope['user_attributes'])} attributes")

        return await self.app(scope, receive, send)

    async def _send_auth_error(self, send, message):

262
llama_stack/distribution/server/auth_providers.py
Normal file

@@ -0,0 +1,262 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import json
from abc import ABC, abstractmethod
from enum import Enum
from typing import Dict, List, Optional
from urllib.parse import parse_qs

import httpx
from pydantic import BaseModel, Field

from llama_stack.distribution.datatypes import AccessAttributes
from llama_stack.log import get_logger

logger = get_logger(name=__name__, category="auth")


class AuthResponse(BaseModel):
    """The format of the authentication response from the auth endpoint."""

    access_attributes: Optional[AccessAttributes] = Field(
        default=None,
        description="""
        Structured user attributes for attribute-based access control.

        These attributes determine which resources the user can access.
        The model provides standard categories like "roles", "teams", "projects", and "namespaces".
        Each attribute category contains a list of values that the user has for that category.
        During access control checks, these values are compared against resource requirements.

        Example with standard categories:
        ```json
        {
            "roles": ["admin", "data-scientist"],
            "teams": ["ml-team"],
            "projects": ["llama-3"],
            "namespaces": ["research"]
        }
        ```
        """,
    )

    message: Optional[str] = Field(
        default=None, description="Optional message providing additional context about the authentication result."
    )


class AuthRequestContext(BaseModel):
    path: str = Field(description="The path of the request being authenticated")

    headers: Dict[str, str] = Field(description="HTTP headers from the original request (excluding Authorization)")

    params: Dict[str, List[str]] = Field(
        description="Query parameters from the original request, parsed as dictionary of lists"
    )


class AuthRequest(BaseModel):
    api_key: str = Field(description="The API key extracted from the Authorization header")

    request: AuthRequestContext = Field(description="Context information about the request being authenticated")


class AuthProviderType(str, Enum):
    """Supported authentication provider types."""

    KUBERNETES = "kubernetes"
    CUSTOM = "custom"


class AuthProviderConfig(BaseModel):
    """Base configuration for authentication providers."""

    provider_type: AuthProviderType = Field(..., description="Type of authentication provider")
    config: Dict[str, str] = Field(..., description="Provider-specific configuration")


class AuthProvider(ABC):
    """Abstract base class for authentication providers."""

    @abstractmethod
    async def validate_token(self, token: str, scope: Optional[Dict] = None) -> Optional[AccessAttributes]:
        """Validate a token and return access attributes."""
        pass

    @abstractmethod
    async def close(self):
        """Clean up any resources."""
        pass


class KubernetesAuthProvider(AuthProvider):
    """Kubernetes authentication provider that validates tokens against the Kubernetes API server."""

    def __init__(self, config: Dict[str, str]):
        self.api_server_url = config["api_server_url"]
        self.ca_cert_path = config.get("ca_cert_path")
        self._client = None

    async def _get_client(self):
        """Get or create a Kubernetes client."""
        if self._client is None:
            # kubernetes-client has no async support, see:
            # https://github.com/kubernetes-client/python/issues/323
            from kubernetes import client
            from kubernetes.client import ApiClient

            # Configure the client
            configuration = client.Configuration()
            configuration.host = self.api_server_url
            if self.ca_cert_path:
                configuration.ssl_ca_cert = self.ca_cert_path
            configuration.verify_ssl = bool(self.ca_cert_path)

            # Create API client
            self._client = ApiClient(configuration)
        return self._client

    async def validate_token(self, token: str, scope: Optional[Dict] = None) -> Optional[AccessAttributes]:
        """Validate a Kubernetes token and return access attributes."""
        try:
            client = await self._get_client()

            # Set the token in the client
            client.set_default_header("Authorization", f"Bearer {token}")

            # Make a request to validate the token
            # We use the /api endpoint which requires authentication
            from kubernetes.client import CoreV1Api

            api = CoreV1Api(client)
            api.get_api_resources(_request_timeout=3.0)  # Set timeout for this specific request

            # If we get here, the token is valid
            # Extract user info from the token claims
            import base64

            # Decode the token (without verification since we've already validated it)
            token_parts = token.split(".")
            payload = json.loads(base64.b64decode(token_parts[1] + "=" * (-len(token_parts[1]) % 4)))

            # Extract user information from the token
            username = payload.get("sub", "")
            groups = payload.get("groups", [])

            return AccessAttributes(
                roles=[username],  # Use username as a role
                teams=groups,  # Use Kubernetes groups as teams
            )

        except Exception as e:
            logger.exception("Failed to validate Kubernetes token")
            raise ValueError("Invalid or expired token") from e
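The padding expression above is easy to miss: base64url segments in a JWT drop their trailing `=` padding, so `"=" * (-len(s) % 4)` restores it before decoding. A standalone variant of that decode step (using `urlsafe_b64decode`, since JWT segments are base64url-encoded; the diff's code uses plain `b64decode`):

```python
import base64
import json


def decode_jwt_payload(token: str) -> dict:
    """Decode the (already-validated) JWT payload without signature verification."""
    payload_b64 = token.split(".")[1]
    payload_b64 += "=" * (-len(payload_b64) % 4)  # restore stripped base64 padding
    return json.loads(base64.urlsafe_b64decode(payload_b64))
```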

    async def close(self):
        """Close the HTTP client."""
        if self._client:
            self._client.close()
            self._client = None


class CustomAuthProvider(AuthProvider):
    """Custom authentication provider that uses an external endpoint."""

    def __init__(self, config: Dict[str, str]):
        self.endpoint = config["endpoint"]
        self._client = None

    async def validate_token(self, token: str, scope: Optional[Dict] = None) -> Optional[AccessAttributes]:
        """Validate a token using the custom authentication endpoint."""
        if not self.endpoint:
            raise ValueError("Authentication endpoint not configured")

        if scope is None:
            scope = {}

        headers = dict(scope.get("headers", []))
        path = scope.get("path", "")
        request_headers = {k.decode(): v.decode() for k, v in headers.items()}

        # Remove sensitive headers
        if "authorization" in request_headers:
            del request_headers["authorization"]

        query_string = scope.get("query_string", b"").decode()
        params = parse_qs(query_string)

        # Build the auth request model
        auth_request = AuthRequest(
            api_key=token,
            request=AuthRequestContext(
                path=path,
                headers=request_headers,
                params=params,
            ),
        )

        # Validate with authentication endpoint
        try:
            async with httpx.AsyncClient() as client:
                response = await client.post(
                    self.endpoint,
                    json=auth_request.model_dump(),
                    timeout=10.0,  # Add a reasonable timeout
                )
                if response.status_code != 200:
                    logger.warning(f"Authentication failed with status code: {response.status_code}")
                    raise ValueError(f"Authentication failed: {response.status_code}")

                # Parse and validate the auth response
                try:
                    response_data = response.json()
                    auth_response = AuthResponse(**response_data)

                    # Store attributes in request scope for access control
                    if auth_response.access_attributes:
                        return auth_response.access_attributes
                    else:
                        logger.warning("No access attributes, setting namespace to api_key by default")
                        user_attributes = {
                            "namespaces": [token],
                        }

                    scope["user_attributes"] = user_attributes
                    logger.debug(f"Authentication successful: {len(user_attributes)} attributes")
                    return auth_response.access_attributes
                except Exception as e:
                    logger.exception("Error parsing authentication response")
                    raise ValueError("Invalid authentication response format") from e

        except httpx.TimeoutException:
            logger.exception("Authentication request timed out")
            raise
        except ValueError:
            # Re-raise ValueError exceptions to preserve their message
            raise
        except Exception as e:
            logger.exception("Error during authentication")
            raise ValueError("Authentication service error") from e

    async def close(self):
        """Close the HTTP client."""
        if self._client:
            await self._client.aclose()
            self._client = None


def create_auth_provider(config: AuthProviderConfig) -> AuthProvider:
    """Factory function to create the appropriate auth provider."""
    provider_type = config.provider_type.lower()

    if provider_type == "kubernetes":
        return KubernetesAuthProvider(config.config)
    elif provider_type == "custom":
        return CustomAuthProvider(config.config)
    else:
        supported_providers = ", ".join([t.value for t in AuthProviderType])
        raise ValueError(f"Unsupported auth provider type: {provider_type}. Supported types are: {supported_providers}")
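Hypothetical wiring, mirroring how the middleware consumes this factory; the endpoint URL is a placeholder, and the names come from the file above:

```python
# Sketch: construct a custom provider and validate a token out-of-band.
config = AuthProviderConfig(
    provider_type=AuthProviderType.CUSTOM,
    config={"endpoint": "https://auth.example.com/validate"},  # placeholder URL
)
provider = create_auth_provider(config)
# In an async context:
#     attrs = await provider.validate_token("some-bearer-token")
# returns AccessAttributes on success, or raises ValueError on rejection.
```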
@@ -419,9 +419,9 @@ def main(args: Optional[argparse.Namespace] = None):
        app.add_middleware(ClientVersionMiddleware)

    # Add authentication middleware if configured
    if config.server.auth and config.server.auth.endpoint:
        logger.info(f"Enabling authentication with endpoint: {config.server.auth.endpoint}")
        app.add_middleware(AuthenticationMiddleware, auth_endpoint=config.server.auth.endpoint)
    if config.server.auth:
        logger.info(f"Enabling authentication with provider: {config.server.auth.provider_type.value}")
        app.add_middleware(AuthenticationMiddleware, auth_config=config.server.auth)

    try:
        impls = asyncio.run(construct_stack(config))

@@ -94,12 +94,16 @@ def tool_chat_page():
    st.subheader("Agent Configurations")
    st.subheader("Agent Type")
    agent_type = st.radio(
        "Select Agent Type",
        [AgentType.REGULAR, AgentType.REACT],
        format_func=lambda x: x.value,
        label="Select Agent Type",
        options=["Regular", "ReAct"],
        on_change=reset_agent,
    )

    if agent_type == "ReAct":
        agent_type = AgentType.REACT
    else:
        agent_type = AgentType.REGULAR

    max_tokens = st.slider(
        "Max Tokens",
        min_value=0,

@@ -792,6 +792,13 @@ def llama3_3_instruct_models() -> List[Model]:
@lru_cache
def safety_models() -> List[Model]:
    return [
        Model(
            core_model_id=CoreModelId.llama_guard_4_12b,
            description="Llama Guard v4 12b system safety model",
            huggingface_repo="meta-llama/Llama-Guard-4-12B",
            arch_args={},
            pth_file_count=1,
        ),
        Model(
            core_model_id=CoreModelId.llama_guard_3_11b_vision,
            description="Llama Guard v3 11b vision system safety model",

@@ -81,6 +81,7 @@ class CoreModelId(Enum):
    llama_guard_2_8b = "Llama-Guard-2-8B"
    llama_guard_3_11b_vision = "Llama-Guard-3-11B-Vision"
    llama_guard_3_1b = "Llama-Guard-3-1B"
    llama_guard_4_12b = "Llama-Guard-4-12B"


def is_multimodal(model_id) -> bool:

@@ -148,6 +149,7 @@ def model_family(model_id) -> ModelFamily:
        CoreModelId.llama_guard_2_8b,
        CoreModelId.llama_guard_3_11b_vision,
        CoreModelId.llama_guard_3_1b,
        CoreModelId.llama_guard_4_12b,
    ]:
        return ModelFamily.safety
    else:

@@ -225,5 +227,7 @@ class Model(BaseModel):
        CoreModelId.llama_guard_3_1b,
    ]:
        return 131072
    elif self.core_model_id == CoreModelId.llama_guard_4_12b:
        return 8192
    else:
        raise ValueError(f"Unknown max_seq_len for {self.core_model_id}")

@@ -23,6 +23,9 @@ from llama_stack.apis.agents import (
    Document,
    ListAgentSessionsResponse,
    ListAgentsResponse,
    OpenAIResponseInputMessage,
    OpenAIResponseInputTool,
    OpenAIResponseObject,
    Session,
    Turn,
)

@@ -40,6 +43,7 @@ from llama_stack.providers.utils.kvstore import InmemoryKVStoreImpl, kvstore_imp

from .agent_instance import ChatAgent
from .config import MetaReferenceAgentsImplConfig
from .openai_responses import OpenAIResponsesImpl

logger = logging.getLogger()
logger.setLevel(logging.INFO)

@@ -63,9 +67,16 @@ class MetaReferenceAgentsImpl(Agents):
        self.tool_groups_api = tool_groups_api

        self.in_memory_store = InmemoryKVStoreImpl()
        self.openai_responses_impl = None

    async def initialize(self) -> None:
        self.persistence_store = await kvstore_impl(self.config.persistence_store)
        self.openai_responses_impl = OpenAIResponsesImpl(
            self.persistence_store,
            inference_api=self.inference_api,
            tool_groups_api=self.tool_groups_api,
            tool_runtime_api=self.tool_runtime_api,
        )

        # check if "bwrap" is available
        if not shutil.which("bwrap"):

@@ -244,3 +255,23 @@ class MetaReferenceAgentsImpl(Agents):
        agent_id: str,
    ) -> ListAgentSessionsResponse:
        pass

    # OpenAI responses
    async def get_openai_response(
        self,
        id: str,
    ) -> OpenAIResponseObject:
        return await self.openai_responses_impl.get_openai_response(id)

    async def create_openai_response(
        self,
        input: Union[str, List[OpenAIResponseInputMessage]],
        model: str,
        previous_response_id: Optional[str] = None,
        store: Optional[bool] = True,
        stream: Optional[bool] = False,
        tools: Optional[List[OpenAIResponseInputTool]] = None,
    ) -> OpenAIResponseObject:
        return await self.openai_responses_impl.create_openai_response(
            input, model, previous_response_id, store, stream, tools
        )

@@ -0,0 +1,319 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import json
import uuid
from typing import AsyncIterator, List, Optional, Union, cast

from openai.types.chat import ChatCompletionToolParam

from llama_stack.apis.agents.openai_responses import (
    OpenAIResponseInputMessage,
    OpenAIResponseInputMessageContentImage,
    OpenAIResponseInputMessageContentText,
    OpenAIResponseInputTool,
    OpenAIResponseObject,
    OpenAIResponseObjectStream,
    OpenAIResponseObjectStreamResponseCompleted,
    OpenAIResponseObjectStreamResponseCreated,
    OpenAIResponseOutput,
    OpenAIResponseOutputMessage,
    OpenAIResponseOutputMessageContentOutputText,
    OpenAIResponseOutputMessageWebSearchToolCall,
)
from llama_stack.apis.inference.inference import (
    Inference,
    OpenAIAssistantMessageParam,
    OpenAIChatCompletion,
    OpenAIChatCompletionContentPartImageParam,
    OpenAIChatCompletionContentPartParam,
    OpenAIChatCompletionContentPartTextParam,
    OpenAIChatCompletionToolCallFunction,
    OpenAIChoice,
    OpenAIImageURL,
    OpenAIMessageParam,
    OpenAIToolMessageParam,
    OpenAIUserMessageParam,
)
from llama_stack.apis.tools.tools import ToolGroups, ToolInvocationResult, ToolRuntime
from llama_stack.log import get_logger
from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool
from llama_stack.providers.utils.kvstore import KVStore

logger = get_logger(name=__name__, category="openai_responses")

OPENAI_RESPONSES_PREFIX = "openai_responses:"


async def _previous_response_to_messages(previous_response: OpenAIResponseObject) -> List[OpenAIMessageParam]:
    messages: List[OpenAIMessageParam] = []
    for output_message in previous_response.output:
        if isinstance(output_message, OpenAIResponseOutputMessage):
            messages.append(OpenAIAssistantMessageParam(content=output_message.content[0].text))
    return messages


async def _openai_choices_to_output_messages(choices: List[OpenAIChoice]) -> List[OpenAIResponseOutputMessage]:
    output_messages = []
    for choice in choices:
        output_content = ""
        if isinstance(choice.message.content, str):
            output_content = choice.message.content
        elif isinstance(choice.message.content, OpenAIChatCompletionContentPartTextParam):
            output_content = choice.message.content.text
        # TODO: handle image content
        output_messages.append(
            OpenAIResponseOutputMessage(
                id=f"msg_{uuid.uuid4()}",
                content=[OpenAIResponseOutputMessageContentOutputText(text=output_content)],
                status="completed",
            )
        )
    return output_messages


class OpenAIResponsesImpl:
    def __init__(
        self,
        persistence_store: KVStore,
        inference_api: Inference,
        tool_groups_api: ToolGroups,
        tool_runtime_api: ToolRuntime,
    ):
        self.persistence_store = persistence_store
        self.inference_api = inference_api
        self.tool_groups_api = tool_groups_api
        self.tool_runtime_api = tool_runtime_api

    async def get_openai_response(
        self,
        id: str,
    ) -> OpenAIResponseObject:
        key = f"{OPENAI_RESPONSES_PREFIX}{id}"
        response_json = await self.persistence_store.get(key=key)
        if response_json is None:
            raise ValueError(f"OpenAI response with id '{id}' not found")
        return OpenAIResponseObject.model_validate_json(response_json)

    async def create_openai_response(
        self,
        input: Union[str, List[OpenAIResponseInputMessage]],
        model: str,
        previous_response_id: Optional[str] = None,
        store: Optional[bool] = True,
        stream: Optional[bool] = False,
        tools: Optional[List[OpenAIResponseInputTool]] = None,
    ):
        stream = False if stream is None else stream

        messages: List[OpenAIMessageParam] = []
        if previous_response_id:
            previous_response = await self.get_openai_response(previous_response_id)
            messages.extend(await _previous_response_to_messages(previous_response))
        # TODO: refactor this user_content parsing out into a separate method
        user_content: Union[str, List[OpenAIChatCompletionContentPartParam]] = ""
        if isinstance(input, list):
            user_content = []
            for user_input in input:
                if isinstance(user_input.content, list):
                    for user_input_content in user_input.content:
                        if isinstance(user_input_content, OpenAIResponseInputMessageContentText):
                            user_content.append(OpenAIChatCompletionContentPartTextParam(text=user_input_content.text))
                        elif isinstance(user_input_content, OpenAIResponseInputMessageContentImage):
                            if user_input_content.image_url:
                                image_url = OpenAIImageURL(
                                    url=user_input_content.image_url, detail=user_input_content.detail
                                )
                                user_content.append(OpenAIChatCompletionContentPartImageParam(image_url=image_url))
                else:
                    user_content.append(OpenAIChatCompletionContentPartTextParam(text=user_input.content))
        else:
            user_content = input
        messages.append(OpenAIUserMessageParam(content=user_content))

        chat_tools = await self._convert_response_tools_to_chat_tools(tools) if tools else None
        chat_response = await self.inference_api.openai_chat_completion(
            model=model,
            messages=messages,
            tools=chat_tools,
            stream=stream,
        )

        if stream:
            # TODO: refactor this into a separate method that handles streaming
            chat_response_id = ""
            chat_response_content = []
            # TODO: these chunk_ fields are hacky and only take the last chunk into account
            chunk_created = 0
            chunk_model = ""
            chunk_finish_reason = ""
            async for chunk in chat_response:
                chat_response_id = chunk.id
                chunk_created = chunk.created
                chunk_model = chunk.model
                for chunk_choice in chunk.choices:
                    # TODO: this only works for text content
                    chat_response_content.append(chunk_choice.delta.content or "")
                    if chunk_choice.finish_reason:
                        chunk_finish_reason = chunk_choice.finish_reason
            assistant_message = OpenAIAssistantMessageParam(content="".join(chat_response_content))
            chat_response = OpenAIChatCompletion(
                id=chat_response_id,
                choices=[
                    OpenAIChoice(
                        message=assistant_message,
                        finish_reason=chunk_finish_reason,
                        index=0,
                    )
                ],
                created=chunk_created,
                model=chunk_model,
            )
        else:
            # dump and reload to map to our pydantic types
            chat_response = OpenAIChatCompletion(**chat_response.model_dump())

        output_messages: List[OpenAIResponseOutput] = []
        if chat_response.choices[0].message.tool_calls:
            output_messages.extend(
                await self._execute_tool_and_return_final_output(model, stream, chat_response, messages)
            )
        else:
            output_messages.extend(await _openai_choices_to_output_messages(chat_response.choices))
        response = OpenAIResponseObject(
            created_at=chat_response.created,
            id=f"resp-{uuid.uuid4()}",
            model=model,
            object="response",
            status="completed",
            output=output_messages,
        )

        if store:
            # Store in kvstore
            key = f"{OPENAI_RESPONSES_PREFIX}{response.id}"
            await self.persistence_store.set(
                key=key,
                value=response.model_dump_json(),
            )

        if stream:

            async def async_response() -> AsyncIterator[OpenAIResponseObjectStream]:
                # TODO: response created should actually get emitted much earlier in the process
                yield OpenAIResponseObjectStreamResponseCreated(response=response)
                yield OpenAIResponseObjectStreamResponseCompleted(response=response)

            return async_response()

        return response

    async def _convert_response_tools_to_chat_tools(
        self, tools: List[OpenAIResponseInputTool]
    ) -> List[ChatCompletionToolParam]:
        chat_tools: List[ChatCompletionToolParam] = []
        for input_tool in tools:
            # TODO: Handle other tool types
            if input_tool.type == "web_search":
                tool_name = "web_search"
                tool = await self.tool_groups_api.get_tool(tool_name)
                tool_def = ToolDefinition(
                    tool_name=tool_name,
                    description=tool.description,
                    parameters={
                        param.name: ToolParamDefinition(
                            param_type=param.parameter_type,
                            description=param.description,
                            required=param.required,
                            default=param.default,
                        )
                        for param in tool.parameters
                    },
                )
                chat_tool = convert_tooldef_to_openai_tool(tool_def)
                chat_tools.append(chat_tool)
            else:
                raise ValueError(f"Llama Stack OpenAI Responses does not yet support tool type: {input_tool.type}")
        return chat_tools

    async def _execute_tool_and_return_final_output(
        self, model_id: str, stream: bool, chat_response: OpenAIChatCompletion, messages: List[OpenAIMessageParam]
    ) -> List[OpenAIResponseOutput]:
        output_messages: List[OpenAIResponseOutput] = []
        choice = chat_response.choices[0]

        # If the choice is not an assistant message, we don't need to execute any tools
        if not isinstance(choice.message, OpenAIAssistantMessageParam):
            return output_messages

        # If the assistant message doesn't have any tool calls, we don't need to execute any tools
        if not choice.message.tool_calls:
            return output_messages

        # Add the assistant message with tool_calls response to the messages list
        messages.append(choice.message)

        for tool_call in choice.message.tool_calls:
            tool_call_id = tool_call.id
            function = tool_call.function

            # If for some reason the tool call doesn't have a function or id, we can't execute it
            if not function or not tool_call_id:
                continue

            # TODO: telemetry spans for tool calls
            result = await self._execute_tool_call(function)

            # Handle tool call failure
            if not result:
                output_messages.append(
                    OpenAIResponseOutputMessageWebSearchToolCall(
                        id=tool_call_id,
                        status="failed",
                    )
                )
                continue

            output_messages.append(
                OpenAIResponseOutputMessageWebSearchToolCall(
                    id=tool_call_id,
                    status="completed",
                ),
            )

            result_content = ""
            # TODO: handle other result content types and lists
            if isinstance(result.content, str):
                result_content = result.content
            messages.append(OpenAIToolMessageParam(content=result_content, tool_call_id=tool_call_id))

        tool_results_chat_response = await self.inference_api.openai_chat_completion(
            model=model_id,
            messages=messages,
            stream=stream,
        )
        # type cast to appease mypy
        tool_results_chat_response = cast(OpenAIChatCompletion, tool_results_chat_response)
        tool_final_outputs = await _openai_choices_to_output_messages(tool_results_chat_response.choices)
        # TODO: Wire in annotations with URLs, titles, etc to these output messages
        output_messages.extend(tool_final_outputs)
        return output_messages

    async def _execute_tool_call(
        self,
        function: OpenAIChatCompletionToolCallFunction,
    ) -> Optional[ToolInvocationResult]:
        if not function.name:
            return None
        function_args = json.loads(function.arguments) if function.arguments else {}
        logger.info(f"executing tool call: {function.name} with args: {function_args}")
        result = await self.tool_runtime_api.invoke_tool(
            tool_name=function.name,
            kwargs=function_args,
        )
        logger.debug(f"tool call {function.name} completed with result: {result}")
        return result
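A hedged sketch of how the impl above is meant to be driven, including chaining turns through `previous_response_id`; the wiring of `impl` and the model id are assumptions:

```python
async def demo(impl) -> None:  # `impl` is an OpenAIResponsesImpl wired up by the agents provider
    first = await impl.create_openai_response(
        input="What is the capital of France?",
        model="meta-llama/Llama-3.3-70B-Instruct",  # any registered model id
    )
    # Because store=True by default, the response is persisted under
    # "openai_responses:<id>" and can seed a follow-up turn:
    follow_up = await impl.create_openai_response(
        input="And its population?",
        model="meta-llama/Llama-3.3-70B-Instruct",
        previous_response_id=first.id,
    )
    print(follow_up.output[0].content[0].text)
```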

@@ -17,10 +17,8 @@ from llama_stack.apis.common.type_system import (
    DialogType,
    StringType,
)
from llama_stack.apis.datasets import Datasets
from llama_stack.providers.utils.common.data_schema_validator import (
    ColumnName,
    validate_dataset_schema,
)

EXPECTED_DATASET_SCHEMA: dict[str, list[dict[str, Any]]] = {

@@ -36,21 +34,3 @@ EXPECTED_DATASET_SCHEMA: dict[str, list[dict[str, Any]]] = {
        }
    ],
}


async def validate_input_dataset_schema(
    datasets_api: Datasets,
    dataset_id: str,
    dataset_type: str,
) -> None:
    dataset_def = await datasets_api.get_dataset(dataset_id=dataset_id)
    if not dataset_def:
        raise ValueError(f"Dataset {dataset_id} does not exist.")

    if not dataset_def.dataset_schema or len(dataset_def.dataset_schema) == 0:
        raise ValueError(f"Dataset {dataset_id} does not have a schema defined.")

    if dataset_type not in EXPECTED_DATASET_SCHEMA:
        raise ValueError(f"Dataset type {dataset_type} is not supported.")

    validate_dataset_schema(dataset_def.dataset_schema, EXPECTED_DATASET_SCHEMA[dataset_type])

@@ -48,9 +48,6 @@ from llama_stack.apis.post_training import (
from llama_stack.distribution.utils.config_dirs import DEFAULT_CHECKPOINT_DIR
from llama_stack.distribution.utils.model_utils import model_local_dir
from llama_stack.models.llama.sku_list import resolve_model
from llama_stack.providers.inline.post_training.common.validator import (
    validate_input_dataset_schema,
)
from llama_stack.providers.inline.post_training.torchtune.common import utils
from llama_stack.providers.inline.post_training.torchtune.common.checkpointer import (
    TorchtuneCheckpointer,

@@ -348,11 +345,9 @@ class LoraFinetuningSingleDevice:
        all_rows = await fetch_rows(dataset_id)
        rows = all_rows.data

        await validate_input_dataset_schema(
            datasets_api=self.datasets_api,
            dataset_id=dataset_id,
            dataset_type=self._data_format.value,
        )
        # TODO (xiyan): validate dataset schema
        # dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id)

        data_transform = await utils.get_data_transform(self._data_format)
        ds = SFTDataset(
            rows,

@@ -30,7 +30,7 @@ class TelemetryConfig(BaseModel):
    )
    service_name: str = Field(
        # service name is always the same, use zero-width space to avoid clutter
        default="\u200B",
        default="",
        description="The service name to use for telemetry",
    )
    sinks: List[TelemetrySink] = Field(

@@ -52,7 +52,7 @@ class TelemetryConfig(BaseModel):
    @classmethod
    def sample_run_config(cls, __distro_dir__: str, db_name: str = "trace_store.db") -> Dict[str, Any]:
        return {
            "service_name": "${env.OTEL_SERVICE_NAME:\u200B}",
            "service_name": "${env.OTEL_SERVICE_NAME:}",
            "sinks": "${env.TELEMETRY_SINKS:console,sqlite}",
            "sqlite_db_path": "${env.SQLITE_DB_PATH:" + __distro_dir__ + "/" + db_name + "}",
            "sqlite_db_path": "${env.SQLITE_STORE_DIR:" + __distro_dir__ + "}/" + db_name,
        }

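The point of moving the default inside `${env.SQLITE_STORE_DIR:...}` is that only the directory should be substitutable, with the database file name appended afterwards. A sketch of the assumed `${env.VAR:default}` semantics (the real resolver lives in llama-stack's config loader, not here):

```python
import os
import re


def substitute(template: str) -> str:
    """Replace ${env.VAR:default} with the env value when set, else the default after the colon."""
    return re.sub(
        r"\$\{env\.([A-Za-z_][A-Za-z0-9_]*):([^}]*)\}",
        lambda m: os.environ.get(m.group(1), m.group(2)),
        template,
    )


print(substitute("${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/trace_store.db"))
# -> ~/.llama/distributions/ollama/trace_store.db when SQLITE_STORE_DIR is unset
```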
@@ -227,6 +227,16 @@ def available_providers() -> List[ProviderSpec]:
            provider_data_validator="llama_stack.providers.remote.inference.fireworks_openai_compat.config.FireworksProviderDataValidator",
        ),
    ),
    remote_provider_spec(
        api=Api.inference,
        adapter=AdapterSpec(
            adapter_type="llama-openai-compat",
            pip_packages=["litellm"],
            module="llama_stack.providers.remote.inference.llama_openai_compat",
            config_class="llama_stack.providers.remote.inference.llama_openai_compat.config.LlamaCompatConfig",
            provider_data_validator="llama_stack.providers.remote.inference.llama_openai_compat.config.LlamaProviderDataValidator",
        ),
    ),
    remote_provider_spec(
        api=Api.inference,
        adapter=AdapterSpec(

@@ -0,0 +1,17 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from llama_stack.apis.inference import Inference

from .config import LlamaCompatConfig


async def get_adapter_impl(config: LlamaCompatConfig, _deps) -> Inference:
    # import dynamically so the import is used only when it is needed
    from .llama import LlamaCompatInferenceAdapter

    adapter = LlamaCompatInferenceAdapter(config)
    return adapter
@@ -0,0 +1,38 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any, Dict, Optional

from pydantic import BaseModel, Field

from llama_stack.schema_utils import json_schema_type


class LlamaProviderDataValidator(BaseModel):
    llama_api_key: Optional[str] = Field(
        default=None,
        description="API key for api.llama models",
    )


@json_schema_type
class LlamaCompatConfig(BaseModel):
    api_key: Optional[str] = Field(
        default=None,
        description="The Llama API key",
    )

    openai_compat_api_base: str = Field(
        default="https://api.llama.com/compat/v1/",
        description="The URL for the Llama API server",
    )

    @classmethod
    def sample_run_config(cls, api_key: str = "${env.LLAMA_API_KEY}", **kwargs) -> Dict[str, Any]:
        return {
            "openai_compat_api_base": "https://api.llama.com/compat/v1/",
            "api_key": api_key,
        }
@@ -0,0 +1,34 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from llama_stack.providers.remote.inference.llama_openai_compat.config import (
    LlamaCompatConfig,
)
from llama_stack.providers.utils.inference.litellm_openai_mixin import (
    LiteLLMOpenAIMixin,
)

from .models import MODEL_ENTRIES


class LlamaCompatInferenceAdapter(LiteLLMOpenAIMixin):
    _config: LlamaCompatConfig

    def __init__(self, config: LlamaCompatConfig):
        LiteLLMOpenAIMixin.__init__(
            self,
            model_entries=MODEL_ENTRIES,
            api_key_from_config=config.api_key,
            provider_data_api_key_field="llama_api_key",
            openai_compat_api_base=config.openai_compat_api_base,
        )
        self.config = config

    async def initialize(self):
        await super().initialize()

    async def shutdown(self):
        await super().shutdown()
@@ -0,0 +1,25 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from llama_stack.models.llama.sku_types import CoreModelId
from llama_stack.providers.utils.inference.model_registry import (
    build_hf_repo_model_entry,
)

MODEL_ENTRIES = [
    build_hf_repo_model_entry(
        "Llama-3.3-70B-Instruct",
        CoreModelId.llama3_3_70b_instruct.value,
    ),
    build_hf_repo_model_entry(
        "Llama-4-Scout-17B-16E-Instruct-FP8",
        CoreModelId.llama4_scout_17b_16e_instruct.value,
    ),
    build_hf_repo_model_entry(
        "Llama-4-Maverick-17B-128E-Instruct-FP8",
        CoreModelId.llama4_maverick_17b_128e_instruct.value,
    ),
]
@@ -433,6 +433,12 @@ class OllamaInferenceAdapter(
        user: Optional[str] = None,
    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
        model_obj = await self._get_model(model)

        # ollama still makes tool calls even when tool_choice is "none"
        # so we need to remove the tools in that case
        if tool_choice == "none" and tools is not None:
            tools = None

        params = {
            k: v
            for k, v in {

@@ -90,6 +90,9 @@ class LiteLLMOpenAIMixin(
            raise ValueError(f"Unsupported model: {model.provider_resource_id}")
        return model

    def get_litellm_model_name(self, model_id: str) -> str:
        return "openai/" + model_id if self.is_openai_compat else model_id

    async def completion(
        self,
        model_id: str,

@@ -130,8 +133,7 @@ class LiteLLMOpenAIMixin(
        )

        params = await self._get_params(request)
        if self.is_openai_compat:
            params["model"] = "openai/" + params["model"]
        params["model"] = self.get_litellm_model_name(params["model"])

        logger.debug(f"params to litellm (openai compat): {params}")
        # unfortunately, we need to use synchronous litellm.completion here because litellm

@@ -220,21 +222,23 @@ class LiteLLMOpenAIMixin(
            else request.tool_config.tool_choice
        )

        return {
            "model": request.model,
            "api_key": self.get_api_key(),
            "api_base": self.api_base,
            **input_dict,
            "stream": request.stream,
            **get_sampling_options(request.sampling_params),
        }

    def get_api_key(self) -> str:
        provider_data = self.get_request_provider_data()
        key_field = self.provider_data_api_key_field
        if provider_data and getattr(provider_data, key_field, None):
            api_key = getattr(provider_data, key_field)
        else:
            api_key = self.api_key_from_config

        return {
            "model": request.model,
            "api_key": api_key,
            "api_base": self.api_base,
            **input_dict,
            "stream": request.stream,
            **get_sampling_options(request.sampling_params),
        }
        return api_key
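The refactor isolates a simple precedence rule: a per-request key supplied via provider data wins, otherwise the key from static config is used. In isolation (illustrative, not the mixin itself):

```python
from typing import Any, Optional


def resolve_api_key(provider_data: Any, key_field: str, key_from_config: Optional[str]) -> Optional[str]:
    """Per-request provider-data key takes precedence over the configured key."""
    per_request = getattr(provider_data, key_field, None) if provider_data else None
    return per_request or key_from_config
```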

    async def embeddings(
        self,

@@ -247,7 +251,7 @@ class LiteLLMOpenAIMixin(
        model = await self.model_store.get_model(model_id)

        response = litellm.embedding(
            model=model.provider_resource_id,
            model=self.get_litellm_model_name(model.provider_resource_id),
            input=[interleaved_content_as_str(content) for content in contents],
        )

@@ -278,7 +282,7 @@ class LiteLLMOpenAIMixin(
    ) -> OpenAICompletion:
        model_obj = await self.model_store.get_model(model)
        params = await prepare_openai_completion_params(
            model=model_obj.provider_resource_id,
            model=self.get_litellm_model_name(model_obj.provider_resource_id),
            prompt=prompt,
            best_of=best_of,
            echo=echo,

@@ -297,6 +301,8 @@ class LiteLLMOpenAIMixin(
            user=user,
            guided_choice=guided_choice,
            prompt_logprobs=prompt_logprobs,
            api_key=self.get_api_key(),
            api_base=self.api_base,
        )
        return await litellm.atext_completion(**params)

@@ -328,7 +334,7 @@ class LiteLLMOpenAIMixin(
    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
        model_obj = await self.model_store.get_model(model)
        params = await prepare_openai_completion_params(
            model=model_obj.provider_resource_id,
            model=self.get_litellm_model_name(model_obj.provider_resource_id),
            messages=messages,
            frequency_penalty=frequency_penalty,
            function_call=function_call,

@@ -351,6 +357,8 @@ class LiteLLMOpenAIMixin(
            top_logprobs=top_logprobs,
            top_p=top_p,
            user=user,
            api_key=self.get_api_key(),
            api_base=self.api_base,
        )
        return await litellm.acompletion(**params)

@@ -638,10 +638,13 @@ async def convert_message_to_openai_dict_new(
            )
            for tool in message.tool_calls
        ]
        params = {}
        if tool_calls:
            params["tool_calls"] = tool_calls
        out = OpenAIChatCompletionAssistantMessage(
            role="assistant",
            content=await _convert_message_content(message.content),
            tool_calls=tool_calls or None,
            **params,
        )
    elif isinstance(message, ToolResponseMessage):
        out = OpenAIChatCompletionToolMessage(
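The point of building `params` conditionally is that `tool_calls` is only passed when non-empty, instead of always passing an explicit `None`; presumably some OpenAI-compatible validators treat an omitted key and an explicit null differently. The pattern in isolation:

```python
def build_kwargs(tool_calls: list) -> dict:
    params = {}
    if tool_calls:
        params["tool_calls"] = tool_calls  # key omitted entirely when there are none
    return params


assert build_kwargs([]) == {}
assert build_kwargs([{"id": "call_1"}]) == {"tool_calls": [{"id": "call_1"}]}
```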

@@ -478,6 +478,8 @@ class JsonSchemaGenerator:
            }
            return ret
        elif origin_type is Literal:
            if len(typing.get_args(typ)) != 1:
                raise ValueError(f"Literal type {typ} has {len(typing.get_args(typ))} arguments")
            (literal_value,) = typing.get_args(typ)  # unpack value of literal type
            schema = self.type_to_schema(type(literal_value))
            schema["const"] = literal_value
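What the new branch emits, shown standalone: a single-valued `Literal` becomes the schema of its base type plus a `const` constraint.

```python
import typing
from typing import Literal

typ = Literal["json"]
(literal_value,) = typing.get_args(typ)  # -> "json"
schema = {"type": "string", "const": literal_value}  # mirrors the generator branch above
assert schema == {"type": "string", "const": "json"}
```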

@@ -39,9 +39,9 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
      service_name: ${env.OTEL_SERVICE_NAME:}
      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/bedrock/trace_store.db}
      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/trace_store.db
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference

@@ -79,9 +79,9 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
      service_name: ${env.OTEL_SERVICE_NAME:}
      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/cerebras/trace_store.db}
      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/cerebras}/trace_store.db
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search

@@ -42,9 +42,9 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
      service_name: ${env.OTEL_SERVICE_NAME:}
      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/ci-tests/trace_store.db}
      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ci-tests}/trace_store.db
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference

@@ -45,9 +45,9 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
      service_name: ${env.OTEL_SERVICE_NAME:}
      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/dell/trace_store.db}
      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/trace_store.db
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference

@@ -41,9 +41,9 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
      service_name: ${env.OTEL_SERVICE_NAME:}
      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/dell/trace_store.db}
      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/trace_store.db
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference

@@ -344,6 +344,45 @@
      "sentence-transformers --no-deps",
      "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
    ],
    "llama_api": [
      "aiosqlite",
      "autoevals",
      "blobfile",
      "chardet",
      "chromadb-client",
      "datasets",
      "emoji",
      "fastapi",
      "fire",
      "httpx",
      "langdetect",
      "litellm",
      "matplotlib",
      "mcp",
      "nltk",
      "numpy",
      "openai",
      "opentelemetry-exporter-otlp-proto-http",
      "opentelemetry-sdk",
      "pandas",
      "pillow",
      "psycopg2-binary",
      "pymongo",
      "pypdf",
      "pythainlp",
      "redis",
      "requests",
      "scikit-learn",
      "scipy",
      "sentencepiece",
      "sqlite-vec",
      "tqdm",
      "transformers",
      "tree_sitter",
      "uvicorn",
      "sentence-transformers --no-deps",
      "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
    ],
    "meta-reference-gpu": [
      "accelerate",
      "aiosqlite",

@@ -71,9 +71,9 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
      service_name: ${env.OTEL_SERVICE_NAME:}
      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/dev/trace_store.db}
      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/trace_store.db
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference

@@ -50,9 +50,9 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
      service_name: ${env.OTEL_SERVICE_NAME:}
      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/fireworks/trace_store.db}
      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/trace_store.db
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference

@@ -45,9 +45,9 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
      service_name: ${env.OTEL_SERVICE_NAME:}
      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/fireworks/trace_store.db}
      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/trace_store.db
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference

@@ -45,9 +45,9 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
      service_name: ${env.OTEL_SERVICE_NAME:}
      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/groq/trace_store.db}
      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/groq}/trace_store.db
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference

@@ -50,9 +50,9 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
      service_name: ${env.OTEL_SERVICE_NAME:}
      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/hf-endpoint/trace_store.db}
      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/trace_store.db
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference

@@ -45,9 +45,9 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
      service_name: ${env.OTEL_SERVICE_NAME:}
      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/hf-endpoint/trace_store.db}
      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/trace_store.db
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference

@@ -50,9 +50,9 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
      service_name: ${env.OTEL_SERVICE_NAME:}
      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/hf-serverless/trace_store.db}
      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/trace_store.db
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference

@@ -45,9 +45,9 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
      service_name: ${env.OTEL_SERVICE_NAME:}
      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/hf-serverless/trace_store.db}
      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/trace_store.db
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference

7
llama_stack/templates/llama_api/__init__.py
Normal file

@@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .llama_api import get_distribution_template  # noqa: F401

33
llama_stack/templates/llama_api/build.yaml
Normal file

@@ -0,0 +1,33 @@
version: '2'
distribution_spec:
  description: Distribution for running e2e tests in CI
  providers:
    inference:
    - remote::llama-openai-compat
    - inline::sentence-transformers
    vector_io:
    - inline::sqlite-vec
    - remote::chromadb
    - remote::pgvector
    safety:
    - inline::llama-guard
    agents:
    - inline::meta-reference
    telemetry:
    - inline::meta-reference
    eval:
    - inline::meta-reference
    datasetio:
    - remote::huggingface
    - inline::localfs
    scoring:
    - inline::basic
    - inline::llm-as-judge
    - inline::braintrust
    tool_runtime:
    - remote::brave-search
    - remote::tavily-search
    - inline::code-interpreter
    - inline::rag-runtime
    - remote::model-context-protocol
image_type: conda

159
llama_stack/templates/llama_api/llama_api.py
Normal file
159
llama_stack/templates/llama_api/llama_api.py
Normal file
|
@ -0,0 +1,159 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import List, Tuple

from llama_stack.apis.models.models import ModelType
from llama_stack.distribution.datatypes import (
    ModelInput,
    Provider,
    ShieldInput,
    ToolGroupInput,
)
from llama_stack.providers.inline.inference.sentence_transformers import (
    SentenceTransformersInferenceConfig,
)
from llama_stack.providers.inline.vector_io.sqlite_vec.config import (
    SQLiteVectorIOConfig,
)
from llama_stack.providers.remote.inference.llama_openai_compat.config import (
    LlamaCompatConfig,
)
from llama_stack.providers.remote.inference.llama_openai_compat.models import (
    MODEL_ENTRIES as LLLAMA_MODEL_ENTRIES,
)
from llama_stack.providers.remote.vector_io.chroma.config import ChromaVectorIOConfig
from llama_stack.providers.remote.vector_io.pgvector.config import (
    PGVectorVectorIOConfig,
)
from llama_stack.templates.template import (
    DistributionTemplate,
    RunConfigSettings,
    get_model_registry,
)


def get_inference_providers() -> Tuple[List[Provider], List[ModelInput]]:
    # in this template, we allow each API key to be optional
    providers = [
        (
            "llama-openai-compat",
            LLLAMA_MODEL_ENTRIES,
            LlamaCompatConfig.sample_run_config(api_key="${env.LLAMA_API_KEY:}"),
        ),
    ]
    inference_providers = []
    available_models = {}
    for provider_id, model_entries, config in providers:
        inference_providers.append(
            Provider(
                provider_id=provider_id,
                provider_type=f"remote::{provider_id}",
                config=config,
            )
        )
        available_models[provider_id] = model_entries
    return inference_providers, available_models


def get_distribution_template() -> DistributionTemplate:
    inference_providers, available_models = get_inference_providers()
    providers = {
        "inference": ([p.provider_type for p in inference_providers] + ["inline::sentence-transformers"]),
        "vector_io": ["inline::sqlite-vec", "remote::chromadb", "remote::pgvector"],
        "safety": ["inline::llama-guard"],
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
        "eval": ["inline::meta-reference"],
        "datasetio": ["remote::huggingface", "inline::localfs"],
        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
        "tool_runtime": [
            "remote::brave-search",
            "remote::tavily-search",
            "inline::code-interpreter",
            "inline::rag-runtime",
            "remote::model-context-protocol",
        ],
    }
    name = "llama_api"

    vector_io_providers = [
        Provider(
            provider_id="sqlite-vec",
            provider_type="inline::sqlite-vec",
            config=SQLiteVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
        ),
        Provider(
            provider_id="${env.ENABLE_CHROMADB+chromadb}",
            provider_type="remote::chromadb",
            config=ChromaVectorIOConfig.sample_run_config(url="${env.CHROMADB_URL:}"),
        ),
        Provider(
            provider_id="${env.ENABLE_PGVECTOR+pgvector}",
            provider_type="remote::pgvector",
            config=PGVectorVectorIOConfig.sample_run_config(
                db="${env.PGVECTOR_DB:}",
                user="${env.PGVECTOR_USER:}",
                password="${env.PGVECTOR_PASSWORD:}",
            ),
        ),
    ]
    embedding_provider = Provider(
        provider_id="sentence-transformers",
        provider_type="inline::sentence-transformers",
        config=SentenceTransformersInferenceConfig.sample_run_config(),
    )

    default_tool_groups = [
        ToolGroupInput(
            toolgroup_id="builtin::websearch",
            provider_id="tavily-search",
        ),
        ToolGroupInput(
            toolgroup_id="builtin::rag",
            provider_id="rag-runtime",
        ),
        ToolGroupInput(
            toolgroup_id="builtin::code_interpreter",
            provider_id="code-interpreter",
        ),
    ]
    embedding_model = ModelInput(
        model_id="all-MiniLM-L6-v2",
        provider_id=embedding_provider.provider_id,
        model_type=ModelType.embedding,
        metadata={
            "embedding_dimension": 384,
        },
    )

    default_models = get_model_registry(available_models)
    return DistributionTemplate(
        name=name,
        distro_type="self_hosted",
        description="Distribution for running e2e tests in CI",
        container_image=None,
        template_path=None,
        providers=providers,
        available_models_by_provider=available_models,
        run_configs={
            "run.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": inference_providers + [embedding_provider],
                    "vector_io": vector_io_providers,
                },
                default_models=default_models + [embedding_model],
                default_tool_groups=default_tool_groups,
                default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
            ),
        },
        run_config_env_vars={
            "LLAMA_STACK_PORT": (
                "8321",
                "Port for the Llama Stack distribution server",
            ),
        },
    )
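A quick, assumed smoke check for the new template entry point (names are taken from the file above; running it assumes a llama-stack development install):

# Sketch: exercise the new template module; illustrative only.
from llama_stack.templates.llama_api import get_distribution_template

template = get_distribution_template()
assert template.name == "llama_api"
print(sorted(template.providers))  # the APIs wired into this distribution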
167
llama_stack/templates/llama_api/run.yaml
Normal file
@@ -0,0 +1,167 @@
version: '2'
image_name: llama_api
apis:
- agents
- datasetio
- eval
- inference
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
providers:
  inference:
  - provider_id: llama-openai-compat
    provider_type: remote::llama-openai-compat
    config:
      openai_compat_api_base: https://api.llama.com/compat/v1/
      api_key: ${env.LLAMA_API_KEY:}
  - provider_id: sentence-transformers
    provider_type: inline::sentence-transformers
    config: {}
  vector_io:
  - provider_id: sqlite-vec
    provider_type: inline::sqlite-vec
    config:
      db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llama_api}/sqlite_vec.db
  - provider_id: ${env.ENABLE_CHROMADB+chromadb}
    provider_type: remote::chromadb
    config:
      url: ${env.CHROMADB_URL:}
  - provider_id: ${env.ENABLE_PGVECTOR+pgvector}
    provider_type: remote::pgvector
    config:
      host: ${env.PGVECTOR_HOST:localhost}
      port: ${env.PGVECTOR_PORT:5432}
      db: ${env.PGVECTOR_DB:}
      user: ${env.PGVECTOR_USER:}
      password: ${env.PGVECTOR_PASSWORD:}
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config:
      excluded_categories: []
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llama_api}/agents_store.db
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: ${env.OTEL_SERVICE_NAME:}
      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llama_api}/trace_store.db
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llama_api}/meta_reference_eval.db
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llama_api}/huggingface_datasetio.db
  - provider_id: localfs
    provider_type: inline::localfs
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llama_api}/localfs_datasetio.db
  scoring:
  - provider_id: basic
    provider_type: inline::basic
    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config:
      openai_api_key: ${env.OPENAI_API_KEY:}
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search
    config:
      api_key: ${env.BRAVE_SEARCH_API_KEY:}
      max_results: 3
  - provider_id: tavily-search
    provider_type: remote::tavily-search
    config:
      api_key: ${env.TAVILY_SEARCH_API_KEY:}
      max_results: 3
  - provider_id: code-interpreter
    provider_type: inline::code-interpreter
    config: {}
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
    config: {}
  - provider_id: model-context-protocol
    provider_type: remote::model-context-protocol
    config: {}
metadata_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llama_api}/registry.db
models:
- metadata: {}
  model_id: Llama-3.3-70B-Instruct
  provider_id: llama-openai-compat
  provider_model_id: Llama-3.3-70B-Instruct
  model_type: llm
- metadata: {}
  model_id: meta-llama/Llama-3.3-70B-Instruct
  provider_id: llama-openai-compat
  provider_model_id: Llama-3.3-70B-Instruct
  model_type: llm
- metadata: {}
  model_id: Llama-4-Scout-17B-16E-Instruct-FP8
  provider_id: llama-openai-compat
  provider_model_id: Llama-4-Scout-17B-16E-Instruct-FP8
  model_type: llm
- metadata: {}
  model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct
  provider_id: llama-openai-compat
  provider_model_id: Llama-4-Scout-17B-16E-Instruct-FP8
  model_type: llm
- metadata: {}
  model_id: Llama-4-Maverick-17B-128E-Instruct-FP8
  provider_id: llama-openai-compat
  provider_model_id: Llama-4-Maverick-17B-128E-Instruct-FP8
  model_type: llm
- metadata: {}
  model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct
  provider_id: llama-openai-compat
  provider_model_id: Llama-4-Maverick-17B-128E-Instruct-FP8
  model_type: llm
- metadata:
    embedding_dimension: 384
  model_id: all-MiniLM-L6-v2
  provider_id: sentence-transformers
  model_type: embedding
shields:
- shield_id: meta-llama/Llama-Guard-3-8B
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
  provider_id: tavily-search
- toolgroup_id: builtin::rag
  provider_id: rag-runtime
- toolgroup_id: builtin::code_interpreter
  provider_id: code-interpreter
server:
  port: 8321
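Since the generated run.yaml above is plain YAML, a small assumed sanity check (uses only PyYAML; the path is relative to the repo root):

# Sketch: load the generated run.yaml and check a few invariants.
import yaml

with open("llama_stack/templates/llama_api/run.yaml") as f:
    cfg = yaml.safe_load(f)

assert cfg["image_name"] == "llama_api"
assert set(cfg["apis"]) >= {"agents", "inference", "safety", "telemetry"}
print(f"{len(cfg['models'])} models, server port {cfg['server']['port']}")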
@@ -60,9 +60,9 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/meta-reference-gpu/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference

@@ -50,9 +50,9 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/meta-reference-gpu/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference

@@ -50,9 +50,9 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/nvidia/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/trace_store.db
   eval:
   - provider_id: nvidia
     provider_type: remote::nvidia

@@ -45,9 +45,9 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/nvidia/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/trace_store.db
   eval:
   - provider_id: nvidia
     provider_type: remote::nvidia

@@ -43,9 +43,9 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/ollama/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference

@@ -41,9 +41,9 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/ollama/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference

@@ -68,9 +68,9 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
    config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/open-benchmark/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open-benchmark}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference

@@ -50,9 +50,9 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/passthrough/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference

@@ -45,9 +45,9 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/passthrough/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference

@@ -88,9 +88,9 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/remote-vllm/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/trace_store.db
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search

@@ -81,9 +81,9 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/remote-vllm/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/trace_store.db
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search

@@ -51,9 +51,9 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/sambanova/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/sambanova}/trace_store.db
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search

@@ -45,9 +45,9 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/tgi/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference

@@ -44,9 +44,9 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/tgi/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference

@@ -50,9 +50,9 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/together/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference

@@ -45,9 +45,9 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/together/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference

@@ -78,9 +78,9 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/verification/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/verification}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference

@@ -49,9 +49,9 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/vllm-gpu/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference

@@ -43,9 +43,9 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/watsonx/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/watsonx}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "llama_stack"
-version = "0.2.1"
+version = "0.2.4"
 authors = [{ name = "Meta Llama", email = "llama-oss@meta.com" }]
 description = "Llama Stack"
 readme = "README.md"

@@ -27,7 +27,7 @@ dependencies = [
     "huggingface-hub",
     "jinja2>=3.1.6",
     "jsonschema",
-    "llama-stack-client>=0.2.2",
+    "llama-stack-client>=0.2.4",
     "openai>=1.66",
     "prompt-toolkit",
     "python-dotenv",

@@ -39,6 +39,7 @@ dependencies = [
     "tiktoken",
     "pillow",
     "h11>=0.16.0",
+    "kubernetes",
 ]

 [project.optional-dependencies]

@@ -67,7 +68,7 @@ unit = [
     "pypdf",
     "chardet",
     "qdrant-client",
-    "opentelemetry-exporter-otlp-proto-http"
+    "opentelemetry-exporter-otlp-proto-http",
 ]
 # These are the core dependencies required for running integration tests. They are shared across all
 # providers. If a provider requires additional dependencies, please add them to your environment

@@ -104,7 +105,7 @@ codegen = ["rich", "pydantic", "jinja2>=3.1.6"]
 ui = [
     "streamlit",
     "pandas",
-    "llama-stack-client>=0.2.1",
+    "llama-stack-client>=0.2.4",
     "streamlit-option-menu",
 ]

@@ -320,6 +321,7 @@ exclude = [
     "^llama_stack/strong_typing/serializer\\.py$",
     "^llama_stack/templates/dev/dev\\.py$",
     "^llama_stack/templates/groq/groq\\.py$",
+    "^llama_stack/templates/llama_api/llama_api\\.py$",
     "^llama_stack/templates/sambanova/sambanova\\.py$",
     "^llama_stack/templates/template\\.py$",
 ]
@@ -4,15 +4,18 @@ annotated-types==0.7.0
 anyio==4.8.0
 attrs==25.1.0
 blobfile==3.0.0
+cachetools==5.5.2
 certifi==2025.1.31
 charset-normalizer==3.4.1
 click==8.1.8
 colorama==0.4.6 ; sys_platform == 'win32'
 distro==1.9.0
+durationpy==0.9
 exceptiongroup==1.2.2 ; python_full_version < '3.11'
 filelock==3.17.0
 fire==0.7.0
 fsspec==2024.12.0
+google-auth==2.38.0
 h11==0.16.0
 httpcore==1.0.9
 httpx==0.28.1

@@ -22,18 +25,22 @@ jinja2==3.1.6
 jiter==0.8.2
 jsonschema==4.23.0
 jsonschema-specifications==2024.10.1
-llama-stack-client==0.2.2
+kubernetes==32.0.1
+llama-stack-client==0.2.4
 lxml==5.3.1
 markdown-it-py==3.0.0
 markupsafe==3.0.2
 mdurl==0.1.2
 numpy==2.2.3
+oauthlib==3.2.2
 openai==1.71.0
 packaging==24.2
 pandas==2.2.3
 pillow==11.1.0
 prompt-toolkit==3.0.50
 pyaml==25.1.0
+pyasn1==0.6.1
+pyasn1-modules==0.4.2
 pycryptodomex==3.21.0
 pydantic==2.10.6
 pydantic-core==2.27.2

@@ -45,8 +52,10 @@ pyyaml==6.0.2
 referencing==0.36.2
 regex==2024.11.6
 requests==2.32.3
+requests-oauthlib==2.0.0
 rich==13.9.4
 rpds-py==0.22.3
+rsa==4.9
 setuptools==75.8.0
 six==1.17.0
 sniffio==1.3.1

@@ -57,3 +66,4 @@ typing-extensions==4.12.2
 tzdata==2025.1
 urllib3==2.3.0
 wcwidth==0.2.13
+websocket-client==1.8.0
@@ -24,9 +24,9 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
+      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/ollama/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/trace_store.db
   datasetio:
   - provider_id: huggingface
     provider_type: remote::huggingface
@@ -14,6 +14,7 @@ from pathlib import Path

 import pytest
+import yaml
 from llama_stack_client import LlamaStackClient
 from openai import OpenAI

 from llama_stack import LlamaStackAsLibraryClient
 from llama_stack.apis.datatypes import Api

@@ -207,3 +208,9 @@ def llama_stack_client(request, provider_data, text_model_id):
         raise RuntimeError("Initialization failed")

     return client


+@pytest.fixture(scope="session")
+def openai_client(client_with_models):
+    base_url = f"{client_with_models.base_url}/v1/openai/v1"
+    return OpenAI(base_url=base_url, api_key="fake")
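For illustration, a test consuming the new session-scoped fixture above might look like this (the test body is an assumed sketch, not part of this commit; only the standard OpenAI client API is used):

# Sketch: exercise the openai_client fixture against the stack's OpenAI-compat endpoint.
def test_openai_models_list(openai_client):
    models = openai_client.models.list()
    assert any(m.id for m in models)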
37
tests/integration/test_cases/openai/responses.json
Normal file
@@ -0,0 +1,37 @@
{
  "non_streaming_01": {
    "data": {
      "question": "Which planet do humans live on?",
      "expected": "Earth"
    }
  },
  "non_streaming_02": {
    "data": {
      "question": "Which planet has rings around it with a name starting with letter S?",
      "expected": "Saturn"
    }
  },
  "streaming_01": {
    "data": {
      "question": "What's the name of the Sun in Latin?",
      "expected": "Sol"
    }
  },
  "streaming_02": {
    "data": {
      "question": "What is the name of the US capital?",
      "expected": "Washington"
    }
  },
  "tools_web_search_01": {
    "data": {
      "input": "How many experts does the Llama 4 Maverick model have?",
      "tools": [
        {
          "type": "web_search"
        }
      ],
      "expected": "128"
    }
  }
}
@@ -12,6 +12,7 @@ class TestCase:
     _apis = [
         "inference/chat_completion",
         "inference/completion",
+        "openai/responses",
     ]
     _jsonblob = {}

314
tests/unit/distribution/routers/test_routing_tables.py
Normal file
@@ -0,0 +1,314 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

# Unit tests for the routing tables

from unittest.mock import AsyncMock

import pytest

from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.datasets.datasets import Dataset, DatasetPurpose, URIDataSource
from llama_stack.apis.datatypes import Api
from llama_stack.apis.models.models import Model, ModelType
from llama_stack.apis.shields.shields import Shield
from llama_stack.apis.tools import ListToolDefsResponse, ToolDef, ToolParameter
from llama_stack.apis.vector_dbs.vector_dbs import VectorDB
from llama_stack.distribution.routers.routing_tables import (
    BenchmarksRoutingTable,
    DatasetsRoutingTable,
    ModelsRoutingTable,
    ScoringFunctionsRoutingTable,
    ShieldsRoutingTable,
    ToolGroupsRoutingTable,
    VectorDBsRoutingTable,
)
from llama_stack.distribution.store.registry import CachedDiskDistributionRegistry
from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig
from llama_stack.providers.utils.kvstore.sqlite import SqliteKVStoreImpl


@pytest.fixture
async def dist_registry(tmp_path):
    db_path = tmp_path / "test_kv.db"
    kvstore_config = SqliteKVStoreConfig(db_path=db_path.as_posix())
    kvstore = SqliteKVStoreImpl(kvstore_config)
    await kvstore.initialize()
    registry = CachedDiskDistributionRegistry(kvstore)
    await registry.initialize()
    yield registry


class Impl:
    def __init__(self, api: Api):
        self.api = api

    @property
    def __provider_spec__(self):
        _provider_spec = AsyncMock()
        _provider_spec.api = self.api
        return _provider_spec


class InferenceImpl(Impl):
    def __init__(self):
        super().__init__(Api.inference)

    async def register_model(self, model: Model):
        return model

    async def unregister_model(self, model_id: str):
        return model_id


class SafetyImpl(Impl):
    def __init__(self):
        super().__init__(Api.safety)

    async def register_shield(self, shield: Shield):
        return shield


class VectorDBImpl(Impl):
    def __init__(self):
        super().__init__(Api.vector_io)

    async def register_vector_db(self, vector_db: VectorDB):
        return vector_db

    async def unregister_vector_db(self, vector_db_id: str):
        return vector_db_id


class DatasetsImpl(Impl):
    def __init__(self):
        super().__init__(Api.datasetio)

    async def register_dataset(self, dataset: Dataset):
        return dataset

    async def unregister_dataset(self, dataset_id: str):
        return dataset_id


class ScoringFunctionsImpl(Impl):
    def __init__(self):
        super().__init__(Api.scoring)

    async def list_scoring_functions(self):
        return []

    async def register_scoring_function(self, scoring_fn):
        return scoring_fn


class BenchmarksImpl(Impl):
    def __init__(self):
        super().__init__(Api.eval)

    async def register_benchmark(self, benchmark):
        return benchmark


class ToolGroupsImpl(Impl):
    def __init__(self):
        super().__init__(Api.tool_runtime)

    async def register_tool(self, tool):
        return tool

    async def unregister_tool(self, tool_name: str):
        return tool_name

    async def list_runtime_tools(self, toolgroup_id, mcp_endpoint):
        return ListToolDefsResponse(
            data=[
                ToolDef(
                    name="test-tool",
                    description="Test tool",
                    parameters=[ToolParameter(name="test-param", description="Test param", parameter_type="string")],
                )
            ]
        )


@pytest.mark.asyncio
async def test_models_routing_table(dist_registry):
    table = ModelsRoutingTable({"test_provider": InferenceImpl()}, dist_registry)
    await table.initialize()

    # Register multiple models and verify listing
    await table.register_model(model_id="test-model", provider_id="test_provider")
    await table.register_model(model_id="test-model-2", provider_id="test_provider")

    models = await table.list_models()
    assert len(models.data) == 2
    model_ids = {m.identifier for m in models.data}
    assert "test-model" in model_ids
    assert "test-model-2" in model_ids

    # Test openai list models
    openai_models = await table.openai_list_models()
    assert len(openai_models.data) == 2
    openai_model_ids = {m.id for m in openai_models.data}
    assert "test-model" in openai_model_ids
    assert "test-model-2" in openai_model_ids

    # Test get_object_by_identifier
    model = await table.get_object_by_identifier("model", "test-model")
    assert model is not None
    assert model.identifier == "test-model"

    # Test get_object_by_identifier on non-existent object
    non_existent = await table.get_object_by_identifier("model", "non-existent-model")
    assert non_existent is None

    await table.unregister_model(model_id="test-model")
    await table.unregister_model(model_id="test-model-2")

    models = await table.list_models()
    assert len(models.data) == 0

    # Test openai list models
    openai_models = await table.openai_list_models()
    assert len(openai_models.data) == 0


@pytest.mark.asyncio
async def test_shields_routing_table(dist_registry):
    table = ShieldsRoutingTable({"test_provider": SafetyImpl()}, dist_registry)
    await table.initialize()

    # Register multiple shields and verify listing
    await table.register_shield(shield_id="test-shield", provider_id="test_provider")
    await table.register_shield(shield_id="test-shield-2", provider_id="test_provider")
    shields = await table.list_shields()

    assert len(shields.data) == 2
    shield_ids = {s.identifier for s in shields.data}
    assert "test-shield" in shield_ids
    assert "test-shield-2" in shield_ids


@pytest.mark.asyncio
async def test_vectordbs_routing_table(dist_registry):
    table = VectorDBsRoutingTable({"test_provider": VectorDBImpl()}, dist_registry)
    await table.initialize()

    m_table = ModelsRoutingTable({"test_providere": InferenceImpl()}, dist_registry)
    await m_table.initialize()
    await m_table.register_model(
        model_id="test-model",
        provider_id="test_providere",
        metadata={"embedding_dimension": 128},
        model_type=ModelType.embedding,
    )

    # Register multiple vector databases and verify listing
    await table.register_vector_db(vector_db_id="test-vectordb", embedding_model="test-model")
    await table.register_vector_db(vector_db_id="test-vectordb-2", embedding_model="test-model")
    vector_dbs = await table.list_vector_dbs()

    assert len(vector_dbs.data) == 2
    vector_db_ids = {v.identifier for v in vector_dbs.data}
    assert "test-vectordb" in vector_db_ids
    assert "test-vectordb-2" in vector_db_ids

    await table.unregister_vector_db(vector_db_id="test-vectordb")
    await table.unregister_vector_db(vector_db_id="test-vectordb-2")

    vector_dbs = await table.list_vector_dbs()
    assert len(vector_dbs.data) == 0


async def test_datasets_routing_table(dist_registry):
    table = DatasetsRoutingTable({"localfs": DatasetsImpl()}, dist_registry)
    await table.initialize()

    # Register multiple datasets and verify listing
    await table.register_dataset(
        dataset_id="test-dataset", purpose=DatasetPurpose.eval_messages_answer, source=URIDataSource(uri="test-uri")
    )
    await table.register_dataset(
        dataset_id="test-dataset-2", purpose=DatasetPurpose.eval_messages_answer, source=URIDataSource(uri="test-uri-2")
    )
    datasets = await table.list_datasets()

    assert len(datasets.data) == 2
    dataset_ids = {d.identifier for d in datasets.data}
    assert "test-dataset" in dataset_ids
    assert "test-dataset-2" in dataset_ids

    await table.unregister_dataset(dataset_id="test-dataset")
    await table.unregister_dataset(dataset_id="test-dataset-2")

    datasets = await table.list_datasets()
    assert len(datasets.data) == 0


@pytest.mark.asyncio
async def test_scoring_functions_routing_table(dist_registry):
    table = ScoringFunctionsRoutingTable({"test_provider": ScoringFunctionsImpl()}, dist_registry)
    await table.initialize()

    # Register multiple scoring functions and verify listing
    await table.register_scoring_function(
        scoring_fn_id="test-scoring-fn",
        provider_id="test_provider",
        description="Test scoring function",
        return_type=NumberType(),
    )
    await table.register_scoring_function(
        scoring_fn_id="test-scoring-fn-2",
        provider_id="test_provider",
        description="Another test scoring function",
        return_type=NumberType(),
    )
    scoring_functions = await table.list_scoring_functions()

    assert len(scoring_functions.data) == 2
    scoring_fn_ids = {fn.identifier for fn in scoring_functions.data}
    assert "test-scoring-fn" in scoring_fn_ids
    assert "test-scoring-fn-2" in scoring_fn_ids


@pytest.mark.asyncio
async def test_benchmarks_routing_table(dist_registry):
    table = BenchmarksRoutingTable({"test_provider": BenchmarksImpl()}, dist_registry)
    await table.initialize()

    # Register multiple benchmarks and verify listing
    await table.register_benchmark(
        benchmark_id="test-benchmark",
        dataset_id="test-dataset",
        scoring_functions=["test-scoring-fn", "test-scoring-fn-2"],
    )
    benchmarks = await table.list_benchmarks()

    assert len(benchmarks.data) == 1
    benchmark_ids = {b.identifier for b in benchmarks.data}
    assert "test-benchmark" in benchmark_ids


@pytest.mark.asyncio
async def test_tool_groups_routing_table(dist_registry):
    table = ToolGroupsRoutingTable({"test_provider": ToolGroupsImpl()}, dist_registry)
    await table.initialize()

    # Register multiple tool groups and verify listing
    await table.register_tool_group(
        toolgroup_id="test-toolgroup",
        provider_id="test_provider",
    )
    tool_groups = await table.list_tool_groups()

    assert len(tool_groups.data) == 1
    tool_group_ids = {tg.identifier for tg in tool_groups.data}
    assert "test-toolgroup" in tool_group_ids

    await table.unregister_toolgroup(toolgroup_id="test-toolgroup")
    tool_groups = await table.list_tool_groups()
    assert len(tool_groups.data) == 0
@@ -12,7 +12,7 @@ import pytest

 from llama_stack.apis.models import ModelType
 from llama_stack.distribution.datatypes import ModelWithACL
-from llama_stack.distribution.server.auth import AccessAttributes
+from llama_stack.distribution.server.auth_providers import AccessAttributes
 from llama_stack.distribution.store.registry import CachedDiskDistributionRegistry
 from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig
 from llama_stack.providers.utils.kvstore.sqlite import SqliteKVStoreImpl
@@ -10,7 +10,9 @@ import pytest
 from fastapi import FastAPI
 from fastapi.testclient import TestClient

+from llama_stack.distribution.datatypes import AccessAttributes
 from llama_stack.distribution.server.auth import AuthenticationMiddleware
+from llama_stack.distribution.server.auth_providers import AuthProviderConfig, AuthProviderType


 class MockResponse:

@@ -38,9 +40,23 @@ def invalid_api_key():


 @pytest.fixture
-def app(mock_auth_endpoint):
+def valid_token():
+    return "valid.jwt.token"
+
+
+@pytest.fixture
+def invalid_token():
+    return "invalid.jwt.token"
+
+
+@pytest.fixture
+def http_app(mock_auth_endpoint):
     app = FastAPI()
-    app.add_middleware(AuthenticationMiddleware, auth_endpoint=mock_auth_endpoint)
+    auth_config = AuthProviderConfig(
+        provider_type=AuthProviderType.CUSTOM,
+        config={"endpoint": mock_auth_endpoint},
+    )
+    app.add_middleware(AuthenticationMiddleware, auth_config=auth_config)

     @app.get("/test")
     def test_endpoint():

@@ -50,8 +66,29 @@ def app(mock_auth_endpoint):


 @pytest.fixture
-def client(app):
-    return TestClient(app)
+def k8s_app():
+    app = FastAPI()
+    auth_config = AuthProviderConfig(
+        provider_type=AuthProviderType.KUBERNETES,
+        config={"api_server_url": "https://kubernetes.default.svc"},
+    )
+    app.add_middleware(AuthenticationMiddleware, auth_config=auth_config)
+
+    @app.get("/test")
+    def test_endpoint():
+        return {"message": "Authentication successful"}
+
+    return app
+
+
+@pytest.fixture
+def http_client(http_app):
+    return TestClient(http_app)
+
+
+@pytest.fixture
+def k8s_client(k8s_app):
+    return TestClient(k8s_app)


 @pytest.fixture

@@ -61,7 +98,7 @@ def mock_scope():
         "path": "/models/list",
         "headers": [
             (b"content-type", b"application/json"),
-            (b"authorization", b"Bearer test-api-key"),
+            (b"authorization", b"Bearer test.jwt.token"),
             (b"user-agent", b"test-user-agent"),
         ],
         "query_string": b"limit=100&offset=0",

@@ -69,13 +106,38 @@


 @pytest.fixture
-def mock_middleware(mock_auth_endpoint):
+def mock_http_middleware(mock_auth_endpoint):
     mock_app = AsyncMock()
-    return AuthenticationMiddleware(mock_app, mock_auth_endpoint), mock_app
+    auth_config = AuthProviderConfig(
+        provider_type=AuthProviderType.CUSTOM,
+        config={"endpoint": mock_auth_endpoint},
+    )
+    return AuthenticationMiddleware(mock_app, auth_config), mock_app
+
+
+@pytest.fixture
+def mock_k8s_middleware():
+    mock_app = AsyncMock()
+    auth_config = AuthProviderConfig(
+        provider_type=AuthProviderType.KUBERNETES,
+        config={"api_server_url": "https://kubernetes.default.svc"},
+    )
+    return AuthenticationMiddleware(mock_app, auth_config), mock_app


 async def mock_post_success(*args, **kwargs):
-    return MockResponse(200, {"message": "Authentication successful"})
+    return MockResponse(
+        200,
+        {
+            "message": "Authentication successful",
+            "access_attributes": {
+                "roles": ["admin", "user"],
+                "teams": ["ml-team", "nlp-team"],
+                "projects": ["llama-3", "project-x"],
+                "namespaces": ["research", "production"],
+            },
+        },
+    )


 async def mock_post_failure(*args, **kwargs):

@@ -86,45 +148,46 @@ async def mock_post_exception(*args, **kwargs):
     raise Exception("Connection error")


-def test_missing_auth_header(client):
-    response = client.get("/test")
+# HTTP Endpoint Tests
+def test_missing_auth_header(http_client):
+    response = http_client.get("/test")
     assert response.status_code == 401
     assert "Missing or invalid Authorization header" in response.json()["error"]["message"]


-def test_invalid_auth_header_format(client):
-    response = client.get("/test", headers={"Authorization": "InvalidFormat token123"})
+def test_invalid_auth_header_format(http_client):
+    response = http_client.get("/test", headers={"Authorization": "InvalidFormat token123"})
     assert response.status_code == 401
     assert "Missing or invalid Authorization header" in response.json()["error"]["message"]


 @patch("httpx.AsyncClient.post", new=mock_post_success)
-def test_valid_authentication(client, valid_api_key):
-    response = client.get("/test", headers={"Authorization": f"Bearer {valid_api_key}"})
+def test_valid_http_authentication(http_client, valid_api_key):
+    response = http_client.get("/test", headers={"Authorization": f"Bearer {valid_api_key}"})
     assert response.status_code == 200
     assert response.json() == {"message": "Authentication successful"}


 @patch("httpx.AsyncClient.post", new=mock_post_failure)
-def test_invalid_authentication(client, invalid_api_key):
-    response = client.get("/test", headers={"Authorization": f"Bearer {invalid_api_key}"})
+def test_invalid_http_authentication(http_client, invalid_api_key):
+    response = http_client.get("/test", headers={"Authorization": f"Bearer {invalid_api_key}"})
     assert response.status_code == 401
     assert "Authentication failed" in response.json()["error"]["message"]


 @patch("httpx.AsyncClient.post", new=mock_post_exception)
-def test_auth_service_error(client, valid_api_key):
-    response = client.get("/test", headers={"Authorization": f"Bearer {valid_api_key}"})
+def test_http_auth_service_error(http_client, valid_api_key):
+    response = http_client.get("/test", headers={"Authorization": f"Bearer {valid_api_key}"})
     assert response.status_code == 401
     assert "Authentication service error" in response.json()["error"]["message"]


-def test_auth_request_payload(client, valid_api_key, mock_auth_endpoint):
+def test_http_auth_request_payload(http_client, valid_api_key, mock_auth_endpoint):
     with patch("httpx.AsyncClient.post") as mock_post:
         mock_response = MockResponse(200, {"message": "Authentication successful"})
         mock_post.return_value = mock_response

-        client.get(
+        http_client.get(
             "/test?param1=value1&param2=value2",
             headers={
                 "Authorization": f"Bearer {valid_api_key}",

@@ -149,40 +212,43 @@ def test_auth_request_payload(client, valid_api_key, mock_auth_endpoint):


 @pytest.mark.asyncio
-async def test_auth_middleware_with_access_attributes(mock_middleware, mock_scope):
-    middleware, mock_app = mock_middleware
+async def test_http_middleware_with_access_attributes(mock_http_middleware, mock_scope):
+    """Test HTTP middleware behavior with access attributes"""
+    middleware, mock_app = mock_http_middleware
     mock_receive = AsyncMock()
     mock_send = AsyncMock()

-    with patch("httpx.AsyncClient") as mock_client:
-        mock_client_instance = AsyncMock()
-        mock_client.return_value.__aenter__.return_value = mock_client_instance
-
-        mock_client_instance.post.return_value = MockResponse(
+    with patch("httpx.AsyncClient.post") as mock_post:
+        mock_response = MockResponse(
             200,
             {
                 "message": "Authentication successful",
                 "access_attributes": {
                     "roles": ["admin", "user"],
-                    "teams": ["ml-team"],
-                    "projects": ["project-x", "project-y"],
-                }
+                    "teams": ["ml-team", "nlp-team"],
+                    "projects": ["llama-3", "project-x"],
+                    "namespaces": ["research", "production"],
+                },
             },
         )
+        mock_post.return_value = mock_response

         await middleware(mock_scope, mock_receive, mock_send)

         assert "user_attributes" in mock_scope
-        assert mock_scope["user_attributes"]["roles"] == ["admin", "user"]
-        assert mock_scope["user_attributes"]["teams"] == ["ml-team"]
-        assert mock_scope["user_attributes"]["projects"] == ["project-x", "project-y"]
+        attributes = mock_scope["user_attributes"]
+        assert attributes["roles"] == ["admin", "user"]
+        assert attributes["teams"] == ["ml-team", "nlp-team"]
+        assert attributes["projects"] == ["llama-3", "project-x"]
+        assert attributes["namespaces"] == ["research", "production"]

         mock_app.assert_called_once_with(mock_scope, mock_receive, mock_send)


 @pytest.mark.asyncio
-async def test_auth_middleware_no_attributes(mock_middleware, mock_scope):
+async def test_http_middleware_no_attributes(mock_http_middleware, mock_scope):
     """Test middleware behavior with no access attributes"""
-    middleware, mock_app = mock_middleware
+    middleware, mock_app = mock_http_middleware
     mock_receive = AsyncMock()
     mock_send = AsyncMock()

@@ -203,4 +269,104 @@
     assert "user_attributes" in mock_scope
     attributes = mock_scope["user_attributes"]
     assert "namespaces" in attributes
-    assert attributes["namespaces"] == ["test-api-key"]
+    assert attributes["namespaces"] == ["test.jwt.token"]
+
+
+# Kubernetes Tests
+def test_missing_auth_header_k8s(k8s_client):
+    response = k8s_client.get("/test")
+    assert response.status_code == 401
+    assert "Missing or invalid Authorization header" in response.json()["error"]["message"]
+
+
+def test_invalid_auth_header_format_k8s(k8s_client):
+    response = k8s_client.get("/test", headers={"Authorization": "InvalidFormat token123"})
+    assert response.status_code == 401
+    assert "Missing or invalid Authorization header" in response.json()["error"]["message"]
+
+
+@patch("kubernetes.client.ApiClient")
+def test_valid_k8s_authentication(mock_api_client, k8s_client, valid_token):
+    # Mock the Kubernetes client
+    mock_client = AsyncMock()
+    mock_api_client.return_value = mock_client
+
+    # Mock successful token validation
+    mock_client.set_default_header = AsyncMock()
+
+    # Mock the token validation to return valid access attributes
+    with patch("llama_stack.distribution.server.auth_providers.KubernetesAuthProvider.validate_token") as mock_validate:
+        mock_validate.return_value = AccessAttributes(
+            roles=["admin"], teams=["ml-team"], projects=["llama-3"], namespaces=["research"]
+        )
+        response = k8s_client.get("/test", headers={"Authorization": f"Bearer {valid_token}"})
+        assert response.status_code == 200
+        assert response.json() == {"message": "Authentication successful"}
+
+
+@patch("kubernetes.client.ApiClient")
+def test_invalid_k8s_authentication(mock_api_client, k8s_client, invalid_token):
+    # Mock the Kubernetes client
+    mock_client = AsyncMock()
+    mock_api_client.return_value = mock_client
+
+    # Mock failed token validation by raising an exception
+    with patch("llama_stack.distribution.server.auth_providers.KubernetesAuthProvider.validate_token") as mock_validate:
+        mock_validate.side_effect = ValueError("Invalid or expired token")
+        response = k8s_client.get("/test", headers={"Authorization": f"Bearer {invalid_token}"})
+        assert response.status_code == 401
+        assert "Invalid or expired token" in response.json()["error"]["message"]
+
+
+@pytest.mark.asyncio
+async def test_k8s_middleware_with_access_attributes(mock_k8s_middleware, mock_scope):
+    middleware, mock_app = mock_k8s_middleware
+    mock_receive = AsyncMock()
+    mock_send = AsyncMock()
+
+    with patch("kubernetes.client.ApiClient") as mock_api_client:
+        mock_client = AsyncMock()
+        mock_api_client.return_value = mock_client
+
+        # Mock successful token validation
+        mock_client.set_default_header = AsyncMock()
+
+        # Mock token payload with access attributes
+        mock_token_parts = ["header", "eyJzdWIiOiJhZG1pbiIsImdyb3VwcyI6WyJtbC10ZWFtIl19", "signature"]
+        mock_scope["headers"][1] = (b"authorization", f"Bearer {'.'.join(mock_token_parts)}".encode())
+
+        await middleware(mock_scope, mock_receive, mock_send)
+
+        assert "user_attributes" in mock_scope
+        assert mock_scope["user_attributes"]["roles"] == ["admin"]
+        assert mock_scope["user_attributes"]["teams"] == ["ml-team"]
+
+        mock_app.assert_called_once_with(mock_scope, mock_receive, mock_send)
+
+
+@pytest.mark.asyncio
+async def test_k8s_middleware_no_attributes(mock_k8s_middleware, mock_scope):
+    """Test middleware behavior with no access attributes"""
+    middleware, mock_app = mock_k8s_middleware
+    mock_receive = AsyncMock()
+    mock_send = AsyncMock()
+
+    with patch("kubernetes.client.ApiClient") as mock_api_client:
+        mock_client = AsyncMock()
+        mock_api_client.return_value = mock_client
+
+        # Mock successful token validation
+        mock_client.set_default_header = AsyncMock()
+
+        # Mock token payload without access attributes
+        mock_token_parts = ["header", "eyJzdWIiOiJhZG1pbiJ9", "signature"]
+        mock_scope["headers"][1] = (b"authorization", f"Bearer {'.'.join(mock_token_parts)}".encode())
+
+        await middleware(mock_scope, mock_receive, mock_send)
+
+        assert "user_attributes" in mock_scope
+        attributes = mock_scope["user_attributes"]
+        assert "roles" in attributes
+        assert attributes["roles"] == ["admin"]
+
+        mock_app.assert_called_once_with(mock_scope, mock_receive, mock_send)
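The mock tokens in the Kubernetes tests above are not signed JWTs; their middle segments are plain base64-encoded JSON, which is what lets the middleware recover roles and groups without a real API server. A sketch of the decoding these mocks assume, using only the standard library:

# Sketch: decode the mock JWT payload segments used in the tests above.
import base64
import json

for payload in ("eyJzdWIiOiJhZG1pbiIsImdyb3VwcyI6WyJtbC10ZWFtIl19", "eyJzdWIiOiJhZG1pbiJ9"):
    padded = payload + "=" * (-len(payload) % 4)  # restore base64 padding
    print(json.loads(base64.b64decode(padded)))
# {'sub': 'admin', 'groups': ['ml-team']}
# {'sub': 'admin'}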
@@ -13,3 +13,5 @@ test_exclusions:
 - test_chat_non_streaming_image
 - test_chat_streaming_image
 - test_chat_multi_turn_multiple_images
+- test_response_non_streaming_image
+- test_response_non_streaming_multi_turn_image

@@ -13,3 +13,5 @@ test_exclusions:
 - test_chat_non_streaming_image
 - test_chat_streaming_image
 - test_chat_multi_turn_multiple_images
+- test_response_non_streaming_image
+- test_response_non_streaming_multi_turn_image

@@ -13,3 +13,5 @@ test_exclusions:
 - test_chat_non_streaming_image
 - test_chat_streaming_image
 - test_chat_multi_turn_multiple_images
+- test_response_non_streaming_image
+- test_response_non_streaming_multi_turn_image
@@ -16,7 +16,7 @@ Description:


 Configuration:
-- Provider details (models, display names) are loaded from `tests/verifications/config.yaml`.
+- Provider details (models, display names) are loaded from `tests/verifications/conf/*.yaml`.
 - Test cases are defined in YAML files within `tests/verifications/openai_api/fixtures/test_cases/`.
 - Test results are stored in `tests/verifications/test_results/`.
@@ -1,10 +1,15 @@
+# This is a temporary run file because model names used by the verification tests
+# are not quite consistent with various pre-existing distributions.
+#
 version: '2'
 image_name: openai-api-verification
 apis:
+- agents
 - inference
 - telemetry
 - tool_runtime
 - vector_io
+- safety
 providers:
   inference:
   - provider_id: together

@@ -16,12 +21,12 @@ providers:
     provider_type: remote::fireworks
     config:
       url: https://api.fireworks.ai/inference/v1
-      api_key: ${env.FIREWORKS_API_KEY}
+      api_key: ${env.FIREWORKS_API_KEY:}
   - provider_id: groq
     provider_type: remote::groq
     config:
       url: https://api.groq.com
-      api_key: ${env.GROQ_API_KEY}
+      api_key: ${env.GROQ_API_KEY:}
   - provider_id: openai
     provider_type: remote::openai
     config:

@@ -44,7 +49,20 @@ providers:
     config:
       service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/openai/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/openai-api-verification}/trace_store.db
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config:
+      excluded_categories: []
+  agents:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      persistence_store:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/openai}/agents_store.db
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
35
tests/verifications/openai_api/conftest.py
Normal file
@@ -0,0 +1,35 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from tests.verifications.openai_api.fixtures.fixtures import _load_all_verification_configs


def pytest_generate_tests(metafunc):
    """Dynamically parametrize tests based on the selected provider and config."""
    if "model" in metafunc.fixturenames:
        provider = metafunc.config.getoption("provider")
        if not provider:
            print("Warning: --provider not specified. Skipping model parametrization.")
            metafunc.parametrize("model", [])
            return

        try:
            config_data = _load_all_verification_configs()
        except (FileNotFoundError, IOError) as e:
            print(f"ERROR loading verification configs: {e}")
            config_data = {"providers": {}}

        provider_config = config_data.get("providers", {}).get(provider)
        if provider_config:
            models = provider_config.get("models", [])
            if models:
                metafunc.parametrize("model", models)
            else:
                print(f"Warning: No models found for provider '{provider}' in config.")
                metafunc.parametrize("model", [])  # Parametrize empty if no models found
        else:
            print(f"Warning: Provider '{provider}' not found in config. No models parametrized.")
            metafunc.parametrize("model", [])  # Parametrize empty if provider not found
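With this hook in place, the suite is pointed at a provider from the command line; assuming the --provider option is registered by the verification suite's root conftest, an invocation would look like:

    pytest tests/verifications/openai_api --provider=openai

Each test that accepts the model fixture then runs once per model listed for that provider in its conf/*.yaml file.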
@@ -5,14 +5,16 @@
# the root directory of this source tree.

import os
import re
from pathlib import Path

import pytest
import yaml
from openai import OpenAI

# --- Helper Functions ---


# --- Helper Function to Load Config ---
def _load_all_verification_configs():
    """Load and aggregate verification configs from the conf/ directory."""
    # Note: Path is relative to *this* file (fixtures.py)

@@ -44,7 +46,30 @@ def _load_all_verification_configs():
    return {"providers": all_provider_configs}


# --- End Helper Function ---
def case_id_generator(case):
    """Generate a test ID from the case's 'case_id' field, or use a default."""
    case_id = case.get("case_id")
    if isinstance(case_id, (str, int)):
        return re.sub(r"\W|^(?=\d)", "_", str(case_id))
    return None


def should_skip_test(verification_config, provider, model, test_name_base):
    """Check if a test should be skipped based on config exclusions."""
    provider_config = verification_config.get("providers", {}).get(provider)
    if not provider_config:
        return False  # No config for provider, don't skip

    exclusions = provider_config.get("test_exclusions", {}).get(model, [])
    return test_name_base in exclusions


# Helper to get the base test name from the request object
def get_base_test_name(request):
    return request.node.originalname


# --- End Helper Functions ---


@pytest.fixture(scope="session")
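A quick note on case_id_generator: the regex turns any non-word character, and the position before a leading digit, into an underscore so the case_id becomes a pytest-friendly test ID. For example, "llama image" becomes llama_image and "2nd-turn" becomes _2nd_turn.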
@@ -0,0 +1,65 @@
test_response_basic:
  test_name: test_response_basic
  test_params:
    case:
    - case_id: "earth"
      input: "Which planet do humans live on?"
      output: "earth"
    - case_id: "saturn"
      input: "Which planet has rings around it with a name starting with letter S?"
      output: "saturn"

test_response_multi_turn:
  test_name: test_response_multi_turn
  test_params:
    case:
    - case_id: "earth"
      turns:
      - input: "Which planet do humans live on?"
        output: "earth"
      - input: "What is the name of the planet from your previous response?"
        output: "earth"

test_response_web_search:
  test_name: test_response_web_search
  test_params:
    case:
    - case_id: "llama_experts"
      input: "How many experts does the Llama 4 Maverick model have?"
      tools:
      - type: web_search
        search_context_size: "low"
      output: "128"

test_response_image:
  test_name: test_response_image
  test_params:
    case:
    - case_id: "llama_image"
      input:
      - role: user
        content:
        - type: input_text
          text: "Identify the type of animal in this image."
        - type: input_image
          image_url: "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg"
      output: "llama"

test_response_multi_turn_image:
  test_name: test_response_multi_turn_image
  test_params:
    case:
    - case_id: "llama_image_search"
      turns:
      - input:
        - role: user
          content:
          - type: input_text
            text: "What type of animal is in this image? Please respond with a single word that starts with the letter 'L'."
          - type: input_image
            image_url: "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg"
        output: "llama"
      - input: "Search the web using the search tool for the animal from the previous response. Your search query should be a single phrase that includes the animal's name and the words 'maverick' and 'scout'."
        tools:
        - type: web_search
        output: "model"
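For reference, load_test_cases (imported by the test modules in this diff from tests/verifications/openai_api/fixtures/load.py) plausibly resolves a name such as "responses" to one of these YAML files; a minimal sketch under that assumption, not the module's actual code:

import yaml
from pathlib import Path

def load_test_cases(name: str) -> dict:
    # fixtures/test_cases/<name>.yaml, relative to the fixtures package.
    path = Path(__file__).parent / "test_cases" / f"{name}.yaml"
    return yaml.safe_load(path.read_text())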
@@ -7,7 +7,6 @@
import base64
import copy
import json
import re
from pathlib import Path
from typing import Any

@@ -16,7 +15,9 @@ from openai import APIError
from pydantic import BaseModel

from tests.verifications.openai_api.fixtures.fixtures import (
    _load_all_verification_configs,
    case_id_generator,
    get_base_test_name,
    should_skip_test,
)
from tests.verifications.openai_api.fixtures.load import load_test_cases

@@ -25,57 +26,6 @@ chat_completion_test_cases = load_test_cases("chat_completion")
THIS_DIR = Path(__file__).parent


def case_id_generator(case):
    """Generate a test ID from the case's 'case_id' field, or use a default."""
    case_id = case.get("case_id")
    if isinstance(case_id, (str, int)):
        return re.sub(r"\W|^(?=\d)", "_", str(case_id))
    return None


def pytest_generate_tests(metafunc):
    """Dynamically parametrize tests based on the selected provider and config."""
    if "model" in metafunc.fixturenames:
        provider = metafunc.config.getoption("provider")
        if not provider:
            print("Warning: --provider not specified. Skipping model parametrization.")
            metafunc.parametrize("model", [])
            return

        try:
            config_data = _load_all_verification_configs()
        except (FileNotFoundError, IOError) as e:
            print(f"ERROR loading verification configs: {e}")
            config_data = {"providers": {}}

        provider_config = config_data.get("providers", {}).get(provider)
        if provider_config:
            models = provider_config.get("models", [])
            if models:
                metafunc.parametrize("model", models)
            else:
                print(f"Warning: No models found for provider '{provider}' in config.")
                metafunc.parametrize("model", [])  # Parametrize empty if no models found
        else:
            print(f"Warning: Provider '{provider}' not found in config. No models parametrized.")
            metafunc.parametrize("model", [])  # Parametrize empty if provider not found


def should_skip_test(verification_config, provider, model, test_name_base):
    """Check if a test should be skipped based on config exclusions."""
    provider_config = verification_config.get("providers", {}).get(provider)
    if not provider_config:
        return False  # No config for provider, don't skip

    exclusions = provider_config.get("test_exclusions", {}).get(model, [])
    return test_name_base in exclusions


# Helper to get the base test name from the request object
def get_base_test_name(request):
    return request.node.originalname


@pytest.fixture
def multi_image_data():
    files = [
166
tests/verifications/openai_api/test_responses.py
Normal file
@@ -0,0 +1,166 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.


import pytest

from tests.verifications.openai_api.fixtures.fixtures import (
    case_id_generator,
    get_base_test_name,
    should_skip_test,
)
from tests.verifications.openai_api.fixtures.load import load_test_cases

responses_test_cases = load_test_cases("responses")


@pytest.mark.parametrize(
    "case",
    responses_test_cases["test_response_basic"]["test_params"]["case"],
    ids=case_id_generator,
)
def test_response_non_streaming_basic(request, openai_client, model, provider, verification_config, case):
    test_name_base = get_base_test_name(request)
    if should_skip_test(verification_config, provider, model, test_name_base):
        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")

    response = openai_client.responses.create(
        model=model,
        input=case["input"],
        stream=False,
    )
    output_text = response.output_text.lower().strip()
    assert len(output_text) > 0
    assert case["output"].lower() in output_text

    retrieved_response = openai_client.responses.retrieve(response_id=response.id)
    assert retrieved_response.output_text == response.output_text

    next_response = openai_client.responses.create(
        model=model, input="Repeat your previous response in all caps.", previous_response_id=response.id
    )
    next_output_text = next_response.output_text.strip()
    assert case["output"].upper() in next_output_text


@pytest.mark.parametrize(
    "case",
    responses_test_cases["test_response_basic"]["test_params"]["case"],
    ids=case_id_generator,
)
def test_response_streaming_basic(request, openai_client, model, provider, verification_config, case):
    test_name_base = get_base_test_name(request)
    if should_skip_test(verification_config, provider, model, test_name_base):
        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")

    response = openai_client.responses.create(
        model=model,
        input=case["input"],
        stream=True,
    )
    streamed_content = []
    response_id = ""
    for chunk in response:
        if chunk.type == "response.completed":
            response_id = chunk.response.id
            streamed_content.append(chunk.response.output_text.strip())

    assert len(streamed_content) > 0
    assert case["output"].lower() in "".join(streamed_content).lower()

    retrieved_response = openai_client.responses.retrieve(response_id=response_id)
    assert retrieved_response.output_text == "".join(streamed_content)


@pytest.mark.parametrize(
    "case",
    responses_test_cases["test_response_multi_turn"]["test_params"]["case"],
    ids=case_id_generator,
)
def test_response_non_streaming_multi_turn(request, openai_client, model, provider, verification_config, case):
    test_name_base = get_base_test_name(request)
    if should_skip_test(verification_config, provider, model, test_name_base):
        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")

    previous_response_id = None
    for turn in case["turns"]:
        response = openai_client.responses.create(
            model=model,
            input=turn["input"],
            previous_response_id=previous_response_id,
            tools=turn["tools"] if "tools" in turn else None,
        )
        previous_response_id = response.id
        output_text = response.output_text.lower()
        assert turn["output"].lower() in output_text


@pytest.mark.parametrize(
    "case",
    responses_test_cases["test_response_web_search"]["test_params"]["case"],
    ids=case_id_generator,
)
def test_response_non_streaming_web_search(request, openai_client, model, provider, verification_config, case):
    test_name_base = get_base_test_name(request)
    if should_skip_test(verification_config, provider, model, test_name_base):
        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")

    response = openai_client.responses.create(
        model=model,
        input=case["input"],
        tools=case["tools"],
        stream=False,
    )
    assert len(response.output) > 1
    assert response.output[0].type == "web_search_call"
    assert response.output[0].status == "completed"
    assert response.output[1].type == "message"
    assert response.output[1].status == "completed"
    assert response.output[1].role == "assistant"
    assert len(response.output[1].content) > 0
    assert case["output"].lower() in response.output_text.lower().strip()


@pytest.mark.parametrize(
    "case",
    responses_test_cases["test_response_image"]["test_params"]["case"],
    ids=case_id_generator,
)
def test_response_non_streaming_image(request, openai_client, model, provider, verification_config, case):
    test_name_base = get_base_test_name(request)
    if should_skip_test(verification_config, provider, model, test_name_base):
        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")

    response = openai_client.responses.create(
        model=model,
        input=case["input"],
        stream=False,
    )
    output_text = response.output_text.lower()
    assert case["output"].lower() in output_text


@pytest.mark.parametrize(
    "case",
    responses_test_cases["test_response_multi_turn_image"]["test_params"]["case"],
    ids=case_id_generator,
)
def test_response_non_streaming_multi_turn_image(request, openai_client, model, provider, verification_config, case):
    test_name_base = get_base_test_name(request)
    if should_skip_test(verification_config, provider, model, test_name_base):
        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")

    previous_response_id = None
    for turn in case["turns"]:
        response = openai_client.responses.create(
            model=model,
            input=turn["input"],
            previous_response_id=previous_response_id,
            tools=turn["tools"] if "tools" in turn else None,
        )
        previous_response_id = response.id
        output_text = response.output_text.lower()
        assert turn["output"].lower() in output_text
125
uv.lock
generated
@@ -1,5 +1,4 @@
version = 1
revision = 1
requires-python = ">=3.10"
resolution-markers = [
    "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",

@@ -676,6 +675,15 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/8f/d7/9322c609343d929e75e7e5e6255e614fcc67572cfd083959cdef3b7aad79/docutils-0.21.2-py3-none-any.whl", hash = "sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2", size = 587408 },
]

[[package]]
name = "durationpy"
version = "0.9"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/31/e9/f49c4e7fccb77fa5c43c2480e09a857a78b41e7331a75e128ed5df45c56b/durationpy-0.9.tar.gz", hash = "sha256:fd3feb0a69a0057d582ef643c355c40d2fa1c942191f914d12203b1a01ac722a", size = 3186 }
wheels = [
    { url = "https://files.pythonhosted.org/packages/4c/a3/ac312faeceffd2d8f86bc6dcb5c401188ba5a01bc88e69bed97578a0dfcd/durationpy-0.9-py3-none-any.whl", hash = "sha256:e65359a7af5cedad07fb77a2dd3f390f8eb0b74cb845589fa6c057086834dd38", size = 3461 },
]

[[package]]
name = "exceptiongroup"
version = "1.2.2"

@@ -842,6 +850,20 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/1d/9a/4114a9057db2f1462d5c8f8390ab7383925fe1ac012eaa42402ad65c2963/GitPython-3.1.44-py3-none-any.whl", hash = "sha256:9e0e10cda9bed1ee64bc9a6de50e7e38a9c9943241cd7f585f6df3ed28011110", size = 207599 },
]

[[package]]
name = "google-auth"
version = "2.38.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "cachetools" },
    { name = "pyasn1-modules" },
    { name = "rsa" },
]
sdist = { url = "https://files.pythonhosted.org/packages/c6/eb/d504ba1daf190af6b204a9d4714d457462b486043744901a6eeea711f913/google_auth-2.38.0.tar.gz", hash = "sha256:8285113607d3b80a3f1543b75962447ba8a09fe85783432a784fdeef6ac094c4", size = 270866 }
wheels = [
    { url = "https://files.pythonhosted.org/packages/9d/47/603554949a37bca5b7f894d51896a9c534b9eab808e2520a748e081669d0/google_auth-2.38.0-py2.py3-none-any.whl", hash = "sha256:e7dae6694313f434a2727bf2906f27ad259bae090d7aa896590d86feec3d9d4a", size = 210770 },
]

[[package]]
name = "googleapis-common-protos"
version = "1.67.0"

@@ -1289,6 +1311,28 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/c9/fb/108ecd1fe961941959ad0ee4e12ee7b8b1477247f30b1fdfd83ceaf017f0/jupyter_core-5.7.2-py3-none-any.whl", hash = "sha256:4f7315d2f6b4bcf2e3e7cb6e46772eba760ae459cd1f59d29eb57b0a01bd7409", size = 28965 },
]

[[package]]
name = "kubernetes"
version = "32.0.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "certifi" },
    { name = "durationpy" },
    { name = "google-auth" },
    { name = "oauthlib" },
    { name = "python-dateutil" },
    { name = "pyyaml" },
    { name = "requests" },
    { name = "requests-oauthlib" },
    { name = "six" },
    { name = "urllib3" },
    { name = "websocket-client" },
]
sdist = { url = "https://files.pythonhosted.org/packages/b7/e8/0598f0e8b4af37cd9b10d8b87386cf3173cb8045d834ab5f6ec347a758b3/kubernetes-32.0.1.tar.gz", hash = "sha256:42f43d49abd437ada79a79a16bd48a604d3471a117a8347e87db693f2ba0ba28", size = 946691 }
wheels = [
    { url = "https://files.pythonhosted.org/packages/08/10/9f8af3e6f569685ce3af7faab51c8dd9d93b9c38eba339ca31c746119447/kubernetes-32.0.1-py2.py3-none-any.whl", hash = "sha256:35282ab8493b938b08ab5526c7ce66588232df00ef5e1dbe88a419107dc10998", size = 1988070 },
]

[[package]]
name = "levenshtein"
version = "0.27.1"

@@ -1374,7 +1418,7 @@ wheels = [

[[package]]
name = "llama-stack"
version = "0.2.1"
version = "0.2.4"
source = { editable = "." }
dependencies = [
    { name = "blobfile" },

@@ -1384,6 +1428,7 @@ dependencies = [
    { name = "huggingface-hub" },
    { name = "jinja2" },
    { name = "jsonschema" },
    { name = "kubernetes" },
    { name = "llama-stack-client" },
    { name = "openai" },
    { name = "pillow" },

@@ -1485,8 +1530,9 @@ requires-dist = [
    { name = "jinja2", specifier = ">=3.1.6" },
    { name = "jinja2", marker = "extra == 'codegen'", specifier = ">=3.1.6" },
    { name = "jsonschema" },
    { name = "llama-stack-client", specifier = ">=0.2.2" },
    { name = "llama-stack-client", marker = "extra == 'ui'", specifier = ">=0.2.1" },
    { name = "kubernetes" },
    { name = "llama-stack-client", specifier = ">=0.2.4" },
    { name = "llama-stack-client", marker = "extra == 'ui'", specifier = ">=0.2.4" },
    { name = "mcp", marker = "extra == 'test'" },
    { name = "myst-parser", marker = "extra == 'docs'" },
    { name = "nbval", marker = "extra == 'dev'" },

@@ -1538,11 +1584,10 @@ requires-dist = [
    { name = "types-setuptools", marker = "extra == 'dev'" },
    { name = "uvicorn", marker = "extra == 'dev'" },
]
provides-extras = ["dev", "unit", "test", "docs", "codegen", "ui"]

[[package]]
name = "llama-stack-client"
version = "0.2.2"
version = "0.2.4"
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "anyio" },

@@ -1559,9 +1604,9 @@ dependencies = [
    { name = "tqdm" },
    { name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/fc/1c/7d3ab0e57195f21f9cf121fba2692ee8dc792793e5c82aa702602dda9bea/llama_stack_client-0.2.2.tar.gz", hash = "sha256:a0323b18b9f68172c639755652654452b7e72e28e77d95db5146e25d83002d34", size = 241914 }
sdist = { url = "https://files.pythonhosted.org/packages/d9/bd/bbbac1a766f33f947bd105338a2a469ef3a9faef78da20436f3f5d0adc95/llama_stack_client-0.2.4.tar.gz", hash = "sha256:51df03c7172739c37c222fb25072ee5f1f2943037d1e23336eb7c2408a294825", size = 254328 }
wheels = [
    { url = "https://files.pythonhosted.org/packages/9e/68/bdd9cb19e2c151d9aa8bf91444dfa9675bc7913006d8e1e030fb79dbf8c5/llama_stack_client-0.2.2-py3-none-any.whl", hash = "sha256:2a4ef3edb861e9a3a734e6e5e65d9d3de1f10cd56c18d21d82253088d2758e53", size = 273307 },
    { url = "https://files.pythonhosted.org/packages/14/85/9f8bf39a9201be82d32e1cdb03629b552bcc94bb3348e0f154c0e20a2c43/llama_stack_client-0.2.4-py3-none-any.whl", hash = "sha256:7541c6179e9afd5a1a94eed4d151a76d10869e3bde2506b16a9bdb52fc0a7a84", size = 292723 },
]

[[package]]

@@ -2022,6 +2067,15 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/17/7f/d322a4125405920401450118dbdc52e0384026bd669939484670ce8b2ab9/numpy-2.2.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:783145835458e60fa97afac25d511d00a1eca94d4a8f3ace9fe2043003c678e4", size = 12839607 },
]

[[package]]
name = "oauthlib"
version = "3.2.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/6d/fa/fbf4001037904031639e6bfbfc02badfc7e12f137a8afa254df6c4c8a670/oauthlib-3.2.2.tar.gz", hash = "sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918", size = 177352 }
wheels = [
    { url = "https://files.pythonhosted.org/packages/7e/80/cab10959dc1faead58dc8384a781dfbf93cb4d33d50988f7a69f1b7c9bbe/oauthlib-3.2.2-py3-none-any.whl", hash = "sha256:8139f29aac13e25d502680e9e19963e83f16838d48a0d71c287fe40e7067fbca", size = 151688 },
]

[[package]]
name = "openai"
version = "1.71.0"

@@ -2525,6 +2579,27 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/ed/bd/54907846383dcc7ee28772d7e646f6c34276a17da740002a5cefe90f04f7/pyarrow-19.0.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:58d9397b2e273ef76264b45531e9d552d8ec8a6688b7390b5be44c02a37aade8", size = 42085744 },
]

[[package]]
name = "pyasn1"
version = "0.6.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/ba/e9/01f1a64245b89f039897cb0130016d79f77d52669aae6ee7b159a6c4c018/pyasn1-0.6.1.tar.gz", hash = "sha256:6f580d2bdd84365380830acf45550f2511469f673cb4a5ae3857a3170128b034", size = 145322 }
wheels = [
    { url = "https://files.pythonhosted.org/packages/c8/f1/d6a797abb14f6283c0ddff96bbdd46937f64122b8c925cab503dd37f8214/pyasn1-0.6.1-py3-none-any.whl", hash = "sha256:0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629", size = 83135 },
]

[[package]]
name = "pyasn1-modules"
version = "0.4.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "pyasn1" },
]
sdist = { url = "https://files.pythonhosted.org/packages/e9/e6/78ebbb10a8c8e4b61a59249394a4a594c1a7af95593dc933a349c8d00964/pyasn1_modules-0.4.2.tar.gz", hash = "sha256:677091de870a80aae844b1ca6134f54652fa2c8c5a52aa396440ac3106e941e6", size = 307892 }
wheels = [
    { url = "https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl", hash = "sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a", size = 181259 },
]

[[package]]
name = "pycparser"
version = "2.22"

@@ -3135,6 +3210,19 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928 },
]

[[package]]
name = "requests-oauthlib"
version = "2.0.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "oauthlib" },
    { name = "requests" },
]
sdist = { url = "https://files.pythonhosted.org/packages/42/f2/05f29bc3913aea15eb670be136045bf5c5bbf4b99ecb839da9b422bb2c85/requests-oauthlib-2.0.0.tar.gz", hash = "sha256:b3dffaebd884d8cd778494369603a9e7b58d29111bf6b41bdc2dcd87203af4e9", size = 55650 }
wheels = [
    { url = "https://files.pythonhosted.org/packages/3b/5d/63d4ae3b9daea098d5d6f5da83984853c1bbacd5dc826764b249fe119d24/requests_oauthlib-2.0.0-py2.py3-none-any.whl", hash = "sha256:7dd8a5c40426b779b0868c404bdef9768deccf22749cde15852df527e6269b36", size = 24179 },
]

[[package]]
name = "rich"
version = "13.9.4"

@@ -3234,6 +3322,18 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/9f/2e/c5c1689e80298d4e94c75b70faada4c25445739d91b94c211244a3ed7ed1/rpds_py-0.22.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:177c7c0fce2855833819c98e43c262007f42ce86651ffbb84f37883308cb0e7d", size = 233338 },
]

[[package]]
name = "rsa"
version = "4.9"
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "pyasn1" },
]
sdist = { url = "https://files.pythonhosted.org/packages/aa/65/7d973b89c4d2351d7fb232c2e452547ddfa243e93131e7cfa766da627b52/rsa-4.9.tar.gz", hash = "sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21", size = 29711 }
wheels = [
    { url = "https://files.pythonhosted.org/packages/49/97/fa78e3d2f65c02c8e1268b9aba606569fe97f6c8f7c2d74394553347c145/rsa-4.9-py3-none-any.whl", hash = "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7", size = 34315 },
]

[[package]]
name = "ruamel-yaml"
version = "0.18.10"

@@ -4109,6 +4209,15 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859", size = 34166 },
]

[[package]]
name = "websocket-client"
version = "1.8.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/e6/30/fba0d96b4b5fbf5948ed3f4681f7da2f9f64512e1d303f94b4cc174c24a5/websocket_client-1.8.0.tar.gz", hash = "sha256:3239df9f44da632f96012472805d40a23281a991027ce11d2f45a6f24ac4c3da", size = 54648 }
wheels = [
    { url = "https://files.pythonhosted.org/packages/5a/84/44687a29792a70e111c5c477230a72c4b957d88d16141199bf9acb7537a3/websocket_client-1.8.0-py3-none-any.whl", hash = "sha256:17b44cc997f5c498e809b22cdf2d9c7a9e71c02c8cc2b6c56e7c2d1239bfa526", size = 58826 },
]

[[package]]
name = "websockets"
version = "15.0"