Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-04 12:07:34 +00:00)

Merge branch 'main' into fix/vector-db-mandatory-provider-id

Commit 4374da02f3: 243 changed files with 21774 additions and 17408 deletions

.github/actions/run-and-record-tests/action.yml (60 lines changed)

@@ -2,26 +2,28 @@ name: 'Run and Record Tests'
 description: 'Run integration tests and handle recording/artifact upload'
 
 inputs:
-  test-subdirs:
-    description: 'Comma-separated list of test subdirectories to run'
-    required: true
-  test-pattern:
-    description: 'Regex pattern to pass to pytest -k'
-    required: false
-    default: ''
   stack-config:
     description: 'Stack configuration to use'
     required: true
-  provider:
-    description: 'Provider to use for tests'
-    required: true
+  setup:
+    description: 'Setup to use for tests (e.g., ollama, gpt, vllm)'
+    required: false
+    default: ''
   inference-mode:
     description: 'Inference mode (record or replay)'
    required: true
-  run-vision-tests:
-    description: 'Whether to run vision tests'
-    required: false
-    default: 'false'
+  suite:
+    description: 'Test suite to use: base, responses, vision, etc.'
+    required: false
+    default: ''
+  subdirs:
+    description: 'Comma-separated list of test subdirectories to run; overrides suite'
+    required: false
+    default: ''
+  pattern:
+    description: 'Regex pattern to pass to pytest -k'
+    required: false
+    default: ''
 
 runs:
   using: 'composite'

@@ -36,14 +38,23 @@ runs:
     - name: Run Integration Tests
       shell: bash
       run: |
-        uv run --no-sync ./scripts/integration-tests.sh \
-          --stack-config '${{ inputs.stack-config }}' \
-          --provider '${{ inputs.provider }}' \
-          --test-subdirs '${{ inputs.test-subdirs }}' \
-          --test-pattern '${{ inputs.test-pattern }}' \
-          --inference-mode '${{ inputs.inference-mode }}' \
-          ${{ inputs.run-vision-tests == 'true' && '--run-vision-tests' || '' }} \
-          | tee pytest-${{ inputs.inference-mode }}.log
+        SCRIPT_ARGS="--stack-config ${{ inputs.stack-config }} --inference-mode ${{ inputs.inference-mode }}"
+
+        # Add optional arguments only if they are provided
+        if [ -n '${{ inputs.setup }}' ]; then
+          SCRIPT_ARGS="$SCRIPT_ARGS --setup ${{ inputs.setup }}"
+        fi
+        if [ -n '${{ inputs.suite }}' ]; then
+          SCRIPT_ARGS="$SCRIPT_ARGS --suite ${{ inputs.suite }}"
+        fi
+        if [ -n '${{ inputs.subdirs }}' ]; then
+          SCRIPT_ARGS="$SCRIPT_ARGS --subdirs ${{ inputs.subdirs }}"
+        fi
+        if [ -n '${{ inputs.pattern }}' ]; then
+          SCRIPT_ARGS="$SCRIPT_ARGS --pattern ${{ inputs.pattern }}"
+        fi
+
+        uv run --no-sync ./scripts/integration-tests.sh $SCRIPT_ARGS | tee pytest-${{ inputs.inference-mode }}.log
 
     - name: Commit and push recordings

@@ -57,12 +68,7 @@ runs:
           echo "New recordings detected, committing and pushing"
           git add tests/integration/recordings/
 
-          if [ "${{ inputs.run-vision-tests }}" == "true" ]; then
-            git commit -m "Recordings update from CI (vision)"
-          else
-            git commit -m "Recordings update from CI"
-          fi
+          git commit -m "Recordings update from CI (suite: ${{ inputs.suite }})"
 
           git fetch origin ${{ github.ref_name }}
           git rebase origin/${{ github.ref_name }}
           echo "Rebased successfully"

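For orientation, here is a minimal sketch of the command the rewritten step ends up running once the optional inputs are folded into SCRIPT_ARGS. The flag names come from the action above; the concrete values (server:ci-tests, replay, ollama, base) are illustrative only:

```bash
# Illustrative expansion of the SCRIPT_ARGS logic above; values are examples, not defaults.
uv run --no-sync ./scripts/integration-tests.sh \
  --stack-config server:ci-tests \
  --inference-mode replay \
  --setup ollama \
  --suite base | tee pytest-replay.log
```
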
.github/actions/setup-ollama/action.yml (8 lines changed)

@@ -1,17 +1,17 @@
 name: Setup Ollama
 description: Start Ollama
 inputs:
-  run-vision-tests:
-    description: 'Run vision tests: "true" or "false"'
+  suite:
+    description: 'Test suite to use: base, responses, vision, etc.'
     required: false
-    default: 'false'
+    default: ''
 runs:
   using: "composite"
   steps:
     - name: Start Ollama
       shell: bash
       run: |
-        if [ "${{ inputs.run-vision-tests }}" == "true" ]; then
+        if [ "${{ inputs.suite }}" == "vision" ]; then
          image="ollama-with-vision-model"
        else
          image="ollama-with-models"

@@ -8,14 +8,14 @@ inputs:
   client-version:
     description: 'Client version (latest or published)'
     required: true
-  provider:
-    description: 'Provider to setup (ollama or vllm)'
-    required: true
-    default: 'ollama'
-  run-vision-tests:
-    description: 'Whether to setup provider for vision tests'
-    required: false
-    default: 'false'
+  setup:
+    description: 'Setup to configure (ollama, vllm, gpt, etc.)'
+    required: false
+    default: 'ollama'
+  suite:
+    description: 'Test suite to use: base, responses, vision, etc.'
+    required: false
+    default: ''
   inference-mode:
     description: 'Inference mode (record or replay)'
     required: true

@@ -30,13 +30,13 @@ runs:
       client-version: ${{ inputs.client-version }}
 
   - name: Setup ollama
-    if: ${{ inputs.provider == 'ollama' && inputs.inference-mode == 'record' }}
+    if: ${{ (inputs.setup == 'ollama' || inputs.setup == 'ollama-vision') && inputs.inference-mode == 'record' }}
     uses: ./.github/actions/setup-ollama
     with:
-      run-vision-tests: ${{ inputs.run-vision-tests }}
+      suite: ${{ inputs.suite }}
 
   - name: Setup vllm
-    if: ${{ inputs.provider == 'vllm' && inputs.inference-mode == 'record' }}
+    if: ${{ inputs.setup == 'vllm' && inputs.inference-mode == 'record' }}
     uses: ./.github/actions/setup-vllm
 
   - name: Build Llama Stack

.github/workflows/README.md (3 lines changed)

@@ -5,10 +5,11 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a tabl
 | Name | File | Purpose |
 | ---- | ---- | ------- |
 | Update Changelog | [changelog.yml](changelog.yml) | Creates PR for updating the CHANGELOG.md |
+| API Conformance Tests | [conformance.yml](conformance.yml) | Run the API Conformance test suite on the changes. |
 | Installer CI | [install-script-ci.yml](install-script-ci.yml) | Test the installation script |
 | Integration Auth Tests | [integration-auth-tests.yml](integration-auth-tests.yml) | Run the integration test suite with Kubernetes authentication |
 | SqlStore Integration Tests | [integration-sql-store-tests.yml](integration-sql-store-tests.yml) | Run the integration test suite with SqlStore |
-| Integration Tests (Replay) | [integration-tests.yml](integration-tests.yml) | Run the integration test suite from tests/integration in replay mode |
+| Integration Tests (Replay) | [integration-tests.yml](integration-tests.yml) | Run the integration test suites from tests/integration in replay mode |
 | Vector IO Integration Tests | [integration-vector-io-tests.yml](integration-vector-io-tests.yml) | Run the integration test suite with various VectorIO providers |
 | Pre-commit | [pre-commit.yml](pre-commit.yml) | Run pre-commit checks |
 | Test Llama Stack Build | [providers-build.yml](providers-build.yml) | Test llama stack build |

.github/workflows/conformance.yml (new file, 57 lines)

@@ -0,0 +1,57 @@
+# API Conformance Tests
+# This workflow ensures that API changes maintain backward compatibility and don't break existing integrations
+# It runs schema validation and OpenAPI diff checks to catch breaking changes early
+
+name: API Conformance Tests
+
+run-name: Run the API Conformance test suite on the changes.
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+    types: [opened, synchronize, reopened]
+    paths:
+      - 'llama_stack/**'
+      - '!llama_stack/ui/**'
+      - 'tests/**'
+      - 'uv.lock'
+      - 'pyproject.toml'
+      - '.github/workflows/conformance.yml' # This workflow itself
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
+  # Cancel in-progress runs when new commits are pushed to avoid wasting CI resources
+  cancel-in-progress: true
+
+jobs:
+  # Job to check if API schema changes maintain backward compatibility
+  check-schema-compatibility:
+    runs-on: ubuntu-latest
+    steps:
+      # Using specific version 4.1.7 because 5.0.0 fails when trying to run this locally using `act`
+      # This ensures consistent behavior between local testing and CI
+      - name: Checkout PR Code
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+
+      # Checkout the base branch to compare against (usually main)
+      # This allows us to diff the current changes against the previous state
+      - name: Checkout Base Branch
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        with:
+          ref: ${{ github.event.pull_request.base.ref }}
+          path: 'base'
+
+      # Install oasdiff: https://github.com/oasdiff/oasdiff, a tool for detecting breaking changes in OpenAPI specs.
+      - name: Install oasdiff
+        run: |
+          curl -fsSL https://raw.githubusercontent.com/oasdiff/oasdiff/main/install.sh | sh
+
+      # Run oasdiff to detect breaking changes in the API specification
+      # This step will fail if incompatible changes are detected, preventing breaking changes from being merged
+      - name: Run OpenAPI Breaking Change Diff
+        run: |
+          oasdiff breaking --fail-on ERR base/docs/_static/llama-stack-spec.yaml docs/_static/llama-stack-spec.yaml --match-path '^/v1/openai/v1' \
+            --match-path '^/v1/vector-io' \
+            --match-path '^/v1/vector-dbs'

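The same breaking-change check can be reproduced outside CI. A minimal sketch, assuming oasdiff has been installed with the install script above and the base branch's spec has already been checked out under base/ (both paths and all flags are taken from the workflow):

```bash
# Install oasdiff the same way the workflow does
curl -fsSL https://raw.githubusercontent.com/oasdiff/oasdiff/main/install.sh | sh

# Fail on breaking changes, restricted to the path prefixes the workflow checks
oasdiff breaking --fail-on ERR \
  base/docs/_static/llama-stack-spec.yaml docs/_static/llama-stack-spec.yaml \
  --match-path '^/v1/openai/v1' \
  --match-path '^/v1/vector-io' \
  --match-path '^/v1/vector-dbs'
```
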
.github/workflows/integration-tests.yml (32 lines changed)

@@ -1,6 +1,6 @@
 name: Integration Tests (Replay)
 
-run-name: Run the integration test suite from tests/integration in replay mode
+run-name: Run the integration test suites from tests/integration in replay mode
 
 on:
   push:

@@ -28,18 +28,10 @@ on:
       description: 'Test against both the latest and published versions'
       type: boolean
       default: false
-    test-provider:
-      description: 'Test against a specific provider'
+    test-setup:
+      description: 'Test against a specific setup'
       type: string
       default: 'ollama'
-    test-subdirs:
-      description: 'Comma-separated list of test subdirectories to run'
-      type: string
-      default: ''
-    test-pattern:
-      description: 'Regex pattern to pass to pytest -k'
-      type: string
-      default: ''
 
 concurrency:
   # Skip concurrency for pushes to main - each commit should be tested independently

@@ -50,18 +42,18 @@ jobs:
 
   run-replay-mode-tests:
     runs-on: ubuntu-latest
-    name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, vision={4})', matrix.client-type, matrix.provider, matrix.python-version, matrix.client-version, matrix.run-vision-tests) }}
+    name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, {4})', matrix.client-type, matrix.setup, matrix.python-version, matrix.client-version, matrix.suite) }}
 
     strategy:
       fail-fast: false
       matrix:
         client-type: [library, server]
-        # Use vllm on weekly schedule, otherwise use test-provider input (defaults to ollama)
-        provider: ${{ (github.event.schedule == '1 0 * * 0') && fromJSON('["vllm"]') || fromJSON(format('["{0}"]', github.event.inputs.test-provider || 'ollama')) }}
+        # Use vllm on weekly schedule, otherwise use test-setup input (defaults to ollama)
+        setup: ${{ (github.event.schedule == '1 0 * * 0') && fromJSON('["vllm"]') || fromJSON(format('["{0}"]', github.event.inputs.test-setup || 'ollama')) }}
         # Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
         python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
         client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
-        run-vision-tests: [true, false]
+        suite: [base, vision]
 
     steps:
       - name: Checkout repository

@@ -72,16 +64,14 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}
           client-version: ${{ matrix.client-version }}
-          provider: ${{ matrix.provider }}
-          run-vision-tests: ${{ matrix.run-vision-tests }}
+          setup: ${{ matrix.setup }}
+          suite: ${{ matrix.suite }}
           inference-mode: 'replay'
 
       - name: Run tests
         uses: ./.github/actions/run-and-record-tests
         with:
-          test-subdirs: ${{ inputs.test-subdirs }}
-          test-pattern: ${{ inputs.test-pattern }}
           stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }}
-          provider: ${{ matrix.provider }}
+          setup: ${{ matrix.setup }}
           inference-mode: 'replay'
-          run-vision-tests: ${{ matrix.run-vision-tests }}
+          suite: ${{ matrix.suite }}

.github/workflows/pre-commit.yml (5 lines changed)

@@ -28,7 +28,7 @@ jobs:
           fetch-depth: ${{ github.actor == 'dependabot[bot]' && 0 || 1 }}
 
       - name: Set up Python
-        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
+        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
         with:
           python-version: '3.12'
           cache: pip

@@ -37,7 +37,7 @@ jobs:
             .pre-commit-config.yaml
 
       - name: Set up Node.js
-        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
+        uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0
         with:
           node-version: '20'
           cache: 'npm'

@@ -48,7 +48,6 @@ jobs:
         working-directory: llama_stack/ui
 
       - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
-        continue-on-error: true
         env:
           SKIP: no-commit-to-branch
           RUFF_OUTPUT_FORMAT: github

.github/workflows/python-build-test.yml (2 lines changed)

@@ -24,7 +24,7 @@ jobs:
         uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
 
       - name: Install uv
-        uses: astral-sh/setup-uv@4959332f0f014c5280e7eac8b70c90cb574c9f9b # v6.6.0
+        uses: astral-sh/setup-uv@557e51de59eb14aaaba2ed9621916900a91d50c6 # v6.6.1
         with:
           python-version: ${{ matrix.python-version }}
           activate-environment: true

.github/workflows/record-integration-tests.yml (42 lines changed)

@@ -10,19 +10,19 @@ run-name: Run the integration test suite from tests/integration
 on:
   workflow_dispatch:
     inputs:
-      test-subdirs:
-        description: 'Comma-separated list of test subdirectories to run'
-        type: string
-        default: ''
-      test-provider:
-        description: 'Test against a specific provider'
+      test-setup:
+        description: 'Test against a specific setup'
         type: string
         default: 'ollama'
-      run-vision-tests:
-        description: 'Whether to run vision tests'
-        type: boolean
-        default: false
-      test-pattern:
+      suite:
+        description: 'Test suite to use: base, responses, vision, etc.'
+        type: string
+        default: ''
+      subdirs:
+        description: 'Comma-separated list of test subdirectories to run; overrides suite'
+        type: string
+        default: ''
+      pattern:
         description: 'Regex pattern to pass to pytest -k'
         type: string
         default: ''

@@ -38,11 +38,11 @@ jobs:
       - name: Echo workflow inputs
         run: |
           echo "::group::Workflow Inputs"
-          echo "test-subdirs: ${{ inputs.test-subdirs }}"
-          echo "test-provider: ${{ inputs.test-provider }}"
-          echo "run-vision-tests: ${{ inputs.run-vision-tests }}"
-          echo "test-pattern: ${{ inputs.test-pattern }}"
           echo "branch: ${{ github.ref_name }}"
+          echo "test-setup: ${{ inputs.test-setup }}"
+          echo "suite: ${{ inputs.suite }}"
+          echo "subdirs: ${{ inputs.subdirs }}"
+          echo "pattern: ${{ inputs.pattern }}"
           echo "::endgroup::"
 
       - name: Checkout repository

@@ -55,16 +55,16 @@ jobs:
         with:
           python-version: "3.12" # Use single Python version for recording
           client-version: "latest"
-          provider: ${{ inputs.test-provider || 'ollama' }}
-          run-vision-tests: ${{ inputs.run-vision-tests }}
+          setup: ${{ inputs.test-setup || 'ollama' }}
+          suite: ${{ inputs.suite }}
           inference-mode: 'record'
 
       - name: Run and record tests
         uses: ./.github/actions/run-and-record-tests
         with:
-          test-pattern: ${{ inputs.test-pattern }}
-          test-subdirs: ${{ inputs.test-subdirs }}
           stack-config: 'server:ci-tests' # recording must be done with server since more tests are run
-          provider: ${{ inputs.test-provider || 'ollama' }}
+          setup: ${{ inputs.test-setup || 'ollama' }}
           inference-mode: 'record'
-          run-vision-tests: ${{ inputs.run-vision-tests }}
+          suite: ${{ inputs.suite }}
+          subdirs: ${{ inputs.subdirs }}
+          pattern: ${{ inputs.pattern }}

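Because the recording workflow is triggered via workflow_dispatch, a run can also be started from the command line. A minimal sketch, assuming an authenticated GitHub CLI; the input names come from the workflow above and the values are illustrative:

```bash
# Record the vision suite against the ollama setup (illustrative values)
gh workflow run record-integration-tests.yml \
  -f test-setup=ollama \
  -f suite=vision
```
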
.github/workflows/stale_bot.yml (2 lines changed)

@@ -24,7 +24,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Stale Action
-        uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
+        uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
         with:
           stale-issue-label: 'stale'
           stale-issue-message: >

.github/workflows/ui-unit-tests.yml (2 lines changed)

@@ -29,7 +29,7 @@ jobs:
         uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
 
       - name: Setup Node.js
-        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
+        uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0
         with:
           node-version: ${{ matrix.node-version }}
           cache: 'npm'

.gitignore (2 lines changed)

@@ -26,5 +26,7 @@ venv/
 pytest-report.xml
 .coverage
 .python-version
+AGENTS.md
+server.log
 CLAUDE.md
 .claude/

@@ -86,7 +86,7 @@ repos:
         language: python
         pass_filenames: false
         require_serial: true
-        files: ^llama_stack/templates/.*$|^llama_stack/providers/.*/inference/.*/models\.py$
+        files: ^llama_stack/distributions/.*$|^llama_stack/providers/.*/inference/.*/models\.py$
       - id: provider-codegen
         name: Provider Codegen
         additional_dependencies:

CHANGELOG.md (98 lines changed)

@@ -1,5 +1,103 @@
 # Changelog
 
+# v0.2.20
+Published on: 2025-08-29T22:25:32Z
+
+Here are some key changes that are coming as part of this release.
+
+### Build and Environment
+
+- Environment improvements: fixed env var replacement to preserve types.
+- Docker stability: fixed container startup failures for Fireworks AI provider.
+- Removed absolute paths in build for better portability.
+
+### Features
+
+- UI Enhancements: Implemented file upload and VectorDB creation/configuration directly in UI.
+- Vector Store Improvements: Added keyword, vector, and hybrid search inside vector store.
+- Added S3 authorization support for file providers.
+- SQL Store: Added inequality support to where clause.
+
+### Documentation
+
+- Fixed post-training docs.
+- Added Contributor Guidelines for creating Internal vs. External providers.
+
+### Fixes
+
+- Removed unsupported bfcl scoring function.
+- Multiple reliability and configuration fixes for providers and environment handling.
+
+### Engineering / Chores
+
+- Cleaner internal development setup with consistent paths.
+- Incremental improvements to provider integration and vector store behavior.
+
+### New Contributors
+- @omertuc made their first contribution in #3270
+- @r3v5 made their first contribution in vector store hybrid search
+
+---
+
+# v0.2.19
+Published on: 2025-08-26T22:06:55Z
+
+## Highlights
+* feat: Add CORS configuration support for server by @skamenan7 in https://github.com/llamastack/llama-stack/pull/3201
+* feat(api): introduce /rerank by @ehhuang in https://github.com/llamastack/llama-stack/pull/2940
+* feat: Add S3 Files Provider by @mattf in https://github.com/llamastack/llama-stack/pull/3202
+
+---
+
+# v0.2.18
+Published on: 2025-08-20T01:09:27Z
+
+## Highlights
+* Add moderations create API
+* Hybrid search in Milvus
+* Numerous Responses API improvements
+* Documentation updates
+
+---
+
+# v0.2.17
+Published on: 2025-08-05T01:51:14Z
+
+## Highlights
+
+* feat(tests): introduce inference record/replay to increase test reliability by @ashwinb in https://github.com/meta-llama/llama-stack/pull/2941
+* fix(library_client): improve initialization error handling and prevent AttributeError by @mattf in https://github.com/meta-llama/llama-stack/pull/2944
+* fix: use OLLAMA_URL to activate Ollama provider in starter by @ashwinb in https://github.com/meta-llama/llama-stack/pull/2963
+* feat(UI): adding MVP playground UI by @franciscojavierarceo in https://github.com/meta-llama/llama-stack/pull/2828
+* Standardization of errors (@nathan-weinberg)
+* feat: Enable DPO training with HuggingFace inline provider by @Nehanth in https://github.com/meta-llama/llama-stack/pull/2825
+* chore: rename templates to distributions by @ashwinb in https://github.com/meta-llama/llama-stack/pull/3035
+
+---
+
+# v0.2.16
+Published on: 2025-07-28T23:35:23Z
+
+## Highlights
+
+* Automatic model registration for self-hosted providers (ollama and vllm currently). No need for `INFERENCE_MODEL` environment variables which need to be updated, etc.
+* Much simplified starter distribution. Most `ENABLE_` env variables are now gone. When you set `VLLM_URL`, the `vllm` provider is auto-enabled. Similar for `MILVUS_URL`, `PGVECTOR_DB`, etc. Check the [run.yaml](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/templates/starter/run.yaml) for more details.
+* All tests migrated to pytest now (thanks @Elbehery)
+* DPO implementation in the post-training provider (thanks @Nehanth)
+* (Huge!) Support for external APIs and providers thereof (thanks @leseb, @cdoern and others). This is a really big deal -- you can now add more APIs completely out of tree and experiment with them before (optionally) wanting to contribute back.
+* `inline::vllm` provider is gone thank you very much
+* several improvements to OpenAI inference implementations and LiteLLM backend (thanks @mattf)
+* Chroma now supports Vector Store API (thanks @franciscojavierarceo).
+* Authorization improvements: Vector Store/File APIs now supports access control (thanks @franciscojavierarceo); Telemetry read APIs are gated according to logged-in user's roles.
+
+---
+
 # v0.2.15
 Published on: 2025-07-16T03:30:01Z
 

@@ -34,13 +34,12 @@ This data enables data-driven architectural decisions and performance optimizati
 
 **1. Deploy base k8s infrastructure:**
 ```bash
-cd ../k8s
+cd ../../docs/source/distributions/k8s
 ./apply.sh
 ```
 
 **2. Deploy benchmark components:**
 ```bash
-cd ../k8s-benchmark
 ./apply.sh
 ```
 

@@ -56,7 +55,6 @@ kubectl get pods
 
 **Benchmark Llama Stack (default):**
 ```bash
-cd docs/source/distributions/k8s-benchmark/
 ./run-benchmark.sh
 ```
 

@@ -14,7 +14,7 @@ import os
 import random
 import statistics
 import time
-from typing import Tuple
 
 import aiohttp
 

@@ -57,17 +57,9 @@ class BenchmarkStats:
         success_rate = (self.success_count / self.total_requests) * 100
 
         print(f"\n{'=' * 60}")
-        print(f"BENCHMARK RESULTS")
-        print(f"{'='*60}")
-        print(f"Total time: {total_time:.2f}s")
-        print(f"Concurrent users: {self.concurrent_users}")
-        print(f"Total requests: {self.total_requests}")
-        print(f"Successful requests: {self.success_count}")
-        print(f"Failed requests: {len(self.errors)}")
-        print(f"Success rate: {success_rate:.1f}%")
-        print(f"Requests per second: {self.success_count / total_time:.2f}")
+        print("BENCHMARK RESULTS")
 
-        print(f"\nResponse Time Statistics:")
+        print("\nResponse Time Statistics:")
         print(f"  Mean: {statistics.mean(self.response_times):.3f}s")
         print(f"  Median: {statistics.median(self.response_times):.3f}s")
         print(f"  Min: {min(self.response_times):.3f}s")

@@ -78,14 +70,14 @@ class BenchmarkStats:
 
         percentiles = [50, 90, 95, 99]
         sorted_times = sorted(self.response_times)
-        print(f"\nPercentiles:")
+        print("\nPercentiles:")
         for p in percentiles:
             idx = int(len(sorted_times) * p / 100) - 1
             idx = max(0, min(idx, len(sorted_times) - 1))
             print(f"  P{p}: {sorted_times[idx]:.3f}s")
 
         if self.ttft_times:
-            print(f"\nTime to First Token (TTFT) Statistics:")
+            print("\nTime to First Token (TTFT) Statistics:")
             print(f"  Mean: {statistics.mean(self.ttft_times):.3f}s")
             print(f"  Median: {statistics.median(self.ttft_times):.3f}s")
             print(f"  Min: {min(self.ttft_times):.3f}s")

@@ -95,26 +87,35 @@ class BenchmarkStats:
             print(f"  Std Dev: {statistics.stdev(self.ttft_times):.3f}s")
 
             sorted_ttft = sorted(self.ttft_times)
-            print(f"\nTTFT Percentiles:")
+            print("\nTTFT Percentiles:")
             for p in percentiles:
                 idx = int(len(sorted_ttft) * p / 100) - 1
                 idx = max(0, min(idx, len(sorted_ttft) - 1))
                 print(f"  P{p}: {sorted_ttft[idx]:.3f}s")
 
         if self.chunks_received:
-            print(f"\nStreaming Statistics:")
+            print("\nStreaming Statistics:")
             print(f"  Mean chunks per response: {statistics.mean(self.chunks_received):.1f}")
             print(f"  Total chunks received: {sum(self.chunks_received)}")
 
+        print(f"{'=' * 60}")
+        print(f"Total time: {total_time:.2f}s")
+        print(f"Concurrent users: {self.concurrent_users}")
+        print(f"Total requests: {self.total_requests}")
+        print(f"Successful requests: {self.success_count}")
+        print(f"Failed requests: {len(self.errors)}")
+        print(f"Success rate: {success_rate:.1f}%")
+        print(f"Requests per second: {self.success_count / total_time:.2f}")
 
         if self.errors:
-            print(f"\nErrors (showing first 5):")
+            print("\nErrors (showing first 5):")
             for error in self.errors[:5]:
                 print(f"  {error}")
 
 
 class LlamaStackBenchmark:
     def __init__(self, base_url: str, model_id: str):
-        self.base_url = base_url.rstrip('/')
+        self.base_url = base_url.rstrip("/")
         self.model_id = model_id
         self.headers = {"Content-Type": "application/json"}
         self.test_messages = [

@@ -125,20 +126,14 @@ class LlamaStackBenchmark:
             [
                 {"role": "user", "content": "What is machine learning?"},
                 {"role": "assistant", "content": "Machine learning is a subset of AI..."},
-                {"role": "user", "content": "Can you give me a practical example?"}
-            ]
+                {"role": "user", "content": "Can you give me a practical example?"},
+            ],
         ]
 
-    async def make_async_streaming_request(self) -> Tuple[float, int, float | None, str | None]:
+    async def make_async_streaming_request(self) -> tuple[float, int, float | None, str | None]:
         """Make a single async streaming chat completion request."""
         messages = random.choice(self.test_messages)
-        payload = {
-            "model": self.model_id,
-            "messages": messages,
-            "stream": True,
-            "max_tokens": 100
-        }
+        payload = {"model": self.model_id, "messages": messages, "stream": True, "max_tokens": 100}
 
         start_time = time.time()
         chunks_received = 0

@@ -152,17 +147,17 @@ class LlamaStackBenchmark:
                 f"{self.base_url}/chat/completions",
                 headers=self.headers,
                 json=payload,
-                timeout=aiohttp.ClientTimeout(total=30)
+                timeout=aiohttp.ClientTimeout(total=30),
             ) as response:
                 if response.status == 200:
                     async for line in response.content:
                         if line:
-                            line_str = line.decode('utf-8').strip()
-                            if line_str.startswith('data: '):
+                            line_str = line.decode("utf-8").strip()
+                            if line_str.startswith("data: "):
                                 chunks_received += 1
                                 if ttft is None:
                                     ttft = time.time() - start_time
-                                if line_str == 'data: [DONE]':
+                                if line_str == "data: [DONE]":
                                     break
 
                     if chunks_received == 0:

@@ -179,7 +174,6 @@ class LlamaStackBenchmark:
         response_time = time.time() - start_time
         return response_time, chunks_received, ttft, error
 
-
     async def run_benchmark(self, duration: int, concurrent_users: int) -> BenchmarkStats:
         """Run benchmark using async requests for specified duration."""
         stats = BenchmarkStats()

@@ -191,7 +185,7 @@ class LlamaStackBenchmark:
         print(f"Model: {self.model_id}")
 
         connector = aiohttp.TCPConnector(limit=concurrent_users)
-        async with aiohttp.ClientSession(connector=connector) as session:
+        async with aiohttp.ClientSession(connector=connector):
 
             async def worker(worker_id: int):
                 """Worker that sends requests sequentially until canceled."""

@@ -215,7 +209,9 @@ class LlamaStackBenchmark:
                     await asyncio.sleep(1)  # Report every second
                     if time.time() >= last_report_time + 10:  # Report every 10 seconds
                         elapsed = time.time() - stats.start_time
-                        print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s")
+                        print(
+                            f"Completed: {stats.total_requests} requests in {elapsed:.1f}s, RPS: {stats.total_requests / elapsed:.1f}"
+                        )
                         last_report_time = time.time()
                 except asyncio.CancelledError:
                     break

@@ -240,14 +236,16 @@
 
 def main():
     parser = argparse.ArgumentParser(description="Llama Stack Benchmark Tool")
-    parser.add_argument("--base-url", default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
-                        help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)")
-    parser.add_argument("--model", default=os.getenv("INFERENCE_MODEL", "test-model"),
-                        help="Model ID to use for requests")
-    parser.add_argument("--duration", type=int, default=60,
-                        help="Duration in seconds to run benchmark (default: 60)")
-    parser.add_argument("--concurrent", type=int, default=10,
-                        help="Number of concurrent users (default: 10)")
+    parser.add_argument(
+        "--base-url",
+        default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
+        help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)",
+    )
+    parser.add_argument(
+        "--model", default=os.getenv("INFERENCE_MODEL", "test-model"), help="Model ID to use for requests"
+    )
+    parser.add_argument("--duration", type=int, default=60, help="Duration in seconds to run benchmark (default: 60)")
+    parser.add_argument("--concurrent", type=int, default=10, help="Number of concurrent users (default: 10)")
 
     args = parser.parse_args()
 

@@ -11,16 +11,18 @@ OpenAI-compatible mock server that returns:
 - Valid OpenAI-formatted chat completion responses with dynamic content
 """
 
-from flask import Flask, request, jsonify, Response
-import time
-import random
-import uuid
-import json
 import argparse
+import json
 import os
+import random
+import time
+import uuid
+
+from flask import Flask, Response, jsonify, request
 
 app = Flask(__name__)
 
 
 # Models from environment variables
 def get_models():
     models_str = os.getenv("MOCK_MODELS", "meta-llama/Llama-3.2-3B-Instruct")

@@ -29,40 +31,72 @@ def get_models():
     return {
         "object": "list",
         "data": [
-            {
-                "id": model_id,
-                "object": "model",
-                "created": 1234567890,
-                "owned_by": "vllm"
-            }
-            for model_id in model_ids
-        ]
+            {"id": model_id, "object": "model", "created": 1234567890, "owned_by": "vllm"} for model_id in model_ids
+        ],
     }
 
 
 def generate_random_text(length=50):
     """Generate random but coherent text for responses."""
     words = [
-        "Hello", "there", "I'm", "an", "AI", "assistant", "ready", "to", "help", "you",
-        "with", "your", "questions", "and", "tasks", "today", "Let", "me","know", "what",
-        "you'd", "like", "to", "discuss", "or", "explore", "together", "I", "can", "assist",
-        "with", "various", "topics", "including", "coding", "writing", "analysis", "and", "more"
+        "Hello",
+        "there",
+        "I'm",
+        "an",
+        "AI",
+        "assistant",
+        "ready",
+        "to",
+        "help",
+        "you",
+        "with",
+        "your",
+        "questions",
+        "and",
+        "tasks",
+        "today",
+        "Let",
+        "me",
+        "know",
+        "what",
+        "you'd",
+        "like",
+        "to",
+        "discuss",
+        "or",
+        "explore",
+        "together",
+        "I",
+        "can",
+        "assist",
+        "with",
+        "various",
+        "topics",
+        "including",
+        "coding",
+        "writing",
+        "analysis",
+        "and",
+        "more",
     ]
     return " ".join(random.choices(words, k=length))
 
-@app.route('/v1/models', methods=['GET'])
+
+@app.route("/v1/models", methods=["GET"])
 def list_models():
     models = get_models()
     print(f"[MOCK] Returning models: {[m['id'] for m in models['data']]}")
     return jsonify(models)
 
-@app.route('/v1/chat/completions', methods=['POST'])
+
+@app.route("/v1/chat/completions", methods=["POST"])
 def chat_completions():
     """Return OpenAI-formatted chat completion responses."""
     data = request.get_json()
-    default_model = get_models()['data'][0]['id']
-    model = data.get('model', default_model)
-    messages = data.get('messages', [])
-    stream = data.get('stream', False)
+    default_model = get_models()["data"][0]["id"]
+    model = data.get("model", default_model)
+    messages = data.get("messages", [])
+    stream = data.get("stream", False)
 
     print(f"[MOCK] Chat completion request - model: {model}, stream: {stream}")
 

@@ -71,11 +105,12 @@ def chat_completions():
     else:
         return handle_non_streaming_completion(model, messages)
 
 
 def handle_non_streaming_completion(model, messages):
     response_text = generate_random_text(random.randint(20, 80))
 
     # Calculate realistic token counts
-    prompt_tokens = sum(len(str(msg.get('content', '')).split()) for msg in messages)
+    prompt_tokens = sum(len(str(msg.get("content", "")).split()) for msg in messages)
     completion_tokens = len(response_text.split())
 
     response = {

@@ -83,25 +118,17 @@ def handle_non_streaming_completion(model, messages):
         "object": "chat.completion",
         "created": int(time.time()),
         "model": model,
-        "choices": [
-            {
-                "index": 0,
-                "message": {
-                    "role": "assistant",
-                    "content": response_text
-                },
-                "finish_reason": "stop"
-            }
-        ],
+        "choices": [{"index": 0, "message": {"role": "assistant", "content": response_text}, "finish_reason": "stop"}],
         "usage": {
             "prompt_tokens": prompt_tokens,
             "completion_tokens": completion_tokens,
-            "total_tokens": prompt_tokens + completion_tokens
-        }
+            "total_tokens": prompt_tokens + completion_tokens,
+        },
     }
 
     return jsonify(response)
 
 
 def handle_streaming_completion(model, messages):
     def generate_stream():
         # Generate response text

@@ -114,12 +141,7 @@ def handle_streaming_completion(model, messages):
             "object": "chat.completion.chunk",
             "created": int(time.time()),
             "model": model,
-            "choices": [
-                {
-                    "index": 0,
-                    "delta": {"role": "assistant", "content": ""}
-                }
-            ]
+            "choices": [{"index": 0, "delta": {"role": "assistant", "content": ""}}],
         }
         yield f"data: {json.dumps(initial_chunk)}\n\n"
 

@@ -130,12 +152,7 @@ def handle_streaming_completion(model, messages):
                 "object": "chat.completion.chunk",
                 "created": int(time.time()),
                 "model": model,
-                "choices": [
-                    {
-                        "index": 0,
-                        "delta": {"content": f"{word} " if i < len(words) - 1 else word}
-                    }
-                ]
+                "choices": [{"index": 0, "delta": {"content": f"{word} " if i < len(words) - 1 else word}}],
             }
             yield f"data: {json.dumps(chunk)}\n\n"
             # Configurable delay to simulate realistic streaming

@@ -148,35 +165,30 @@ def handle_streaming_completion(model, messages):
             "object": "chat.completion.chunk",
             "created": int(time.time()),
             "model": model,
-            "choices": [
-                {
-                    "index": 0,
-                    "delta": {"content": ""},
-                    "finish_reason": "stop"
-                }
-            ]
+            "choices": [{"index": 0, "delta": {"content": ""}, "finish_reason": "stop"}],
         }
         yield f"data: {json.dumps(final_chunk)}\n\n"
         yield "data: [DONE]\n\n"
 
     return Response(
         generate_stream(),
-        mimetype='text/event-stream',
+        mimetype="text/event-stream",
         headers={
-            'Cache-Control': 'no-cache',
-            'Connection': 'keep-alive',
-            'Access-Control-Allow-Origin': '*',
-        }
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+            "Access-Control-Allow-Origin": "*",
+        },
     )
 
-@app.route('/health', methods=['GET'])
+
+@app.route("/health", methods=["GET"])
 def health():
     return jsonify({"status": "healthy", "type": "openai-mock"})
 
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='OpenAI-compatible mock server')
-    parser.add_argument('--port', type=int, default=8081,
-                        help='Port to run the server on (default: 8081)')
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="OpenAI-compatible mock server")
+    parser.add_argument("--port", type=int, default=8081, help="Port to run the server on (default: 8081)")
    args = parser.parse_args()
 
     port = args.port

@@ -187,4 +199,4 @@ if __name__ == '__main__':
     print("- OpenAI-formatted chat/completion responses with dynamic content")
     print("- Streaming support with valid SSE format")
     print(f"- Listening on: http://0.0.0.0:{port}")
-    app.run(host='0.0.0.0', port=port, debug=False)
+    app.run(host="0.0.0.0", port=port, debug=False)

@@ -6,6 +6,7 @@ data:
     apis:
     - agents
     - inference
+    - files
     - safety
     - telemetry
     - tool_runtime

@@ -19,13 +20,6 @@ data:
         max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
         api_token: ${env.VLLM_API_TOKEN:=fake}
         tls_verify: ${env.VLLM_TLS_VERIFY:=true}
-      - provider_id: vllm-safety
-        provider_type: remote::vllm
-        config:
-          url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
-          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
-          api_token: ${env.VLLM_API_TOKEN:=fake}
-          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
       - provider_id: sentence-transformers
         provider_type: inline::sentence-transformers
         config: {}

@@ -41,6 +35,14 @@ data:
           db: ${env.POSTGRES_DB:=llamastack}
           user: ${env.POSTGRES_USER:=llamastack}
           password: ${env.POSTGRES_PASSWORD:=llamastack}
+      files:
+      - provider_id: meta-reference-files
+        provider_type: inline::localfs
+        config:
+          storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
+          metadata_store:
+            type: sqlite
+            db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
       safety:
       - provider_id: llama-guard
         provider_type: inline::llama-guard

@@ -111,9 +113,6 @@ data:
     - model_id: ${env.INFERENCE_MODEL}
       provider_id: vllm-inference
      model_type: llm
-    - model_id: ${env.SAFETY_MODEL}
-      provider_id: vllm-safety
-      model_type: llm
     shields:
     - shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
     vector_dbs: []

|
||||||
image_name: kubernetes-benchmark-demo
|
image_name: kubernetes-benchmark-demo
|
||||||
apis:
|
apis:
|
||||||
- agents
|
- agents
|
||||||
|
- files
|
||||||
- inference
|
- inference
|
||||||
|
- files
|
||||||
|
- safety
|
||||||
- telemetry
|
- telemetry
|
||||||
- tool_runtime
|
- tool_runtime
|
||||||
- vector_io
|
- vector_io
|
||||||
|
@ -18,6 +21,14 @@ providers:
|
||||||
- provider_id: sentence-transformers
|
- provider_id: sentence-transformers
|
||||||
provider_type: inline::sentence-transformers
|
provider_type: inline::sentence-transformers
|
||||||
config: {}
|
config: {}
|
||||||
|
files:
|
||||||
|
- provider_id: meta-reference-files
|
||||||
|
provider_type: inline::localfs
|
||||||
|
config:
|
||||||
|
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
|
||||||
|
metadata_store:
|
||||||
|
type: sqlite
|
||||||
|
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
|
||||||
vector_io:
|
vector_io:
|
||||||
- provider_id: ${env.ENABLE_CHROMADB:+chromadb}
|
- provider_id: ${env.ENABLE_CHROMADB:+chromadb}
|
||||||
provider_type: remote::chromadb
|
provider_type: remote::chromadb
|
||||||
|
@ -30,6 +41,19 @@ providers:
|
||||||
db: ${env.POSTGRES_DB:=llamastack}
|
db: ${env.POSTGRES_DB:=llamastack}
|
||||||
user: ${env.POSTGRES_USER:=llamastack}
|
user: ${env.POSTGRES_USER:=llamastack}
|
||||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||||
|
files:
|
||||||
|
- provider_id: meta-reference-files
|
||||||
|
provider_type: inline::localfs
|
||||||
|
config:
|
||||||
|
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
|
||||||
|
metadata_store:
|
||||||
|
type: sqlite
|
||||||
|
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
|
||||||
|
safety:
|
||||||
|
- provider_id: llama-guard
|
||||||
|
provider_type: inline::llama-guard
|
||||||
|
config:
|
||||||
|
excluded_categories: []
|
||||||
agents:
|
agents:
|
||||||
- provider_id: meta-reference
|
- provider_id: meta-reference
|
||||||
provider_type: inline::meta-reference
|
provider_type: inline::meta-reference
|
||||||
|
@ -95,6 +119,8 @@ models:
|
||||||
- model_id: ${env.INFERENCE_MODEL}
|
- model_id: ${env.INFERENCE_MODEL}
|
||||||
provider_id: vllm-inference
|
provider_id: vllm-inference
|
||||||
model_type: llm
|
model_type: llm
|
||||||
|
shields:
|
||||||
|
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
|
||||||
vector_dbs: []
|
vector_dbs: []
|
||||||
datasets: []
|
datasets: []
|
||||||
scoring_fns: []
|
scoring_fns: []
|
101 docs/_static/css/my_theme.css vendored
@ -1,5 +1,106 @@
@import url("theme.css");

/* Horizontal Navigation Bar */
.horizontal-nav {
  background-color: #ffffff;
  border-bottom: 1px solid #e5e5e5;
  padding: 0;
  position: fixed;
  top: 0;
  left: 0;
  right: 0;
  z-index: 1050;
  height: 50px;
  box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}

[data-theme="dark"] .horizontal-nav {
  background-color: #1a1a1a;
  border-bottom: 1px solid #333;
}

.horizontal-nav .nav-container {
  max-width: 1200px;
  margin: 0 auto;
  display: flex;
  align-items: center;
  justify-content: space-between;
  padding: 0 20px;
  height: 100%;
}

.horizontal-nav .nav-brand {
  font-size: 18px;
  font-weight: 600;
  color: #333;
  text-decoration: none;
}

[data-theme="dark"] .horizontal-nav .nav-brand {
  color: #fff;
}

.horizontal-nav .nav-links {
  display: flex;
  align-items: center;
  gap: 30px;
  list-style: none;
  margin: 0;
  padding: 0;
}

.horizontal-nav .nav-links a {
  color: #666;
  text-decoration: none;
  font-size: 14px;
  font-weight: 500;
  padding: 8px 12px;
  border-radius: 6px;
  transition: all 0.2s ease;
}

.horizontal-nav .nav-links a:hover,
.horizontal-nav .nav-links a.active {
  color: #333;
  background-color: #f5f5f5;
}

.horizontal-nav .nav-links a.active {
  font-weight: 600;
}

[data-theme="dark"] .horizontal-nav .nav-links a {
  color: #ccc;
}

[data-theme="dark"] .horizontal-nav .nav-links a:hover,
[data-theme="dark"] .horizontal-nav .nav-links a.active {
  color: #fff;
  background-color: #333;
}

.horizontal-nav .nav-links .github-link {
  display: flex;
  align-items: center;
  gap: 6px;
}

.horizontal-nav .nav-links .github-icon {
  width: 16px;
  height: 16px;
  fill: currentColor;
}

/* Adjust main content to account for fixed nav */
.wy-nav-side {
  top: 50px;
  height: calc(100vh - 50px);
}

.wy-nav-content-wrap {
  margin-top: 50px;
}

.wy-nav-content {
  max-width: 90%;
}
44 docs/_static/js/horizontal_nav.js vendored Normal file
@ -0,0 +1,44 @@
// Horizontal Navigation Bar for Llama Stack Documentation
document.addEventListener('DOMContentLoaded', function() {
    // Create the horizontal navigation HTML
    const navHTML = `
        <nav class="horizontal-nav">
            <div class="nav-container">
                <a href="/" class="nav-brand">Llama Stack</a>
                <ul class="nav-links">
                    <li><a href="/">Docs</a></li>
                    <li><a href="/references/api_reference/">API Reference</a></li>
                    <li><a href="https://github.com/meta-llama/llama-stack" target="_blank" class="github-link">
                        <svg class="github-icon" viewBox="0 0 16 16" aria-hidden="true">
                            <path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0016 8c0-4.42-3.58-8-8-8z"/>
                        </svg>
                        GitHub
                    </a></li>
                </ul>
            </div>
        </nav>
    `;

    // Insert the navigation at the beginning of the body
    document.body.insertAdjacentHTML('afterbegin', navHTML);

    // Update navigation links based on current page
    updateActiveNav();
});

function updateActiveNav() {
    const currentPath = window.location.pathname;
    const navLinks = document.querySelectorAll('.horizontal-nav .nav-links a');

    navLinks.forEach(link => {
        // Remove any existing active classes
        link.classList.remove('active');

        // Add active class based on current path
        if (currentPath === '/' && link.getAttribute('href') === '/') {
            link.classList.add('active');
        } else if (currentPath.includes('/references/api_reference/') && link.getAttribute('href').includes('api_reference')) {
            link.classList.add('active');
        }
    });
}
457 docs/_static/llama-stack-spec.html vendored
@ -633,6 +633,80 @@
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"/v1/prompts": {
|
||||||
|
"get": {
|
||||||
|
"responses": {
|
||||||
|
"200": {
|
||||||
|
"description": "A ListPromptsResponse containing all prompts.",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/ListPromptsResponse"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"400": {
|
||||||
|
"$ref": "#/components/responses/BadRequest400"
|
||||||
|
},
|
||||||
|
"429": {
|
||||||
|
"$ref": "#/components/responses/TooManyRequests429"
|
||||||
|
},
|
||||||
|
"500": {
|
||||||
|
"$ref": "#/components/responses/InternalServerError500"
|
||||||
|
},
|
||||||
|
"default": {
|
||||||
|
"$ref": "#/components/responses/DefaultError"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tags": [
|
||||||
|
"Prompts"
|
||||||
|
],
|
||||||
|
"description": "List all prompts.",
|
||||||
|
"parameters": []
|
||||||
|
},
|
||||||
|
"post": {
|
||||||
|
"responses": {
|
||||||
|
"200": {
|
||||||
|
"description": "The created Prompt resource.",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/Prompt"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"400": {
|
||||||
|
"$ref": "#/components/responses/BadRequest400"
|
||||||
|
},
|
||||||
|
"429": {
|
||||||
|
"$ref": "#/components/responses/TooManyRequests429"
|
||||||
|
},
|
||||||
|
"500": {
|
||||||
|
"$ref": "#/components/responses/InternalServerError500"
|
||||||
|
},
|
||||||
|
"default": {
|
||||||
|
"$ref": "#/components/responses/DefaultError"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tags": [
|
||||||
|
"Prompts"
|
||||||
|
],
|
||||||
|
"description": "Create a new prompt.",
|
||||||
|
"parameters": [],
|
||||||
|
"requestBody": {
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/CreatePromptRequest"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
"/v1/agents/{agent_id}": {
|
"/v1/agents/{agent_id}": {
|
||||||
"get": {
|
"get": {
|
||||||
"responses": {
|
"responses": {
|
||||||
|
@ -901,6 +975,143 @@
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"/v1/prompts/{prompt_id}": {
|
||||||
|
"get": {
|
||||||
|
"responses": {
|
||||||
|
"200": {
|
||||||
|
"description": "A Prompt resource.",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/Prompt"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"400": {
|
||||||
|
"$ref": "#/components/responses/BadRequest400"
|
||||||
|
},
|
||||||
|
"429": {
|
||||||
|
"$ref": "#/components/responses/TooManyRequests429"
|
||||||
|
},
|
||||||
|
"500": {
|
||||||
|
"$ref": "#/components/responses/InternalServerError500"
|
||||||
|
},
|
||||||
|
"default": {
|
||||||
|
"$ref": "#/components/responses/DefaultError"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tags": [
|
||||||
|
"Prompts"
|
||||||
|
],
|
||||||
|
"description": "Get a prompt by its identifier and optional version.",
|
||||||
|
"parameters": [
|
||||||
|
{
|
||||||
|
"name": "prompt_id",
|
||||||
|
"in": "path",
|
||||||
|
"description": "The identifier of the prompt to get.",
|
||||||
|
"required": true,
|
||||||
|
"schema": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "version",
|
||||||
|
"in": "query",
|
||||||
|
"description": "The version of the prompt to get (defaults to latest).",
|
||||||
|
"required": false,
|
||||||
|
"schema": {
|
||||||
|
"type": "integer"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"post": {
|
||||||
|
"responses": {
|
||||||
|
"200": {
|
||||||
|
"description": "The updated Prompt resource with incremented version.",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/Prompt"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"400": {
|
||||||
|
"$ref": "#/components/responses/BadRequest400"
|
||||||
|
},
|
||||||
|
"429": {
|
||||||
|
"$ref": "#/components/responses/TooManyRequests429"
|
||||||
|
},
|
||||||
|
"500": {
|
||||||
|
"$ref": "#/components/responses/InternalServerError500"
|
||||||
|
},
|
||||||
|
"default": {
|
||||||
|
"$ref": "#/components/responses/DefaultError"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tags": [
|
||||||
|
"Prompts"
|
||||||
|
],
|
||||||
|
"description": "Update an existing prompt (increments version).",
|
||||||
|
"parameters": [
|
||||||
|
{
|
||||||
|
"name": "prompt_id",
|
||||||
|
"in": "path",
|
||||||
|
"description": "The identifier of the prompt to update.",
|
||||||
|
"required": true,
|
||||||
|
"schema": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"requestBody": {
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/UpdatePromptRequest"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": true
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"delete": {
|
||||||
|
"responses": {
|
||||||
|
"200": {
|
||||||
|
"description": "OK"
|
||||||
|
},
|
||||||
|
"400": {
|
||||||
|
"$ref": "#/components/responses/BadRequest400"
|
||||||
|
},
|
||||||
|
"429": {
|
||||||
|
"$ref": "#/components/responses/TooManyRequests429"
|
||||||
|
},
|
||||||
|
"500": {
|
||||||
|
"$ref": "#/components/responses/InternalServerError500"
|
||||||
|
},
|
||||||
|
"default": {
|
||||||
|
"$ref": "#/components/responses/DefaultError"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tags": [
|
||||||
|
"Prompts"
|
||||||
|
],
|
||||||
|
"description": "Delete a prompt.",
|
||||||
|
"parameters": [
|
||||||
|
{
|
||||||
|
"name": "prompt_id",
|
||||||
|
"in": "path",
|
||||||
|
"description": "The identifier of the prompt to delete.",
|
||||||
|
"required": true,
|
||||||
|
"schema": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
"/v1/inference/embeddings": {
|
"/v1/inference/embeddings": {
|
||||||
"post": {
|
"post": {
|
||||||
"responses": {
|
"responses": {
|
||||||
|
@ -2836,6 +3047,49 @@
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"/v1/prompts/{prompt_id}/versions": {
|
||||||
|
"get": {
|
||||||
|
"responses": {
|
||||||
|
"200": {
|
||||||
|
"description": "A ListPromptsResponse containing all versions of the prompt.",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/ListPromptsResponse"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"400": {
|
||||||
|
"$ref": "#/components/responses/BadRequest400"
|
||||||
|
},
|
||||||
|
"429": {
|
||||||
|
"$ref": "#/components/responses/TooManyRequests429"
|
||||||
|
},
|
||||||
|
"500": {
|
||||||
|
"$ref": "#/components/responses/InternalServerError500"
|
||||||
|
},
|
||||||
|
"default": {
|
||||||
|
"$ref": "#/components/responses/DefaultError"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tags": [
|
||||||
|
"Prompts"
|
||||||
|
],
|
||||||
|
"description": "List all versions of a specific prompt.",
|
||||||
|
"parameters": [
|
||||||
|
{
|
||||||
|
"name": "prompt_id",
|
||||||
|
"in": "path",
|
||||||
|
"description": "The identifier of the prompt to list versions for.",
|
||||||
|
"required": true,
|
||||||
|
"schema": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
"/v1/providers": {
|
"/v1/providers": {
|
||||||
"get": {
|
"get": {
|
||||||
"responses": {
|
"responses": {
|
||||||
|
@ -5007,6 +5261,59 @@
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"/v1/prompts/{prompt_id}/set-default-version": {
|
||||||
|
"post": {
|
||||||
|
"responses": {
|
||||||
|
"200": {
|
||||||
|
"description": "The prompt with the specified version now set as default.",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/Prompt"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"400": {
|
||||||
|
"$ref": "#/components/responses/BadRequest400"
|
||||||
|
},
|
||||||
|
"429": {
|
||||||
|
"$ref": "#/components/responses/TooManyRequests429"
|
||||||
|
},
|
||||||
|
"500": {
|
||||||
|
"$ref": "#/components/responses/InternalServerError500"
|
||||||
|
},
|
||||||
|
"default": {
|
||||||
|
"$ref": "#/components/responses/DefaultError"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tags": [
|
||||||
|
"Prompts"
|
||||||
|
],
|
||||||
|
"description": "Set which version of a prompt should be the default in get_prompt (latest).",
|
||||||
|
"parameters": [
|
||||||
|
{
|
||||||
|
"name": "prompt_id",
|
||||||
|
"in": "path",
|
||||||
|
"description": "The identifier of the prompt.",
|
||||||
|
"required": true,
|
||||||
|
"schema": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"requestBody": {
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/SetDefaultVersionRequest"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
"/v1/post-training/supervised-fine-tune": {
|
"/v1/post-training/supervised-fine-tune": {
|
||||||
"post": {
|
"post": {
|
||||||
"responses": {
|
"responses": {
|
||||||
|
@ -9670,6 +9977,65 @@
|
||||||
],
|
],
|
||||||
"title": "OpenAIResponseObjectStreamResponseWebSearchCallSearching"
|
"title": "OpenAIResponseObjectStreamResponseWebSearchCallSearching"
|
||||||
},
|
},
|
||||||
|
"CreatePromptRequest": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"prompt": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The prompt text content with variable placeholders."
|
||||||
|
},
|
||||||
|
"variables": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"description": "List of variable names that can be used in the prompt template."
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"prompt"
|
||||||
|
],
|
||||||
|
"title": "CreatePromptRequest"
|
||||||
|
},
|
||||||
|
"Prompt": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"prompt": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The system prompt text with variable placeholders. Variables are only supported when using the Responses API."
|
||||||
|
},
|
||||||
|
"version": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "Version (integer starting at 1, incremented on save)"
|
||||||
|
},
|
||||||
|
"prompt_id": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Unique identifier formatted as 'pmpt_<48-digit-hash>'"
|
||||||
|
},
|
||||||
|
"variables": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"description": "List of prompt variable names that can be used in the prompt template"
|
||||||
|
},
|
||||||
|
"is_default": {
|
||||||
|
"type": "boolean",
|
||||||
|
"default": false,
|
||||||
|
"description": "Boolean indicating whether this version is the default version for this prompt"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"version",
|
||||||
|
"prompt_id",
|
||||||
|
"variables",
|
||||||
|
"is_default"
|
||||||
|
],
|
||||||
|
"title": "Prompt",
|
||||||
|
"description": "A prompt resource representing a stored OpenAI Compatible prompt template in Llama Stack."
|
||||||
|
},
|
||||||
"OpenAIDeleteResponseObject": {
|
"OpenAIDeleteResponseObject": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
|
@ -10296,7 +10662,8 @@
|
||||||
"scoring_function",
|
"scoring_function",
|
||||||
"benchmark",
|
"benchmark",
|
||||||
"tool",
|
"tool",
|
||||||
"tool_group"
|
"tool_group",
|
||||||
|
"prompt"
|
||||||
],
|
],
|
||||||
"const": "benchmark",
|
"const": "benchmark",
|
||||||
"default": "benchmark",
|
"default": "benchmark",
|
||||||
|
@ -10923,7 +11290,8 @@
|
||||||
"scoring_function",
|
"scoring_function",
|
||||||
"benchmark",
|
"benchmark",
|
||||||
"tool",
|
"tool",
|
||||||
"tool_group"
|
"tool_group",
|
||||||
|
"prompt"
|
||||||
],
|
],
|
||||||
"const": "dataset",
|
"const": "dataset",
|
||||||
"default": "dataset",
|
"default": "dataset",
|
||||||
|
@ -11073,7 +11441,8 @@
|
||||||
"scoring_function",
|
"scoring_function",
|
||||||
"benchmark",
|
"benchmark",
|
||||||
"tool",
|
"tool",
|
||||||
"tool_group"
|
"tool_group",
|
||||||
|
"prompt"
|
||||||
],
|
],
|
||||||
"const": "model",
|
"const": "model",
|
||||||
"default": "model",
|
"default": "model",
|
||||||
|
@ -11338,7 +11707,8 @@
|
||||||
"scoring_function",
|
"scoring_function",
|
||||||
"benchmark",
|
"benchmark",
|
||||||
"tool",
|
"tool",
|
||||||
"tool_group"
|
"tool_group",
|
||||||
|
"prompt"
|
||||||
],
|
],
|
||||||
"const": "scoring_function",
|
"const": "scoring_function",
|
||||||
"default": "scoring_function",
|
"default": "scoring_function",
|
||||||
|
@ -11446,7 +11816,8 @@
|
||||||
"scoring_function",
|
"scoring_function",
|
||||||
"benchmark",
|
"benchmark",
|
||||||
"tool",
|
"tool",
|
||||||
"tool_group"
|
"tool_group",
|
||||||
|
"prompt"
|
||||||
],
|
],
|
||||||
"const": "shield",
|
"const": "shield",
|
||||||
"default": "shield",
|
"default": "shield",
|
||||||
|
@ -11691,7 +12062,8 @@
|
||||||
"scoring_function",
|
"scoring_function",
|
||||||
"benchmark",
|
"benchmark",
|
||||||
"tool",
|
"tool",
|
||||||
"tool_group"
|
"tool_group",
|
||||||
|
"prompt"
|
||||||
],
|
],
|
||||||
"const": "tool",
|
"const": "tool",
|
||||||
"default": "tool",
|
"default": "tool",
|
||||||
|
@ -11773,7 +12145,8 @@
|
||||||
"scoring_function",
|
"scoring_function",
|
||||||
"benchmark",
|
"benchmark",
|
||||||
"tool",
|
"tool",
|
||||||
"tool_group"
|
"tool_group",
|
||||||
|
"prompt"
|
||||||
],
|
],
|
||||||
"const": "tool_group",
|
"const": "tool_group",
|
||||||
"default": "tool_group",
|
"default": "tool_group",
|
||||||
|
@ -12067,7 +12440,8 @@
|
||||||
"scoring_function",
|
"scoring_function",
|
||||||
"benchmark",
|
"benchmark",
|
||||||
"tool",
|
"tool",
|
||||||
"tool_group"
|
"tool_group",
|
||||||
|
"prompt"
|
||||||
],
|
],
|
||||||
"const": "vector_db",
|
"const": "vector_db",
|
||||||
"default": "vector_db",
|
"default": "vector_db",
|
||||||
|
@ -12882,6 +13256,23 @@
|
||||||
"title": "OpenAIResponseObjectWithInput",
|
"title": "OpenAIResponseObjectWithInput",
|
||||||
"description": "OpenAI response object extended with input context information."
|
"description": "OpenAI response object extended with input context information."
|
||||||
},
|
},
|
||||||
|
"ListPromptsResponse": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"data": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/Prompt"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"data"
|
||||||
|
],
|
||||||
|
"title": "ListPromptsResponse",
|
||||||
|
"description": "Response model to list prompts."
|
||||||
|
},
|
||||||
"ListProvidersResponse": {
|
"ListProvidersResponse": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
|
@ -17129,6 +17520,20 @@
|
||||||
"title": "ScoreBatchResponse",
|
"title": "ScoreBatchResponse",
|
||||||
"description": "Response from batch scoring operations on datasets."
|
"description": "Response from batch scoring operations on datasets."
|
||||||
},
|
},
|
||||||
|
"SetDefaultVersionRequest": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"version": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "The version to set as default."
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"version"
|
||||||
|
],
|
||||||
|
"title": "SetDefaultVersionRequest"
|
||||||
|
},
|
||||||
"AlgorithmConfig": {
|
"AlgorithmConfig": {
|
||||||
"oneOf": [
|
"oneOf": [
|
||||||
{
|
{
|
||||||
|
@ -17413,6 +17818,37 @@
|
||||||
"title": "SyntheticDataGenerationResponse",
|
"title": "SyntheticDataGenerationResponse",
|
||||||
"description": "Response from the synthetic data generation. Batch of (prompt, response, score) tuples that pass the threshold."
|
"description": "Response from the synthetic data generation. Batch of (prompt, response, score) tuples that pass the threshold."
|
||||||
},
|
},
|
||||||
|
"UpdatePromptRequest": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"prompt": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The updated prompt text content."
|
||||||
|
},
|
||||||
|
"version": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "The current version of the prompt being updated."
|
||||||
|
},
|
||||||
|
"variables": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"description": "Updated list of variable names that can be used in the prompt template."
|
||||||
|
},
|
||||||
|
"set_as_default": {
|
||||||
|
"type": "boolean",
|
||||||
|
"description": "Set the new version as the default (default=True)."
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"prompt",
|
||||||
|
"version",
|
||||||
|
"set_as_default"
|
||||||
|
],
|
||||||
|
"title": "UpdatePromptRequest"
|
||||||
|
},
|
||||||
"VersionInfo": {
|
"VersionInfo": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
|
@ -17538,6 +17974,10 @@
|
||||||
{
|
{
|
||||||
"name": "PostTraining (Coming Soon)"
|
"name": "PostTraining (Coming Soon)"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"name": "Prompts",
|
||||||
|
"x-displayName": "Protocol for prompt management operations."
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"name": "Providers",
|
"name": "Providers",
|
||||||
"x-displayName": "Providers API for inspecting, listing, and modifying providers and their configurations."
|
"x-displayName": "Providers API for inspecting, listing, and modifying providers and their configurations."
|
||||||
|
@ -17588,6 +18028,7 @@
|
||||||
"Inspect",
|
"Inspect",
|
||||||
"Models",
|
"Models",
|
||||||
"PostTraining (Coming Soon)",
|
"PostTraining (Coming Soon)",
|
||||||
|
"Prompts",
|
||||||
"Providers",
|
"Providers",
|
||||||
"Safety",
|
"Safety",
|
||||||
"Scoring",
|
"Scoring",
|
||||||
|
|
332 docs/_static/llama-stack-spec.yaml vendored
@ -427,6 +427,58 @@ paths:
|
||||||
schema:
|
schema:
|
||||||
$ref: '#/components/schemas/CreateOpenaiResponseRequest'
|
$ref: '#/components/schemas/CreateOpenaiResponseRequest'
|
||||||
required: true
|
required: true
|
||||||
|
/v1/prompts:
|
||||||
|
get:
|
||||||
|
responses:
|
||||||
|
'200':
|
||||||
|
description: >-
|
||||||
|
A ListPromptsResponse containing all prompts.
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/ListPromptsResponse'
|
||||||
|
'400':
|
||||||
|
$ref: '#/components/responses/BadRequest400'
|
||||||
|
'429':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/TooManyRequests429
|
||||||
|
'500':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/InternalServerError500
|
||||||
|
default:
|
||||||
|
$ref: '#/components/responses/DefaultError'
|
||||||
|
tags:
|
||||||
|
- Prompts
|
||||||
|
description: List all prompts.
|
||||||
|
parameters: []
|
||||||
|
post:
|
||||||
|
responses:
|
||||||
|
'200':
|
||||||
|
description: The created Prompt resource.
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/Prompt'
|
||||||
|
'400':
|
||||||
|
$ref: '#/components/responses/BadRequest400'
|
||||||
|
'429':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/TooManyRequests429
|
||||||
|
'500':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/InternalServerError500
|
||||||
|
default:
|
||||||
|
$ref: '#/components/responses/DefaultError'
|
||||||
|
tags:
|
||||||
|
- Prompts
|
||||||
|
description: Create a new prompt.
|
||||||
|
parameters: []
|
||||||
|
requestBody:
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/CreatePromptRequest'
|
||||||
|
required: true
|
||||||
/v1/agents/{agent_id}:
|
/v1/agents/{agent_id}:
|
||||||
get:
|
get:
|
||||||
responses:
|
responses:
|
||||||
|
@ -616,6 +668,103 @@ paths:
|
||||||
required: true
|
required: true
|
||||||
schema:
|
schema:
|
||||||
type: string
|
type: string
|
||||||
|
/v1/prompts/{prompt_id}:
|
||||||
|
get:
|
||||||
|
responses:
|
||||||
|
'200':
|
||||||
|
description: A Prompt resource.
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/Prompt'
|
||||||
|
'400':
|
||||||
|
$ref: '#/components/responses/BadRequest400'
|
||||||
|
'429':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/TooManyRequests429
|
||||||
|
'500':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/InternalServerError500
|
||||||
|
default:
|
||||||
|
$ref: '#/components/responses/DefaultError'
|
||||||
|
tags:
|
||||||
|
- Prompts
|
||||||
|
description: >-
|
||||||
|
Get a prompt by its identifier and optional version.
|
||||||
|
parameters:
|
||||||
|
- name: prompt_id
|
||||||
|
in: path
|
||||||
|
description: The identifier of the prompt to get.
|
||||||
|
required: true
|
||||||
|
schema:
|
||||||
|
type: string
|
||||||
|
- name: version
|
||||||
|
in: query
|
||||||
|
description: >-
|
||||||
|
The version of the prompt to get (defaults to latest).
|
||||||
|
required: false
|
||||||
|
schema:
|
||||||
|
type: integer
|
||||||
|
post:
|
||||||
|
responses:
|
||||||
|
'200':
|
||||||
|
description: >-
|
||||||
|
The updated Prompt resource with incremented version.
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/Prompt'
|
||||||
|
'400':
|
||||||
|
$ref: '#/components/responses/BadRequest400'
|
||||||
|
'429':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/TooManyRequests429
|
||||||
|
'500':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/InternalServerError500
|
||||||
|
default:
|
||||||
|
$ref: '#/components/responses/DefaultError'
|
||||||
|
tags:
|
||||||
|
- Prompts
|
||||||
|
description: >-
|
||||||
|
Update an existing prompt (increments version).
|
||||||
|
parameters:
|
||||||
|
- name: prompt_id
|
||||||
|
in: path
|
||||||
|
description: The identifier of the prompt to update.
|
||||||
|
required: true
|
||||||
|
schema:
|
||||||
|
type: string
|
||||||
|
requestBody:
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/UpdatePromptRequest'
|
||||||
|
required: true
|
||||||
|
delete:
|
||||||
|
responses:
|
||||||
|
'200':
|
||||||
|
description: OK
|
||||||
|
'400':
|
||||||
|
$ref: '#/components/responses/BadRequest400'
|
||||||
|
'429':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/TooManyRequests429
|
||||||
|
'500':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/InternalServerError500
|
||||||
|
default:
|
||||||
|
$ref: '#/components/responses/DefaultError'
|
||||||
|
tags:
|
||||||
|
- Prompts
|
||||||
|
description: Delete a prompt.
|
||||||
|
parameters:
|
||||||
|
- name: prompt_id
|
||||||
|
in: path
|
||||||
|
description: The identifier of the prompt to delete.
|
||||||
|
required: true
|
||||||
|
schema:
|
||||||
|
type: string
|
||||||
/v1/inference/embeddings:
|
/v1/inference/embeddings:
|
||||||
post:
|
post:
|
||||||
responses:
|
responses:
|
||||||
|
@ -1983,6 +2132,37 @@ paths:
|
||||||
required: false
|
required: false
|
||||||
schema:
|
schema:
|
||||||
$ref: '#/components/schemas/Order'
|
$ref: '#/components/schemas/Order'
|
||||||
|
/v1/prompts/{prompt_id}/versions:
|
||||||
|
get:
|
||||||
|
responses:
|
||||||
|
'200':
|
||||||
|
description: >-
|
||||||
|
A ListPromptsResponse containing all versions of the prompt.
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/ListPromptsResponse'
|
||||||
|
'400':
|
||||||
|
$ref: '#/components/responses/BadRequest400'
|
||||||
|
'429':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/TooManyRequests429
|
||||||
|
'500':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/InternalServerError500
|
||||||
|
default:
|
||||||
|
$ref: '#/components/responses/DefaultError'
|
||||||
|
tags:
|
||||||
|
- Prompts
|
||||||
|
description: List all versions of a specific prompt.
|
||||||
|
parameters:
|
||||||
|
- name: prompt_id
|
||||||
|
in: path
|
||||||
|
description: >-
|
||||||
|
The identifier of the prompt to list versions for.
|
||||||
|
required: true
|
||||||
|
schema:
|
||||||
|
type: string
|
||||||
/v1/providers:
|
/v1/providers:
|
||||||
get:
|
get:
|
||||||
responses:
|
responses:
|
||||||
|
@ -3546,6 +3726,43 @@ paths:
|
||||||
schema:
|
schema:
|
||||||
$ref: '#/components/schemas/ScoreBatchRequest'
|
$ref: '#/components/schemas/ScoreBatchRequest'
|
||||||
required: true
|
required: true
|
||||||
|
/v1/prompts/{prompt_id}/set-default-version:
|
||||||
|
post:
|
||||||
|
responses:
|
||||||
|
'200':
|
||||||
|
description: >-
|
||||||
|
The prompt with the specified version now set as default.
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/Prompt'
|
||||||
|
'400':
|
||||||
|
$ref: '#/components/responses/BadRequest400'
|
||||||
|
'429':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/TooManyRequests429
|
||||||
|
'500':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/InternalServerError500
|
||||||
|
default:
|
||||||
|
$ref: '#/components/responses/DefaultError'
|
||||||
|
tags:
|
||||||
|
- Prompts
|
||||||
|
description: >-
|
||||||
|
Set which version of a prompt should be the default in get_prompt (latest).
|
||||||
|
parameters:
|
||||||
|
- name: prompt_id
|
||||||
|
in: path
|
||||||
|
description: The identifier of the prompt.
|
||||||
|
required: true
|
||||||
|
schema:
|
||||||
|
type: string
|
||||||
|
requestBody:
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/SetDefaultVersionRequest'
|
||||||
|
required: true
|
||||||
/v1/post-training/supervised-fine-tune:
|
/v1/post-training/supervised-fine-tune:
|
||||||
post:
|
post:
|
||||||
responses:
|
responses:
|
||||||
|
@ -7148,6 +7365,61 @@ components:
|
||||||
- type
|
- type
|
||||||
title: >-
|
title: >-
|
||||||
OpenAIResponseObjectStreamResponseWebSearchCallSearching
|
OpenAIResponseObjectStreamResponseWebSearchCallSearching
|
||||||
|
CreatePromptRequest:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
prompt:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
The prompt text content with variable placeholders.
|
||||||
|
variables:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
List of variable names that can be used in the prompt template.
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- prompt
|
||||||
|
title: CreatePromptRequest
|
||||||
|
Prompt:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
prompt:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
The system prompt text with variable placeholders. Variables are only
|
||||||
|
supported when using the Responses API.
|
||||||
|
version:
|
||||||
|
type: integer
|
||||||
|
description: >-
|
||||||
|
Version (integer starting at 1, incremented on save)
|
||||||
|
prompt_id:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
Unique identifier formatted as 'pmpt_<48-digit-hash>'
|
||||||
|
variables:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
List of prompt variable names that can be used in the prompt template
|
||||||
|
is_default:
|
||||||
|
type: boolean
|
||||||
|
default: false
|
||||||
|
description: >-
|
||||||
|
Boolean indicating whether this version is the default version for this
|
||||||
|
prompt
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- version
|
||||||
|
- prompt_id
|
||||||
|
- variables
|
||||||
|
- is_default
|
||||||
|
title: Prompt
|
||||||
|
description: >-
|
||||||
|
A prompt resource representing a stored OpenAI Compatible prompt template
|
||||||
|
in Llama Stack.
|
||||||
OpenAIDeleteResponseObject:
|
OpenAIDeleteResponseObject:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
|
@ -7621,6 +7893,7 @@ components:
|
||||||
- benchmark
|
- benchmark
|
||||||
- tool
|
- tool
|
||||||
- tool_group
|
- tool_group
|
||||||
|
- prompt
|
||||||
const: benchmark
|
const: benchmark
|
||||||
default: benchmark
|
default: benchmark
|
||||||
description: The resource type, always benchmark
|
description: The resource type, always benchmark
|
||||||
|
@ -8107,6 +8380,7 @@ components:
|
||||||
- benchmark
|
- benchmark
|
||||||
- tool
|
- tool
|
||||||
- tool_group
|
- tool_group
|
||||||
|
- prompt
|
||||||
const: dataset
|
const: dataset
|
||||||
default: dataset
|
default: dataset
|
||||||
description: >-
|
description: >-
|
||||||
|
@ -8219,6 +8493,7 @@ components:
|
||||||
- benchmark
|
- benchmark
|
||||||
- tool
|
- tool
|
||||||
- tool_group
|
- tool_group
|
||||||
|
- prompt
|
||||||
const: model
|
const: model
|
||||||
default: model
|
default: model
|
||||||
description: >-
|
description: >-
|
||||||
|
@ -8410,6 +8685,7 @@ components:
|
||||||
- benchmark
|
- benchmark
|
||||||
- tool
|
- tool
|
||||||
- tool_group
|
- tool_group
|
||||||
|
- prompt
|
||||||
const: scoring_function
|
const: scoring_function
|
||||||
default: scoring_function
|
default: scoring_function
|
||||||
description: >-
|
description: >-
|
||||||
|
@ -8486,6 +8762,7 @@ components:
|
||||||
- benchmark
|
- benchmark
|
||||||
- tool
|
- tool
|
||||||
- tool_group
|
- tool_group
|
||||||
|
- prompt
|
||||||
const: shield
|
const: shield
|
||||||
default: shield
|
default: shield
|
||||||
description: The resource type, always shield
|
description: The resource type, always shield
|
||||||
|
@ -8665,6 +8942,7 @@ components:
|
||||||
- benchmark
|
- benchmark
|
||||||
- tool
|
- tool
|
||||||
- tool_group
|
- tool_group
|
||||||
|
- prompt
|
||||||
const: tool
|
const: tool
|
||||||
default: tool
|
default: tool
|
||||||
description: Type of resource, always 'tool'
|
description: Type of resource, always 'tool'
|
||||||
|
@ -8723,6 +9001,7 @@ components:
|
||||||
- benchmark
|
- benchmark
|
||||||
- tool
|
- tool
|
||||||
- tool_group
|
- tool_group
|
||||||
|
- prompt
|
||||||
const: tool_group
|
const: tool_group
|
||||||
default: tool_group
|
default: tool_group
|
||||||
description: Type of resource, always 'tool_group'
|
description: Type of resource, always 'tool_group'
|
||||||
|
@ -8951,6 +9230,7 @@ components:
|
||||||
- benchmark
|
- benchmark
|
||||||
- tool
|
- tool
|
||||||
- tool_group
|
- tool_group
|
||||||
|
- prompt
|
||||||
const: vector_db
|
const: vector_db
|
||||||
default: vector_db
|
default: vector_db
|
||||||
description: >-
|
description: >-
|
||||||
|
@ -9577,6 +9857,18 @@ components:
|
||||||
title: OpenAIResponseObjectWithInput
|
title: OpenAIResponseObjectWithInput
|
||||||
description: >-
|
description: >-
|
||||||
OpenAI response object extended with input context information.
|
OpenAI response object extended with input context information.
|
||||||
|
ListPromptsResponse:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
data:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/Prompt'
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- data
|
||||||
|
title: ListPromptsResponse
|
||||||
|
description: Response model to list prompts.
|
||||||
ListProvidersResponse:
|
ListProvidersResponse:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
|
@ -12723,6 +13015,16 @@ components:
|
||||||
title: ScoreBatchResponse
|
title: ScoreBatchResponse
|
||||||
description: >-
|
description: >-
|
||||||
Response from batch scoring operations on datasets.
|
Response from batch scoring operations on datasets.
|
||||||
|
SetDefaultVersionRequest:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
version:
|
||||||
|
type: integer
|
||||||
|
description: The version to set as default.
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- version
|
||||||
|
title: SetDefaultVersionRequest
|
||||||
AlgorithmConfig:
|
AlgorithmConfig:
|
||||||
oneOf:
|
oneOf:
|
||||||
- $ref: '#/components/schemas/LoraFinetuningConfig'
|
- $ref: '#/components/schemas/LoraFinetuningConfig'
|
||||||
|
@ -12919,6 +13221,32 @@ components:
|
||||||
description: >-
|
description: >-
|
||||||
Response from the synthetic data generation. Batch of (prompt, response, score)
|
Response from the synthetic data generation. Batch of (prompt, response, score)
|
||||||
tuples that pass the threshold.
|
tuples that pass the threshold.
|
||||||
|
UpdatePromptRequest:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
prompt:
|
||||||
|
type: string
|
||||||
|
description: The updated prompt text content.
|
||||||
|
version:
|
||||||
|
type: integer
|
||||||
|
description: >-
|
||||||
|
The current version of the prompt being updated.
|
||||||
|
variables:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
Updated list of variable names that can be used in the prompt template.
|
||||||
|
set_as_default:
|
||||||
|
type: boolean
|
||||||
|
description: >-
|
||||||
|
Set the new version as the default (default=True).
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- prompt
|
||||||
|
- version
|
||||||
|
- set_as_default
|
||||||
|
title: UpdatePromptRequest
|
||||||
VersionInfo:
|
VersionInfo:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
|
@ -13030,6 +13358,9 @@ tags:
|
||||||
- name: Inspect
|
- name: Inspect
|
||||||
- name: Models
|
- name: Models
|
||||||
- name: PostTraining (Coming Soon)
|
- name: PostTraining (Coming Soon)
|
||||||
|
- name: Prompts
|
||||||
|
x-displayName: >-
|
||||||
|
Protocol for prompt management operations.
|
||||||
- name: Providers
|
- name: Providers
|
||||||
x-displayName: >-
|
x-displayName: >-
|
||||||
Providers API for inspecting, listing, and modifying providers and their configurations.
|
Providers API for inspecting, listing, and modifying providers and their configurations.
|
||||||
|
@ -13057,6 +13388,7 @@ x-tagGroups:
|
||||||
- Inspect
|
- Inspect
|
||||||
- Models
|
- Models
|
||||||
- PostTraining (Coming Soon)
|
- PostTraining (Coming Soon)
|
||||||
|
- Prompts
|
||||||
- Providers
|
- Providers
|
||||||
- Safety
|
- Safety
|
||||||
- Scoring
|
- Scoring
|
||||||
|
|
|
@ -131,6 +131,7 @@ html_static_path = ["../_static"]
def setup(app):
    app.add_css_file("css/my_theme.css")
    app.add_js_file("js/detect_theme.js")
    app.add_js_file("js/horizontal_nav.js")
    app.add_js_file("js/keyboard_shortcuts.js")


def dockerhub_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
@ -35,5 +35,5 @@ testing/record-replay

### Benchmarking

```{include} ../../../benchmarking/k8s-benchmark/README.md
```
@ -40,18 +40,15 @@ The system patches OpenAI and Ollama client methods to intercept calls before th

### Storage Architecture

Recordings are stored as JSON files in the recording directory. They are looked up by their request hash.

```
recordings/
└── responses/
    ├── abc123def456.json  # Individual response files
    └── def789ghi012.json
```

**JSON files** store complete request/response pairs in human-readable format for debugging.

## Recording Modes

@ -166,8 +163,8 @@ This preserves type safety - when replayed, you get the same Pydantic objects wi

Control recording behavior globally:

```bash
export LLAMA_STACK_TEST_INFERENCE_MODE=replay  # this is the default
export LLAMA_STACK_TEST_RECORDING_DIR=/path/to/recordings  # default is tests/integration/recordings
pytest tests/integration/
```
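A minimal sketch of the complementary flow, assuming the same variable also accepts a `record` value as the record/replay naming in this section suggests (treat the exact value as an assumption):

```bash
# Illustrative only: switch from replay to recording and regenerate fixtures.
# The `record` value is assumed from the mode names used in this section.
export LLAMA_STACK_TEST_INFERENCE_MODE=record
export LLAMA_STACK_TEST_RECORDING_DIR=/path/to/recordings
pytest tests/integration/
```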
@ -354,6 +354,47 @@ You can easily validate a request by running:
curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers
```

#### Kubernetes Authentication Provider

The server can be configured to use the Kubernetes SelfSubjectReview API to validate tokens directly against the Kubernetes API server:

```yaml
server:
  auth:
    provider_config:
      type: "kubernetes"
      api_server_url: "https://kubernetes.default.svc"
      claims_mapping:
        username: "roles"
        groups: "roles"
        uid: "uid_attr"
      verify_tls: true
      tls_cafile: "/path/to/ca.crt"
```

Configuration options:
- `api_server_url`: The Kubernetes API server URL (e.g., https://kubernetes.default.svc:6443)
- `verify_tls`: Whether to verify TLS certificates (default: true)
- `tls_cafile`: Path to the CA certificate file for TLS verification
- `claims_mapping`: Mapping of Kubernetes user claims to access attributes

The provider validates tokens by sending a SelfSubjectReview request to the Kubernetes API server at `/apis/authentication.k8s.io/v1/selfsubjectreviews` and extracts user information from the response:
- Username from the `userInfo.username` field
- Groups from the `userInfo.groups` field
- UID from the `userInfo.uid` field

To obtain a token for testing:
```bash
kubectl create namespace llama-stack
kubectl create serviceaccount llama-stack-auth -n llama-stack
kubectl create token llama-stack-auth -n llama-stack > llama-stack-auth-token
```

You can validate a request by running:
```bash
curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers
```
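For orientation, here is a hedged sketch of the kind of SelfSubjectReview call the provider issues under the hood. The request shape follows the standard `authentication.k8s.io/v1` API; the exact fields Llama Stack sends are an assumption in this illustration:

```bash
# Illustrative only: ask the API server to identify the bearer of this token.
curl -s -X POST \
  -H "Authorization: Bearer $(cat llama-stack-auth-token)" \
  -H "Content-Type: application/json" \
  -d '{"apiVersion": "authentication.k8s.io/v1", "kind": "SelfSubjectReview"}' \
  https://kubernetes.default.svc/apis/authentication.k8s.io/v1/selfsubjectreviews
# The response's status.userInfo carries username, groups, and uid, which is the
# information the claims_mapping above translates into access attributes.
```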
#### GitHub Token Provider

Validates GitHub personal access tokens or OAuth tokens directly:
```yaml
@ -1,137 +1,55 @@
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
data:
|
data:
|
||||||
stack_run_config.yaml: |
|
stack_run_config.yaml: "version: '2'\nimage_name: kubernetes-demo\napis:\n- agents\n-
|
||||||
version: '2'
|
inference\n- files\n- safety\n- telemetry\n- tool_runtime\n- vector_io\nproviders:\n
|
||||||
image_name: kubernetes-demo
|
\ inference:\n - provider_id: vllm-inference\n provider_type: remote::vllm\n
|
||||||
apis:
|
\ config:\n url: ${env.VLLM_URL:=http://localhost:8000/v1}\n max_tokens:
|
||||||
- agents
|
${env.VLLM_MAX_TOKENS:=4096}\n api_token: ${env.VLLM_API_TOKEN:=fake}\n tls_verify:
|
||||||
- inference
|
${env.VLLM_TLS_VERIFY:=true}\n - provider_id: vllm-safety\n provider_type:
|
||||||
- safety
|
remote::vllm\n config:\n url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}\n
|
||||||
- telemetry
|
\ max_tokens: ${env.VLLM_MAX_TOKENS:=4096}\n api_token: ${env.VLLM_API_TOKEN:=fake}\n
|
||||||
- tool_runtime
|
\ tls_verify: ${env.VLLM_TLS_VERIFY:=true}\n - provider_id: sentence-transformers\n
|
||||||
- vector_io
|
\ provider_type: inline::sentence-transformers\n config: {}\n vector_io:\n
|
||||||
providers:
|
\ - provider_id: ${env.ENABLE_CHROMADB:+chromadb}\n provider_type: remote::chromadb\n
|
||||||
inference:
|
\ config:\n url: ${env.CHROMADB_URL:=}\n kvstore:\n type: postgres\n
|
||||||
- provider_id: vllm-inference
|
\ host: ${env.POSTGRES_HOST:=localhost}\n port: ${env.POSTGRES_PORT:=5432}\n
|
||||||
provider_type: remote::vllm
|
\ db: ${env.POSTGRES_DB:=llamastack}\n user: ${env.POSTGRES_USER:=llamastack}\n
|
||||||
config:
|
\ password: ${env.POSTGRES_PASSWORD:=llamastack}\n files:\n - provider_id:
|
||||||
url: ${env.VLLM_URL:=http://localhost:8000/v1}
|
meta-reference-files\n provider_type: inline::localfs\n config:\n storage_dir:
|
||||||
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
|
${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}\n metadata_store:\n
|
||||||
api_token: ${env.VLLM_API_TOKEN:=fake}
|
\ type: sqlite\n db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
|
||||||
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
|
\ \n safety:\n - provider_id: llama-guard\n provider_type: inline::llama-guard\n
|
||||||
- provider_id: vllm-safety
|
\ config:\n excluded_categories: []\n agents:\n - provider_id: meta-reference\n
|
||||||
provider_type: remote::vllm
|
\ provider_type: inline::meta-reference\n config:\n persistence_store:\n
|
||||||
config:
|
\ type: postgres\n host: ${env.POSTGRES_HOST:=localhost}\n port:
|
||||||
url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
|
${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n user:
|
||||||
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
|
${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\n
|
||||||
api_token: ${env.VLLM_API_TOKEN:=fake}
|
\ responses_store:\n type: postgres\n host: ${env.POSTGRES_HOST:=localhost}\n
|
||||||
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
|
\ port: ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n
|
||||||
- provider_id: sentence-transformers
|
\ user: ${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\n
|
||||||
provider_type: inline::sentence-transformers
|
\ telemetry:\n - provider_id: meta-reference\n provider_type: inline::meta-reference\n
|
||||||
config: {}
|
\ config:\n service_name: \"${env.OTEL_SERVICE_NAME:=\\u200B}\"\n sinks:
|
||||||
vector_io:
|
${env.TELEMETRY_SINKS:=console}\n tool_runtime:\n - provider_id: brave-search\n
|
||||||
- provider_id: ${env.ENABLE_CHROMADB:+chromadb}
|
\ provider_type: remote::brave-search\n config:\n api_key: ${env.BRAVE_SEARCH_API_KEY:+}\n
|
||||||
provider_type: remote::chromadb
|
\ max_results: 3\n - provider_id: tavily-search\n provider_type: remote::tavily-search\n
|
||||||
config:
|
\ config:\n api_key: ${env.TAVILY_SEARCH_API_KEY:+}\n max_results:
|
||||||
url: ${env.CHROMADB_URL:=}
|
3\n - provider_id: rag-runtime\n provider_type: inline::rag-runtime\n config:
|
||||||
kvstore:
|
{}\n - provider_id: model-context-protocol\n provider_type: remote::model-context-protocol\n
|
||||||
type: postgres
|
\ config: {}\nmetadata_store:\n type: postgres\n host: ${env.POSTGRES_HOST:=localhost}\n
|
||||||
host: ${env.POSTGRES_HOST:=localhost}
|
\ port: ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n user:
|
||||||
port: ${env.POSTGRES_PORT:=5432}
|
${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\n
|
||||||
db: ${env.POSTGRES_DB:=llamastack}
|
\ table_name: llamastack_kvstore\ninference_store:\n type: postgres\n host:
|
||||||
user: ${env.POSTGRES_USER:=llamastack}
|
${env.POSTGRES_HOST:=localhost}\n port: ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n
|
||||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
\ user: ${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\nmodels:\n-
|
||||||
safety:
|
metadata:\n embedding_dimension: 384\n model_id: all-MiniLM-L6-v2\n provider_id:
|
||||||
- provider_id: llama-guard
|
sentence-transformers\n model_type: embedding\n- metadata: {}\n model_id: ${env.INFERENCE_MODEL}\n
|
||||||
provider_type: inline::llama-guard
|
\ provider_id: vllm-inference\n model_type: llm\n- metadata: {}\n model_id:
|
||||||
config:
|
${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}\n provider_id: vllm-safety\n
|
||||||
excluded_categories: []
|
\ model_type: llm\nshields:\n- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}\nvector_dbs:
|
||||||
agents:
|
[]\ndatasets: []\nscoring_fns: []\nbenchmarks: []\ntool_groups:\n- toolgroup_id:
|
||||||
- provider_id: meta-reference
|
builtin::websearch\n provider_id: tavily-search\n- toolgroup_id: builtin::rag\n
|
||||||
provider_type: inline::meta-reference
|
\ provider_id: rag-runtime\nserver:\n port: 8321\n auth:\n provider_config:\n
|
||||||
config:
|
\ type: github_token\n"
|
||||||
persistence_store:
|
|
||||||
type: postgres
|
|
||||||
host: ${env.POSTGRES_HOST:=localhost}
|
|
||||||
port: ${env.POSTGRES_PORT:=5432}
|
|
||||||
db: ${env.POSTGRES_DB:=llamastack}
|
|
||||||
user: ${env.POSTGRES_USER:=llamastack}
|
|
||||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
|
||||||
responses_store:
|
|
||||||
type: postgres
|
|
||||||
host: ${env.POSTGRES_HOST:=localhost}
|
|
||||||
port: ${env.POSTGRES_PORT:=5432}
|
|
||||||
db: ${env.POSTGRES_DB:=llamastack}
|
|
||||||
user: ${env.POSTGRES_USER:=llamastack}
|
|
||||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
|
||||||
telemetry:
|
|
||||||
- provider_id: meta-reference
|
|
||||||
provider_type: inline::meta-reference
|
|
||||||
config:
|
|
||||||
service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
|
|
||||||
sinks: ${env.TELEMETRY_SINKS:=console}
|
|
||||||
tool_runtime:
|
|
||||||
- provider_id: brave-search
|
|
||||||
provider_type: remote::brave-search
|
|
||||||
config:
|
|
||||||
api_key: ${env.BRAVE_SEARCH_API_KEY:+}
|
|
||||||
max_results: 3
|
|
||||||
- provider_id: tavily-search
|
|
||||||
provider_type: remote::tavily-search
|
|
||||||
config:
|
|
||||||
api_key: ${env.TAVILY_SEARCH_API_KEY:+}
|
|
||||||
max_results: 3
|
|
||||||
- provider_id: rag-runtime
|
|
||||||
provider_type: inline::rag-runtime
|
|
||||||
config: {}
|
|
||||||
- provider_id: model-context-protocol
|
|
||||||
provider_type: remote::model-context-protocol
|
|
||||||
config: {}
|
|
||||||
metadata_store:
|
|
||||||
type: postgres
|
|
||||||
host: ${env.POSTGRES_HOST:=localhost}
|
|
||||||
port: ${env.POSTGRES_PORT:=5432}
|
|
||||||
db: ${env.POSTGRES_DB:=llamastack}
|
|
||||||
user: ${env.POSTGRES_USER:=llamastack}
|
|
||||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
|
||||||
table_name: llamastack_kvstore
|
|
||||||
inference_store:
|
|
||||||
type: postgres
|
|
||||||
host: ${env.POSTGRES_HOST:=localhost}
|
|
||||||
port: ${env.POSTGRES_PORT:=5432}
|
|
||||||
db: ${env.POSTGRES_DB:=llamastack}
|
|
||||||
user: ${env.POSTGRES_USER:=llamastack}
|
|
||||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
|
||||||
models:
|
|
||||||
- metadata:
|
|
||||||
embedding_dimension: 384
|
|
||||||
model_id: all-MiniLM-L6-v2
|
|
||||||
provider_id: sentence-transformers
|
|
||||||
model_type: embedding
|
|
||||||
- metadata: {}
|
|
||||||
model_id: ${env.INFERENCE_MODEL}
|
|
||||||
provider_id: vllm-inference
|
|
||||||
model_type: llm
|
|
||||||
- metadata: {}
|
|
||||||
model_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
|
|
||||||
provider_id: vllm-safety
|
|
||||||
model_type: llm
|
|
||||||
shields:
|
|
||||||
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
|
|
||||||
vector_dbs: []
|
|
||||||
datasets: []
|
|
||||||
scoring_fns: []
|
|
||||||
benchmarks: []
|
|
||||||
tool_groups:
|
|
||||||
- toolgroup_id: builtin::websearch
|
|
||||||
provider_id: tavily-search
|
|
||||||
- toolgroup_id: builtin::rag
|
|
||||||
provider_id: rag-runtime
|
|
||||||
server:
|
|
||||||
port: 8321
|
|
||||||
auth:
|
|
||||||
provider_config:
|
|
||||||
type: github_token
|
|
||||||
kind: ConfigMap
|
kind: ConfigMap
|
||||||
metadata:
|
metadata:
|
||||||
creationTimestamp: null
|
creationTimestamp: null
|
||||||
|
|
|
@ -3,6 +3,7 @@ image_name: kubernetes-demo
apis:
- agents
- inference
- files
- safety
- telemetry
- tool_runtime

@ -38,6 +39,14 @@ providers:
      db: ${env.POSTGRES_DB:=llamastack}
      user: ${env.POSTGRES_USER:=llamastack}
      password: ${env.POSTGRES_PASSWORD:=llamastack}
  files:
  - provider_id: meta-reference-files
    provider_type: inline::localfs
    config:
      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
      metadata_store:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
@@ -18,12 +18,13 @@ embedding_model_id = (
 ).identifier
 embedding_dimension = em.metadata["embedding_dimension"]

-_ = client.vector_dbs.register(
+vector_db = client.vector_dbs.register(
     vector_db_id=vector_db_id,
     embedding_model=embedding_model_id,
     embedding_dimension=embedding_dimension,
     provider_id="faiss",
 )
+vector_db_id = vector_db.identifier
 source = "https://www.paulgraham.com/greatwork.html"
 print("rag_tool> Ingesting document:", source)
 document = RAGDocument(
@@ -35,7 +36,7 @@ document = RAGDocument(
 client.tool_runtime.rag_tool.insert(
     documents=[document],
     vector_db_id=vector_db_id,
-    chunk_size_in_tokens=50,
+    chunk_size_in_tokens=100,
 )
 agent = Agent(
     client,
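Not part of the diff: a minimal sketch of the registration pattern the updated demo script relies on. The server may assign its own vector store identifier, so the script now reads it back from the returned resource instead of assuming the requested ID was kept. The client setup, model name, and document are illustrative assumptions.

    from llama_stack_client import LlamaStackClient, RAGDocument

    client = LlamaStackClient(base_url="http://localhost:8321")  # assumed local stack
    document = RAGDocument(document_id="doc-1", content="hello world", mime_type="text/plain", metadata={})

    vector_db = client.vector_dbs.register(
        vector_db_id="my-docs",              # requested ID; may be replaced server-side
        embedding_model="all-MiniLM-L6-v2",  # illustrative embedding model
        embedding_dimension=384,
        provider_id="faiss",
    )
    vector_db_id = vector_db.identifier      # authoritative ID for all later calls

    client.tool_runtime.rag_tool.insert(
        documents=[document],
        vector_db_id=vector_db_id,
        chunk_size_in_tokens=100,
    )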
@@ -8,3 +8,4 @@ Here's a list of known external providers that you can use with Llama Stack:
 | KubeFlow Pipelines | Train models with KubeFlow Pipelines | Post Training | Inline **and** Remote | [llama-stack-provider-kfp-trainer](https://github.com/opendatahub-io/llama-stack-provider-kfp-trainer) |
 | RamaLama | Inference models with RamaLama | Inference | Remote | [ramalama-stack](https://github.com/containers/ramalama-stack) |
 | TrustyAI LM-Eval | Evaluate models with TrustyAI LM-Eval | Eval | Remote | [llama-stack-provider-lmeval](https://github.com/trustyai-explainability/llama-stack-provider-lmeval) |
+| MongoDB | VectorIO with MongoDB | Vector_IO | Remote | [mongodb-llama-stack](https://github.com/mongodb-partners/mongodb-llama-stack) |
@@ -15,8 +15,8 @@ AWS Bedrock inference provider for accessing various AI models through AWS's managed service.
 | `profile_name` | `str \| None` | No | | The profile name that contains credentials to use.Default use environment variable: AWS_PROFILE |
 | `total_max_attempts` | `int \| None` | No | | An integer representing the maximum number of attempts that will be made for a single request, including the initial attempt. Default use environment variable: AWS_MAX_ATTEMPTS |
 | `retry_mode` | `str \| None` | No | | A string representing the type of retries Boto3 will perform.Default use environment variable: AWS_RETRY_MODE |
-| `connect_timeout` | `float \| None` | No | 60 | The time in seconds till a timeout exception is thrown when attempting to make a connection. The default is 60 seconds. |
+| `connect_timeout` | `float \| None` | No | 60.0 | The time in seconds till a timeout exception is thrown when attempting to make a connection. The default is 60 seconds. |
-| `read_timeout` | `float \| None` | No | 60 | The time in seconds till a timeout exception is thrown when attempting to read from a connection.The default is 60 seconds. |
+| `read_timeout` | `float \| None` | No | 60.0 | The time in seconds till a timeout exception is thrown when attempting to read from a connection.The default is 60 seconds. |
 | `session_ttl` | `int \| None` | No | 3600 | The time in seconds till a session expires. The default is 3600 seconds (1 hour). |

 ## Sample Configuration
@@ -15,8 +15,8 @@ AWS Bedrock safety provider for content moderation using AWS's safety services.
 | `profile_name` | `str \| None` | No | | The profile name that contains credentials to use.Default use environment variable: AWS_PROFILE |
 | `total_max_attempts` | `int \| None` | No | | An integer representing the maximum number of attempts that will be made for a single request, including the initial attempt. Default use environment variable: AWS_MAX_ATTEMPTS |
 | `retry_mode` | `str \| None` | No | | A string representing the type of retries Boto3 will perform.Default use environment variable: AWS_RETRY_MODE |
-| `connect_timeout` | `float \| None` | No | 60 | The time in seconds till a timeout exception is thrown when attempting to make a connection. The default is 60 seconds. |
+| `connect_timeout` | `float \| None` | No | 60.0 | The time in seconds till a timeout exception is thrown when attempting to make a connection. The default is 60 seconds. |
-| `read_timeout` | `float \| None` | No | 60 | The time in seconds till a timeout exception is thrown when attempting to read from a connection.The default is 60 seconds. |
+| `read_timeout` | `float \| None` | No | 60.0 | The time in seconds till a timeout exception is thrown when attempting to read from a connection.The default is 60 seconds. |
 | `session_ttl` | `int \| None` | No | 3600 | The time in seconds till a session expires. The default is 3600 seconds (1 hour). |

 ## Sample Configuration
@@ -79,3 +79,10 @@ class ConflictError(ValueError):

     def __init__(self, message: str) -> None:
         super().__init__(message)
+
+
+class TokenValidationError(ValueError):
+    """raised when token validation fails during authentication"""
+
+    def __init__(self, message: str) -> None:
+        super().__init__(message)
@@ -102,6 +102,7 @@ class Api(Enum, metaclass=DynamicApiMeta):
     :cvar benchmarks: Benchmark suite management
     :cvar tool_groups: Tool group organization
     :cvar files: File storage and management
+    :cvar prompts: Prompt versions and management
     :cvar inspect: Built-in system inspection and introspection
     """

@@ -127,6 +128,7 @@ class Api(Enum, metaclass=DynamicApiMeta):
     benchmarks = "benchmarks"
     tool_groups = "tool_groups"
     files = "files"
+    prompts = "prompts"

     # built-in API
     inspect = "inspect"
llama_stack/apis/prompts/__init__.py (new file, 9 lines)
@@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .prompts import ListPromptsResponse, Prompt, Prompts

__all__ = ["Prompt", "Prompts", "ListPromptsResponse"]
llama_stack/apis/prompts/prompts.py (new file, 189 lines)
@@ -0,0 +1,189 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import re
import secrets
from typing import Protocol, runtime_checkable

from pydantic import BaseModel, Field, field_validator, model_validator

from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod


@json_schema_type
class Prompt(BaseModel):
    """A prompt resource representing a stored OpenAI Compatible prompt template in Llama Stack.

    :param prompt: The system prompt text with variable placeholders. Variables are only supported when using the Responses API.
    :param version: Version (integer starting at 1, incremented on save)
    :param prompt_id: Unique identifier formatted as 'pmpt_<48-digit-hash>'
    :param variables: List of prompt variable names that can be used in the prompt template
    :param is_default: Boolean indicating whether this version is the default version for this prompt
    """

    prompt: str | None = Field(default=None, description="The system prompt with variable placeholders")
    version: int = Field(description="Version (integer starting at 1, incremented on save)", ge=1)
    prompt_id: str = Field(description="Unique identifier in format 'pmpt_<48-digit-hash>'")
    variables: list[str] = Field(
        default_factory=list, description="List of variable names that can be used in the prompt template"
    )
    is_default: bool = Field(
        default=False, description="Boolean indicating whether this version is the default version"
    )

    @field_validator("prompt_id")
    @classmethod
    def validate_prompt_id(cls, prompt_id: str) -> str:
        if not isinstance(prompt_id, str):
            raise TypeError("prompt_id must be a string in format 'pmpt_<48-digit-hash>'")

        if not prompt_id.startswith("pmpt_"):
            raise ValueError("prompt_id must start with 'pmpt_' prefix")

        hex_part = prompt_id[5:]
        if len(hex_part) != 48:
            raise ValueError("prompt_id must be in format 'pmpt_<48-digit-hash>' (48 lowercase hex chars)")

        for char in hex_part:
            if char not in "0123456789abcdef":
                raise ValueError("prompt_id hex part must contain only lowercase hex characters [0-9a-f]")

        return prompt_id

    @field_validator("version")
    @classmethod
    def validate_version(cls, prompt_version: int) -> int:
        if prompt_version < 1:
            raise ValueError("version must be >= 1")
        return prompt_version

    @model_validator(mode="after")
    def validate_prompt_variables(self):
        """Validate that all variables used in the prompt are declared in the variables list."""
        if not self.prompt:
            return self

        prompt_variables = set(re.findall(r"{{\s*(\w+)\s*}}", self.prompt))
        declared_variables = set(self.variables)

        undeclared = prompt_variables - declared_variables
        if undeclared:
            raise ValueError(f"Prompt contains undeclared variables: {sorted(undeclared)}")

        return self

    @classmethod
    def generate_prompt_id(cls) -> str:
        # Generate 48 hex characters (24 bytes)
        random_bytes = secrets.token_bytes(24)
        hex_string = random_bytes.hex()
        return f"pmpt_{hex_string}"


class ListPromptsResponse(BaseModel):
    """Response model to list prompts."""

    data: list[Prompt]


@runtime_checkable
@trace_protocol
class Prompts(Protocol):
    """Protocol for prompt management operations."""

    @webmethod(route="/prompts", method="GET")
    async def list_prompts(self) -> ListPromptsResponse:
        """List all prompts.

        :returns: A ListPromptsResponse containing all prompts.
        """
        ...

    @webmethod(route="/prompts/{prompt_id}/versions", method="GET")
    async def list_prompt_versions(
        self,
        prompt_id: str,
    ) -> ListPromptsResponse:
        """List all versions of a specific prompt.

        :param prompt_id: The identifier of the prompt to list versions for.
        :returns: A ListPromptsResponse containing all versions of the prompt.
        """
        ...

    @webmethod(route="/prompts/{prompt_id}", method="GET")
    async def get_prompt(
        self,
        prompt_id: str,
        version: int | None = None,
    ) -> Prompt:
        """Get a prompt by its identifier and optional version.

        :param prompt_id: The identifier of the prompt to get.
        :param version: The version of the prompt to get (defaults to latest).
        :returns: A Prompt resource.
        """
        ...

    @webmethod(route="/prompts", method="POST")
    async def create_prompt(
        self,
        prompt: str,
        variables: list[str] | None = None,
    ) -> Prompt:
        """Create a new prompt.

        :param prompt: The prompt text content with variable placeholders.
        :param variables: List of variable names that can be used in the prompt template.
        :returns: The created Prompt resource.
        """
        ...

    @webmethod(route="/prompts/{prompt_id}", method="PUT")
    async def update_prompt(
        self,
        prompt_id: str,
        prompt: str,
        version: int,
        variables: list[str] | None = None,
        set_as_default: bool = True,
    ) -> Prompt:
        """Update an existing prompt (increments version).

        :param prompt_id: The identifier of the prompt to update.
        :param prompt: The updated prompt text content.
        :param version: The current version of the prompt being updated.
        :param variables: Updated list of variable names that can be used in the prompt template.
        :param set_as_default: Set the new version as the default (default=True).
        :returns: The updated Prompt resource with incremented version.
        """
        ...

    @webmethod(route="/prompts/{prompt_id}", method="DELETE")
    async def delete_prompt(
        self,
        prompt_id: str,
    ) -> None:
        """Delete a prompt.

        :param prompt_id: The identifier of the prompt to delete.
        """
        ...

    @webmethod(route="/prompts/{prompt_id}/set-default-version", method="PUT")
    async def set_default_version(
        self,
        prompt_id: str,
        version: int,
    ) -> Prompt:
        """Set which version of a prompt should be the default in get_prompt (latest).

        :param prompt_id: The identifier of the prompt.
        :param version: The version to set as default.
        :returns: The prompt with the specified version now set as default.
        """
        ...
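Not part of the diff: a short sketch exercising the Prompt model defined above, generating an identifier and letting the model validator reject an undeclared {{ variable }}. Only names introduced in the new file are used.

    from llama_stack.apis.prompts import Prompt

    prompt_id = Prompt.generate_prompt_id()  # "pmpt_" followed by 48 lowercase hex chars
    ok = Prompt(
        prompt_id=prompt_id,
        prompt="Summarize {{ topic }} in one paragraph.",
        version=1,
        variables=["topic"],  # every template variable must be declared
    )

    try:
        Prompt(prompt_id=prompt_id, prompt="Hello {{ name }}", version=1, variables=[])
    except ValueError as e:
        print(e)  # Prompt contains undeclared variables: ['name']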
@@ -19,6 +19,7 @@ class ResourceType(StrEnum):
     benchmark = "benchmark"
     tool = "tool"
     tool_group = "tool_group"
+    prompt = "prompt"


 class Resource(BaseModel):
@@ -45,6 +45,7 @@ from llama_stack.core.utils.dynamic import instantiate_class_type
 from llama_stack.core.utils.exec import formulate_run_args, run_command
 from llama_stack.core.utils.image_types import LlamaStackImageType
 from llama_stack.providers.datatypes import Api
+from llama_stack.providers.utils.sqlstore.sqlstore import SqliteSqlStoreConfig

 DISTRIBS_PATH = Path(__file__).parent.parent.parent / "distributions"

@@ -294,6 +295,12 @@ def _generate_run_config(
         if build_config.external_providers_dir
         else EXTERNAL_PROVIDERS_DIR,
     )
+    if not run_config.inference_store:
+        run_config.inference_store = SqliteSqlStoreConfig(
+            **SqliteSqlStoreConfig.sample_run_config(
+                __distro_dir__=(DISTRIBS_BASE_DIR / image_name).as_posix(), db_name="inference_store.db"
+            )
+        )
     # build providers dict
     provider_registry = get_provider_registry(build_config)
     for api in apis:
@@ -7,6 +7,7 @@
 from enum import StrEnum
 from pathlib import Path
 from typing import Annotated, Any, Literal, Self
+from urllib.parse import urlparse

 from pydantic import BaseModel, Field, field_validator, model_validator

@@ -212,6 +213,7 @@ class AuthProviderType(StrEnum):
     OAUTH2_TOKEN = "oauth2_token"
     GITHUB_TOKEN = "github_token"
     CUSTOM = "custom"
+    KUBERNETES = "kubernetes"


 class OAuth2TokenAuthConfig(BaseModel):
@@ -282,8 +284,45 @@ class GitHubTokenAuthConfig(BaseModel):
     )


+class KubernetesAuthProviderConfig(BaseModel):
+    """Configuration for Kubernetes authentication provider."""
+
+    type: Literal[AuthProviderType.KUBERNETES] = AuthProviderType.KUBERNETES
+    api_server_url: str = Field(
+        default="https://kubernetes.default.svc",
+        description="Kubernetes API server URL (e.g., https://api.cluster.domain:6443)",
+    )
+    verify_tls: bool = Field(default=True, description="Whether to verify TLS certificates")
+    tls_cafile: Path | None = Field(default=None, description="Path to CA certificate file for TLS verification")
+    claims_mapping: dict[str, str] = Field(
+        default_factory=lambda: {
+            "username": "roles",
+            "groups": "roles",
+        },
+        description="Mapping of Kubernetes user claims to access attributes",
+    )
+
+    @field_validator("api_server_url")
+    @classmethod
+    def validate_api_server_url(cls, v):
+        parsed = urlparse(v)
+        if not parsed.scheme or not parsed.netloc:
+            raise ValueError(f"api_server_url must be a valid URL with scheme and host: {v}")
+        if parsed.scheme not in ["http", "https"]:
+            raise ValueError(f"api_server_url scheme must be http or https: {v}")
+        return v
+
+    @field_validator("claims_mapping")
+    @classmethod
+    def validate_claims_mapping(cls, v):
+        for key, value in v.items():
+            if not value:
+                raise ValueError(f"claims_mapping value cannot be empty: {key}")
+        return v
+
+
 AuthProviderConfig = Annotated[
-    OAuth2TokenAuthConfig | GitHubTokenAuthConfig | CustomAuthConfig,
+    OAuth2TokenAuthConfig | GitHubTokenAuthConfig | CustomAuthConfig | KubernetesAuthProviderConfig,
     Field(discriminator="type"),
 ]

@@ -392,6 +431,12 @@ class ServerConfig(BaseModel):
     )


+class InferenceStoreConfig(BaseModel):
+    sql_store_config: SqlStoreConfig
+    max_write_queue_size: int = Field(default=10000, description="Max queued writes for inference store")
+    num_writers: int = Field(default=4, description="Number of concurrent background writers")
+
+
 class StackRunConfig(BaseModel):
     version: int = LLAMA_STACK_RUN_CONFIG_VERSION

@@ -425,11 +470,12 @@ Configuration for the persistence store used by the distribution registry. If not specified,
 a default SQLite store will be used.""",
     )

-    inference_store: SqlStoreConfig | None = Field(
+    inference_store: InferenceStoreConfig | SqlStoreConfig | None = Field(
         default=None,
         description="""
-Configuration for the persistence store used by the inference API. If not specified,
-a default SQLite store will be used.""",
+Configuration for the persistence store used by the inference API. Can be either a
+InferenceStoreConfig (with queue tuning parameters) or a SqlStoreConfig (deprecated).
+If not specified, a default SQLite store will be used.""",
     )

     # registry of "resources" in the distribution
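Not part of the diff: a minimal sketch constructing the two new config models above, to show their defaults and validators. The URL, database path, and import of SqliteSqlStoreConfig are illustrative assumptions for the sketch.

    from llama_stack.core.datatypes import InferenceStoreConfig, KubernetesAuthProviderConfig
    from llama_stack.providers.utils.sqlstore.sqlstore import SqliteSqlStoreConfig

    k8s_auth = KubernetesAuthProviderConfig(
        api_server_url="https://api.cluster.example:6443",  # placeholder; must be http(s) with a host
        verify_tls=True,
    )
    print(k8s_auth.claims_mapping)  # defaults to {'username': 'roles', 'groups': 'roles'}

    store = InferenceStoreConfig(
        sql_store_config=SqliteSqlStoreConfig(db_path="/tmp/inference_store.db"),  # placeholder path
        max_write_queue_size=10000,  # default
        num_writers=4,               # default
    )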
@@ -10,7 +10,6 @@ import json
 import logging  # allow-direct-logging
 import os
 import sys
-from concurrent.futures import ThreadPoolExecutor
 from enum import Enum
 from io import BytesIO
 from pathlib import Path
@@ -148,7 +147,6 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
         self.async_client = AsyncLlamaStackAsLibraryClient(
             config_path_or_distro_name, custom_provider_registry, provider_data, skip_logger_removal
         )
-        self.pool_executor = ThreadPoolExecutor(max_workers=4)
         self.provider_data = provider_data

         self.loop = asyncio.new_event_loop()
llama_stack/core/prompts/prompts.py (new file, 233 lines)
@@ -0,0 +1,233 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import json
from typing import Any

from pydantic import BaseModel

from llama_stack.apis.prompts import ListPromptsResponse, Prompt, Prompts
from llama_stack.core.datatypes import StackRunConfig
from llama_stack.core.utils.config_dirs import DISTRIBS_BASE_DIR
from llama_stack.providers.utils.kvstore import KVStore, kvstore_impl
from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig


class PromptServiceConfig(BaseModel):
    """Configuration for the built-in prompt service.

    :param run_config: Stack run configuration containing distribution info
    """

    run_config: StackRunConfig


async def get_provider_impl(config: PromptServiceConfig, deps: dict[Any, Any]):
    """Get the prompt service implementation."""
    impl = PromptServiceImpl(config, deps)
    await impl.initialize()
    return impl


class PromptServiceImpl(Prompts):
    """Built-in prompt service implementation using KVStore."""

    def __init__(self, config: PromptServiceConfig, deps: dict[Any, Any]):
        self.config = config
        self.deps = deps
        self.kvstore: KVStore

    async def initialize(self) -> None:
        kvstore_config = SqliteKVStoreConfig(
            db_path=(DISTRIBS_BASE_DIR / self.config.run_config.image_name / "prompts.db").as_posix()
        )
        self.kvstore = await kvstore_impl(kvstore_config)

    def _get_default_key(self, prompt_id: str) -> str:
        """Get the KVStore key that stores the default version number."""
        return f"prompts:v1:{prompt_id}:default"

    async def _get_prompt_key(self, prompt_id: str, version: int | None = None) -> str:
        """Get the KVStore key for prompt data, returning default version if applicable."""
        if version:
            return self._get_version_key(prompt_id, str(version))

        default_key = self._get_default_key(prompt_id)
        resolved_version = await self.kvstore.get(default_key)
        if resolved_version is None:
            raise ValueError(f"Prompt {prompt_id}:default not found")
        return self._get_version_key(prompt_id, resolved_version)

    def _get_version_key(self, prompt_id: str, version: str) -> str:
        """Get the KVStore key for a specific prompt version."""
        return f"prompts:v1:{prompt_id}:{version}"

    def _get_list_key_prefix(self) -> str:
        """Get the key prefix for listing prompts."""
        return "prompts:v1:"

    def _serialize_prompt(self, prompt: Prompt) -> str:
        """Serialize a prompt to JSON string for storage."""
        return json.dumps(
            {
                "prompt_id": prompt.prompt_id,
                "prompt": prompt.prompt,
                "version": prompt.version,
                "variables": prompt.variables or [],
                "is_default": prompt.is_default,
            }
        )

    def _deserialize_prompt(self, data: str) -> Prompt:
        """Deserialize a prompt from JSON string."""
        obj = json.loads(data)
        return Prompt(
            prompt_id=obj["prompt_id"],
            prompt=obj["prompt"],
            version=obj["version"],
            variables=obj.get("variables", []),
            is_default=obj.get("is_default", False),
        )

    async def list_prompts(self) -> ListPromptsResponse:
        """List all prompts (default versions only)."""
        prefix = self._get_list_key_prefix()
        keys = await self.kvstore.keys_in_range(prefix, prefix + "\xff")

        prompts = []
        for key in keys:
            if key.endswith(":default"):
                try:
                    default_version = await self.kvstore.get(key)
                    if default_version:
                        prompt_id = key.replace(prefix, "").replace(":default", "")
                        version_key = self._get_version_key(prompt_id, default_version)
                        data = await self.kvstore.get(version_key)
                        if data:
                            prompt = self._deserialize_prompt(data)
                            prompts.append(prompt)
                except (json.JSONDecodeError, KeyError):
                    continue

        prompts.sort(key=lambda p: p.prompt_id or "", reverse=True)
        return ListPromptsResponse(data=prompts)

    async def get_prompt(self, prompt_id: str, version: int | None = None) -> Prompt:
        """Get a prompt by its identifier and optional version."""
        key = await self._get_prompt_key(prompt_id, version)
        data = await self.kvstore.get(key)
        if data is None:
            raise ValueError(f"Prompt {prompt_id}:{version if version else 'default'} not found")
        return self._deserialize_prompt(data)

    async def create_prompt(
        self,
        prompt: str,
        variables: list[str] | None = None,
    ) -> Prompt:
        """Create a new prompt."""
        if variables is None:
            variables = []

        prompt_obj = Prompt(
            prompt_id=Prompt.generate_prompt_id(),
            prompt=prompt,
            version=1,
            variables=variables,
        )

        version_key = self._get_version_key(prompt_obj.prompt_id, str(prompt_obj.version))
        data = self._serialize_prompt(prompt_obj)
        await self.kvstore.set(version_key, data)

        default_key = self._get_default_key(prompt_obj.prompt_id)
        await self.kvstore.set(default_key, str(prompt_obj.version))

        return prompt_obj

    async def update_prompt(
        self,
        prompt_id: str,
        prompt: str,
        version: int,
        variables: list[str] | None = None,
        set_as_default: bool = True,
    ) -> Prompt:
        """Update an existing prompt (increments version)."""
        if version < 1:
            raise ValueError("Version must be >= 1")
        if variables is None:
            variables = []

        prompt_versions = await self.list_prompt_versions(prompt_id)
        latest_prompt = max(prompt_versions.data, key=lambda x: int(x.version))

        if version and latest_prompt.version != version:
            raise ValueError(
                f"'{version}' is not the latest prompt version for prompt_id='{prompt_id}'. Use the latest version '{latest_prompt.version}' in request."
            )

        current_version = latest_prompt.version if version is None else version
        new_version = current_version + 1

        updated_prompt = Prompt(prompt_id=prompt_id, prompt=prompt, version=new_version, variables=variables)

        version_key = self._get_version_key(prompt_id, str(new_version))
        data = self._serialize_prompt(updated_prompt)
        await self.kvstore.set(version_key, data)

        if set_as_default:
            await self.set_default_version(prompt_id, new_version)

        return updated_prompt

    async def delete_prompt(self, prompt_id: str) -> None:
        """Delete a prompt and all its versions."""
        await self.get_prompt(prompt_id)

        prefix = f"prompts:v1:{prompt_id}:"
        keys = await self.kvstore.keys_in_range(prefix, prefix + "\xff")

        for key in keys:
            await self.kvstore.delete(key)

    async def list_prompt_versions(self, prompt_id: str) -> ListPromptsResponse:
        """List all versions of a specific prompt."""
        prefix = f"prompts:v1:{prompt_id}:"
        keys = await self.kvstore.keys_in_range(prefix, prefix + "\xff")

        default_version = None
        prompts = []

        for key in keys:
            data = await self.kvstore.get(key)
            if key.endswith(":default"):
                default_version = data
            else:
                if data:
                    prompt_obj = self._deserialize_prompt(data)
                    prompts.append(prompt_obj)

        if not prompts:
            raise ValueError(f"Prompt {prompt_id} not found")

        for prompt in prompts:
            prompt.is_default = str(prompt.version) == default_version

        prompts.sort(key=lambda x: x.version)
        return ListPromptsResponse(data=prompts)

    async def set_default_version(self, prompt_id: str, version: int) -> Prompt:
        """Set which version of a prompt should be the default, If not set. the default is the latest."""
        version_key = self._get_version_key(prompt_id, str(version))
        data = await self.kvstore.get(version_key)
        if data is None:
            raise ValueError(f"Prompt {prompt_id} version {version} not found")

        default_key = self._get_default_key(prompt_id)
        await self.kvstore.set(default_key, str(version))

        return self._deserialize_prompt(data)
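Not part of the diff: a rough sketch of the service lifecycle implemented above, covering create, update, and pinning a default version. It assumes a StackRunConfig for the running distribution is available; only methods defined in the new file are called.

    from llama_stack.core.prompts.prompts import PromptServiceConfig, PromptServiceImpl


    async def demo(run_config) -> None:  # run_config: a StackRunConfig for this distribution (assumed)
        svc = PromptServiceImpl(PromptServiceConfig(run_config=run_config), deps={})
        await svc.initialize()  # opens prompts.db under the distribution directory

        p1 = await svc.create_prompt("Summarize {{ topic }}.", variables=["topic"])  # version 1
        p2 = await svc.update_prompt(
            p1.prompt_id, "Summarize {{ topic }} briefly.", version=p1.version, variables=["topic"]
        )  # version 2 becomes the default
        await svc.set_default_version(p1.prompt_id, 1)  # pin version 1 again
        versions = await svc.list_prompt_versions(p1.prompt_id)
        print([(p.version, p.is_default) for p in versions.data])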
@@ -19,6 +19,7 @@ from llama_stack.apis.inference import Inference, InferenceProvider
 from llama_stack.apis.inspect import Inspect
 from llama_stack.apis.models import Models
 from llama_stack.apis.post_training import PostTraining
+from llama_stack.apis.prompts import Prompts
 from llama_stack.apis.providers import Providers as ProvidersAPI
 from llama_stack.apis.safety import Safety
 from llama_stack.apis.scoring import Scoring
@@ -93,6 +94,7 @@ def api_protocol_map(external_apis: dict[Api, ExternalApiSpec] | None = None) ->
         Api.tool_groups: ToolGroups,
         Api.tool_runtime: ToolRuntime,
         Api.files: Files,
+        Api.prompts: Prompts,
     }

     if external_apis:
@@ -284,7 +286,15 @@ async def instantiate_providers(
         if provider.provider_id is None:
             continue

-        deps = {a: impls[a] for a in provider.spec.api_dependencies}
+        try:
+            deps = {a: impls[a] for a in provider.spec.api_dependencies}
+        except KeyError as e:
+            missing_api = e.args[0]
+            raise RuntimeError(
+                f"Failed to resolve '{provider.spec.api.value}' provider '{provider.provider_id}' of type '{provider.spec.provider_type}': "
+                f"required dependency '{missing_api.value}' is not available. "
+                f"Please add a '{missing_api.value}' provider to your configuration or check if the provider is properly configured."
+            ) from e
         for a in provider.spec.optional_api_dependencies:
             if a in impls:
                 deps[a] = impls[a]
@@ -78,7 +78,10 @@ async def get_auto_router_impl(

     # TODO: move pass configs to routers instead
     if api == Api.inference and run_config.inference_store:
-        inference_store = InferenceStore(run_config.inference_store, policy)
+        inference_store = InferenceStore(
+            config=run_config.inference_store,
+            policy=policy,
+        )
         await inference_store.initialize()
         api_to_dep_impl["store"] = inference_store
@@ -63,7 +63,7 @@ from llama_stack.models.llama.llama3.chat_format import ChatFormat
 from llama_stack.models.llama.llama3.tokenizer import Tokenizer
 from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable
 from llama_stack.providers.utils.inference.inference_store import InferenceStore
-from llama_stack.providers.utils.telemetry.tracing import get_current_span
+from llama_stack.providers.utils.telemetry.tracing import enqueue_event, get_current_span

 logger = get_logger(name=__name__, category="core::routers")

@@ -90,6 +90,11 @@ class InferenceRouter(Inference):

     async def shutdown(self) -> None:
         logger.debug("InferenceRouter.shutdown")
+        if self.store:
+            try:
+                await self.store.shutdown()
+            except Exception as e:
+                logger.warning(f"Error during InferenceStore shutdown: {e}")

     async def register_model(
         self,
@@ -160,7 +165,7 @@ class InferenceRouter(Inference):
         metrics = self._construct_metrics(prompt_tokens, completion_tokens, total_tokens, model)
         if self.telemetry:
             for metric in metrics:
-                await self.telemetry.log_event(metric)
+                enqueue_event(metric)
         return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in metrics]

     async def _count_tokens(
@@ -431,7 +436,7 @@ class InferenceRouter(Inference):
                 model=model_obj,
             )
             for metric in metrics:
-                await self.telemetry.log_event(metric)
+                enqueue_event(metric)

             # these metrics will show up in the client response.
             response.metrics = (
@@ -527,7 +532,7 @@ class InferenceRouter(Inference):

         # Store the response with the ID that will be returned to the client
         if self.store:
-            await self.store.store_chat_completion(response, messages)
+            asyncio.create_task(self.store.store_chat_completion(response, messages))

         if self.telemetry:
             metrics = self._construct_metrics(
@@ -537,7 +542,7 @@ class InferenceRouter(Inference):
                 model=model_obj,
             )
             for metric in metrics:
-                await self.telemetry.log_event(metric)
+                enqueue_event(metric)
             # these metrics will show up in the client response.
             response.metrics = (
                 metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics
@@ -664,7 +669,7 @@ class InferenceRouter(Inference):
                     "completion_tokens",
                     "total_tokens",
                 ]:  # Only log completion and total tokens
-                    await self.telemetry.log_event(metric)
+                    enqueue_event(metric)

             # Return metrics in response
             async_metrics = [
@@ -710,7 +715,7 @@ class InferenceRouter(Inference):
             )
             for metric in completion_metrics:
                 if metric.metric in ["completion_tokens", "total_tokens"]:  # Only log completion and total tokens
-                    await self.telemetry.log_event(metric)
+                    enqueue_event(metric)

             # Return metrics in response
             return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics]
@@ -755,7 +760,7 @@ class InferenceRouter(Inference):
                     choices_data[idx] = {
                         "content_parts": [],
                         "tool_calls_builder": {},
-                        "finish_reason": None,
+                        "finish_reason": "stop",
                         "logprobs_content_parts": [],
                     }
                 current_choice_data = choices_data[idx]
@@ -806,7 +811,7 @@ class InferenceRouter(Inference):
                 model=model,
             )
             for metric in metrics:
-                await self.telemetry.log_event(metric)
+                enqueue_event(metric)

             yield chunk
     finally:
@@ -855,4 +860,4 @@ class InferenceRouter(Inference):
             object="chat.completion",
         )
         logger.debug(f"InferenceRouter.completion_response: {final_response}")
-        await self.store.store_chat_completion(final_response, messages)
+        asyncio.create_task(self.store.store_chat_completion(final_response, messages))
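Not part of the diff: a tiny, generic sketch of the pattern the router now uses, scheduling the store write with asyncio.create_task so the caller is not blocked on persistence, at the cost of best-effort durability. The store and response objects are placeholders.

    import asyncio


    async def handle_request(store, response, messages):
        # Before: the handler awaited the write, so the caller waited on the database.
        # await store.store_chat_completion(response, messages)

        # After: schedule the write in the background and return immediately.
        asyncio.create_task(store.store_chat_completion(response, messages))
        return response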
@@ -53,6 +53,7 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
         vector_db_name: str | None = None,
     ) -> VectorDB:
         provider_vector_db_id = provider_vector_db_id or vector_db_id
+
         model = await lookup_model(self, embedding_model)
         if model is None:
             raise ModelNotFoundError(embedding_model)
@@ -60,14 +61,33 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
             raise ModelTypeError(embedding_model, model.model_type, ModelType.embedding)
         if "embedding_dimension" not in model.metadata:
             raise ValueError(f"Model {embedding_model} does not have an embedding dimension")
+
+        provider = self.impls_by_provider_id[provider_id]
+        logger.warning(
+            "VectorDB is being deprecated in future releases in favor of VectorStore. Please migrate your usage accordingly."
+        )
+        vector_store = await provider.openai_create_vector_store(
+            name=vector_db_name or vector_db_id,
+            embedding_model=embedding_model,
+            embedding_dimension=model.metadata["embedding_dimension"],
+            provider_id=provider_id,
+            provider_vector_db_id=provider_vector_db_id,
+        )
+
+        vector_store_id = vector_store.id
+        actual_provider_vector_db_id = provider_vector_db_id or vector_store_id
+        logger.warning(
+            f"Ignoring vector_db_id {vector_db_id} and using vector_store_id {vector_store_id} instead. Setting VectorDB {vector_db_id} to VectorDB.vector_db_name"
+        )
+
         vector_db_data = {
-            "identifier": vector_db_id,
+            "identifier": vector_store_id,
             "type": ResourceType.vector_db.value,
             "provider_id": provider_id,
-            "provider_resource_id": provider_vector_db_id,
+            "provider_resource_id": actual_provider_vector_db_id,
             "embedding_model": embedding_model,
             "embedding_dimension": model.metadata["embedding_dimension"],
-            "vector_db_name": vector_db_name,
+            "vector_db_name": vector_store.name,
         }
         vector_db = TypeAdapter(VectorDBWithOwner).validate_python(vector_db_data)
         await self.register_object(vector_db)
@@ -8,16 +8,18 @@ import ssl
 import time
 from abc import ABC, abstractmethod
 from asyncio import Lock
-from urllib.parse import parse_qs, urlparse
+from urllib.parse import parse_qs, urljoin, urlparse

 import httpx
 from jose import jwt
 from pydantic import BaseModel, Field

+from llama_stack.apis.common.errors import TokenValidationError
 from llama_stack.core.datatypes import (
     AuthenticationConfig,
     CustomAuthConfig,
     GitHubTokenAuthConfig,
+    KubernetesAuthProviderConfig,
     OAuth2TokenAuthConfig,
     User,
 )
@@ -162,7 +164,7 @@ class OAuth2TokenAuthProvider(AuthProvider):
                     auth=auth,
                     timeout=10.0,  # Add a reasonable timeout
                 )
-                if response.status_code != 200:
+                if response.status_code != httpx.codes.OK:
                     logger.warning(f"Token introspection failed with status code: {response.status_code}")
                     raise ValueError(f"Token introspection failed: {response.status_code}")

@@ -272,7 +274,7 @@ class CustomAuthProvider(AuthProvider):
                     json=auth_request.model_dump(),
                     timeout=10.0,  # Add a reasonable timeout
                 )
-                if response.status_code != 200:
+                if response.status_code != httpx.codes.OK:
                     logger.warning(f"Authentication failed with status code: {response.status_code}")
                     raise ValueError(f"Authentication failed: {response.status_code}")

@@ -374,6 +376,89 @@ async def _get_github_user_info(access_token: str, github_api_base_url: str) ->
     }


+class KubernetesAuthProvider(AuthProvider):
+    """
+    Kubernetes authentication provider that validates tokens using the Kubernetes SelfSubjectReview API.
+    This provider integrates with Kubernetes API server by using the
+    /apis/authentication.k8s.io/v1/selfsubjectreviews endpoint to validate tokens and extract user information.
+    """
+
+    def __init__(self, config: KubernetesAuthProviderConfig):
+        self.config = config
+
+    def _httpx_verify_value(self) -> bool | str:
+        """
+        Build the value for httpx's `verify` parameter.
+        - False disables verification.
+        - Path string points to a CA bundle.
+        - True uses system defaults.
+        """
+        if not self.config.verify_tls:
+            return False
+        if self.config.tls_cafile:
+            return self.config.tls_cafile.as_posix()
+        return True
+
+    async def validate_token(self, token: str, scope: dict | None = None) -> User:
+        """Validate a token using Kubernetes SelfSubjectReview API endpoint."""
+        # Build the Kubernetes SelfSubjectReview API endpoint URL
+        review_api_url = urljoin(self.config.api_server_url, "/apis/authentication.k8s.io/v1/selfsubjectreviews")
+
+        # Create SelfSubjectReview request body
+        review_request = {"apiVersion": "authentication.k8s.io/v1", "kind": "SelfSubjectReview"}
+        verify = self._httpx_verify_value()
+
+        try:
+            async with httpx.AsyncClient(verify=verify, timeout=10.0) as client:
+                response = await client.post(
+                    review_api_url,
+                    json=review_request,
+                    headers={
+                        "Authorization": f"Bearer {token}",
+                        "Content-Type": "application/json",
+                    },
+                )
+
+                if response.status_code == httpx.codes.UNAUTHORIZED:
+                    raise TokenValidationError("Invalid token")
+                if response.status_code != httpx.codes.CREATED:
+                    logger.warning(f"Kubernetes SelfSubjectReview API failed with status code: {response.status_code}")
+                    raise TokenValidationError(f"Token validation failed: {response.status_code}")
+
+                review_response = response.json()
+                # Extract user information from SelfSubjectReview response
+                status = review_response.get("status", {})
+                if not status:
+                    raise ValueError("No status found in SelfSubjectReview response")
+
+                user_info = status.get("userInfo", {})
+                if not user_info:
+                    raise ValueError("No userInfo found in SelfSubjectReview response")
+
+                username = user_info.get("username")
+                if not username:
+                    raise ValueError("No username found in SelfSubjectReview response")
+
+                # Build user attributes from Kubernetes user info
+                user_attributes = get_attributes_from_claims(user_info, self.config.claims_mapping)
+
+                return User(
+                    principal=username,
+                    attributes=user_attributes,
+                )
+
+        except httpx.TimeoutException:
+            logger.warning("Kubernetes SelfSubjectReview API request timed out")
+            raise ValueError("Token validation timeout") from None
+        except Exception as e:
+            logger.warning(f"Error during token validation: {str(e)}")
+            raise ValueError(f"Token validation error: {str(e)}") from e
+
+    async def close(self):
+        """Close any resources."""
+        pass
+
+
 def create_auth_provider(config: AuthenticationConfig) -> AuthProvider:
     """Factory function to create the appropriate auth provider."""
     provider_config = config.provider_config
@@ -384,5 +469,7 @@ def create_auth_provider(config: AuthenticationConfig) -> AuthProvider:
         return OAuth2TokenAuthProvider(provider_config)
     elif isinstance(provider_config, GitHubTokenAuthConfig):
         return GitHubTokenAuthProvider(provider_config)
+    elif isinstance(provider_config, KubernetesAuthProviderConfig):
+        return KubernetesAuthProvider(provider_config)
     else:
         raise ValueError(f"Unknown authentication provider config type: {type(provider_config)}")
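Not part of the diff: the shape of the SelfSubjectReview exchange the provider above relies on, shown as plain data. The response body is an illustrative example of what the authentication.k8s.io/v1 API typically returns; exact usernames and groups will differ per cluster.

    # Request body posted to /apis/authentication.k8s.io/v1/selfsubjectreviews
    review_request = {"apiVersion": "authentication.k8s.io/v1", "kind": "SelfSubjectReview"}

    # Illustrative 201 Created response; the provider reads status.userInfo.
    review_response = {
        "apiVersion": "authentication.k8s.io/v1",
        "kind": "SelfSubjectReview",
        "status": {
            "userInfo": {
                "username": "system:serviceaccount:default:my-app",
                "groups": ["system:serviceaccounts", "system:authenticated"],
            }
        },
    }

    user_info = review_response["status"]["userInfo"]
    principal = user_info["username"]  # becomes User.principal
    # user_info is then passed to get_attributes_from_claims(...) with the configured claims_mapping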
@@ -132,15 +132,17 @@ def translate_exception(exc: Exception) -> HTTPException | RequestValidationErro
             },
         )
     elif isinstance(exc, ConflictError):
-        return HTTPException(status_code=409, detail=str(exc))
+        return HTTPException(status_code=httpx.codes.CONFLICT, detail=str(exc))
     elif isinstance(exc, ResourceNotFoundError):
-        return HTTPException(status_code=404, detail=str(exc))
+        return HTTPException(status_code=httpx.codes.NOT_FOUND, detail=str(exc))
     elif isinstance(exc, ValueError):
         return HTTPException(status_code=httpx.codes.BAD_REQUEST, detail=f"Invalid value: {str(exc)}")
     elif isinstance(exc, BadRequestError):
         return HTTPException(status_code=httpx.codes.BAD_REQUEST, detail=str(exc))
     elif isinstance(exc, PermissionError | AccessDeniedError):
         return HTTPException(status_code=httpx.codes.FORBIDDEN, detail=f"Permission denied: {str(exc)}")
+    elif isinstance(exc, ConnectionError | httpx.ConnectError):
+        return HTTPException(status_code=httpx.codes.BAD_GATEWAY, detail=str(exc))
     elif isinstance(exc, asyncio.TimeoutError | TimeoutError):
         return HTTPException(status_code=httpx.codes.GATEWAY_TIMEOUT, detail=f"Operation timed out: {str(exc)}")
     elif isinstance(exc, NotImplementedError):
@@ -513,6 +515,7 @@ def main(args: argparse.Namespace | None = None):

         apis_to_serve.add("inspect")
         apis_to_serve.add("providers")
+        apis_to_serve.add("prompts")
     for api_str in apis_to_serve:
         api = Api(api_str)
@ -24,6 +24,7 @@ from llama_stack.apis.inference import Inference
|
||||||
from llama_stack.apis.inspect import Inspect
|
from llama_stack.apis.inspect import Inspect
|
||||||
from llama_stack.apis.models import Models
|
from llama_stack.apis.models import Models
|
||||||
from llama_stack.apis.post_training import PostTraining
|
from llama_stack.apis.post_training import PostTraining
|
||||||
|
from llama_stack.apis.prompts import Prompts
|
||||||
from llama_stack.apis.providers import Providers
|
from llama_stack.apis.providers import Providers
|
||||||
from llama_stack.apis.safety import Safety
|
from llama_stack.apis.safety import Safety
|
||||||
from llama_stack.apis.scoring import Scoring
|
from llama_stack.apis.scoring import Scoring
|
||||||
|
@ -37,6 +38,7 @@ from llama_stack.apis.vector_io import VectorIO
|
||||||
from llama_stack.core.datatypes import Provider, StackRunConfig
|
from llama_stack.core.datatypes import Provider, StackRunConfig
|
||||||
from llama_stack.core.distribution import get_provider_registry
|
from llama_stack.core.distribution import get_provider_registry
|
||||||
from llama_stack.core.inspect import DistributionInspectConfig, DistributionInspectImpl
|
from llama_stack.core.inspect import DistributionInspectConfig, DistributionInspectImpl
|
||||||
|
from llama_stack.core.prompts.prompts import PromptServiceConfig, PromptServiceImpl
|
||||||
from llama_stack.core.providers import ProviderImpl, ProviderImplConfig
|
from llama_stack.core.providers import ProviderImpl, ProviderImplConfig
|
||||||
from llama_stack.core.resolver import ProviderRegistry, resolve_impls
|
from llama_stack.core.resolver import ProviderRegistry, resolve_impls
|
||||||
from llama_stack.core.routing_tables.common import CommonRoutingTableImpl
|
from llama_stack.core.routing_tables.common import CommonRoutingTableImpl
|
||||||
|
@ -72,6 +74,7 @@ class LlamaStack(
|
||||||
ToolRuntime,
|
ToolRuntime,
|
||||||
RAGToolRuntime,
|
RAGToolRuntime,
|
||||||
Files,
|
Files,
|
||||||
|
Prompts,
|
||||||
):
|
):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@ -305,6 +308,12 @@ def add_internal_implementations(impls: dict[Api, Any], run_config: StackRunConf
|
||||||
)
|
)
|
||||||
impls[Api.providers] = providers_impl
|
impls[Api.providers] = providers_impl
|
||||||
|
|
||||||
|
prompts_impl = PromptServiceImpl(
|
||||||
|
PromptServiceConfig(run_config=run_config),
|
||||||
|
deps=impls,
|
||||||
|
)
|
||||||
|
impls[Api.prompts] = prompts_impl
|
||||||
|
|
||||||
|
|
||||||
# Produces a stack of providers for the given run config. Not all APIs may be
|
# Produces a stack of providers for the given run config. Not all APIs may be
|
||||||
# asked for in the run config.
|
# asked for in the run config.
|
||||||
|
@ -329,6 +338,9 @@ async def construct_stack(
|
||||||
# Add internal implementations after all other providers are resolved
|
# Add internal implementations after all other providers are resolved
|
||||||
add_internal_implementations(impls, run_config)
|
add_internal_implementations(impls, run_config)
|
||||||
|
|
||||||
|
if Api.prompts in impls:
|
||||||
|
await impls[Api.prompts].initialize()
|
||||||
|
|
||||||
await register_resources(run_config, impls)
|
await register_resources(run_config, impls)
|
||||||
|
|
||||||
await refresh_registry_once(impls)
|
await refresh_registry_once(impls)
|
||||||
|
|
|
@ -11,9 +11,7 @@ from ..starter.starter import get_distribution_template as get_starter_distribut
|
||||||
|
|
||||||
|
|
||||||
def get_distribution_template() -> DistributionTemplate:
|
def get_distribution_template() -> DistributionTemplate:
|
||||||
template = get_starter_distribution_template()
|
template = get_starter_distribution_template(name="ci-tests")
|
||||||
name = "ci-tests"
|
|
||||||
template.name = name
|
|
||||||
template.description = "CI tests for Llama Stack"
|
template.description = "CI tests for Llama Stack"
|
||||||
|
|
||||||
return template
|
return template
|
||||||
|
|
|
@ -89,28 +89,28 @@ providers:
|
||||||
config:
|
config:
|
||||||
kvstore:
|
kvstore:
|
||||||
type: sqlite
|
type: sqlite
|
||||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/faiss_store.db
|
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/faiss_store.db
|
||||||
- provider_id: sqlite-vec
|
- provider_id: sqlite-vec
|
||||||
provider_type: inline::sqlite-vec
|
provider_type: inline::sqlite-vec
|
||||||
config:
|
config:
|
||||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec.db
|
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/sqlite_vec.db
|
||||||
kvstore:
|
kvstore:
|
||||||
type: sqlite
|
type: sqlite
|
||||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec_registry.db
|
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/sqlite_vec_registry.db
|
||||||
- provider_id: ${env.MILVUS_URL:+milvus}
|
- provider_id: ${env.MILVUS_URL:+milvus}
|
||||||
provider_type: inline::milvus
|
provider_type: inline::milvus
|
||||||
config:
|
config:
|
||||||
db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter}/milvus.db
|
db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/ci-tests}/milvus.db
|
||||||
kvstore:
|
kvstore:
|
||||||
type: sqlite
|
type: sqlite
|
||||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/milvus_registry.db
|
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/milvus_registry.db
|
||||||
- provider_id: ${env.CHROMADB_URL:+chromadb}
|
- provider_id: ${env.CHROMADB_URL:+chromadb}
|
||||||
provider_type: remote::chromadb
|
provider_type: remote::chromadb
|
||||||
config:
|
config:
|
||||||
url: ${env.CHROMADB_URL:=}
|
url: ${env.CHROMADB_URL:=}
|
||||||
kvstore:
|
kvstore:
|
||||||
type: sqlite
|
type: sqlite
|
||||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter/}/chroma_remote_registry.db
|
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests/}/chroma_remote_registry.db
|
||||||
- provider_id: ${env.PGVECTOR_DB:+pgvector}
|
- provider_id: ${env.PGVECTOR_DB:+pgvector}
|
||||||
provider_type: remote::pgvector
|
provider_type: remote::pgvector
|
||||||
config:
|
config:
|
||||||
|
@ -121,15 +121,15 @@ providers:
|
||||||
password: ${env.PGVECTOR_PASSWORD:=}
|
password: ${env.PGVECTOR_PASSWORD:=}
|
||||||
kvstore:
|
kvstore:
|
||||||
type: sqlite
|
type: sqlite
|
||||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/pgvector_registry.db
|
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/pgvector_registry.db
|
||||||
files:
|
files:
|
||||||
- provider_id: meta-reference-files
|
- provider_id: meta-reference-files
|
||||||
provider_type: inline::localfs
|
provider_type: inline::localfs
|
||||||
config:
|
config:
|
||||||
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
|
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/ci-tests/files}
|
||||||
metadata_store:
|
metadata_store:
|
||||||
type: sqlite
|
type: sqlite
|
||||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
|
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/files_metadata.db
|
||||||
safety:
|
safety:
|
||||||
- provider_id: llama-guard
|
- provider_id: llama-guard
|
||||||
provider_type: inline::llama-guard
|
provider_type: inline::llama-guard
|
||||||
|
|
|
@ -89,28 +89,28 @@ providers:
|
||||||
config:
|
config:
|
||||||
kvstore:
|
kvstore:
|
||||||
type: sqlite
|
type: sqlite
|
||||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/faiss_store.db
|
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/faiss_store.db
|
||||||
- provider_id: sqlite-vec
|
- provider_id: sqlite-vec
|
||||||
provider_type: inline::sqlite-vec
|
provider_type: inline::sqlite-vec
|
||||||
config:
|
config:
|
||||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec.db
|
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/sqlite_vec.db
|
||||||
kvstore:
|
kvstore:
|
||||||
type: sqlite
|
type: sqlite
|
||||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec_registry.db
|
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/sqlite_vec_registry.db
|
||||||
- provider_id: ${env.MILVUS_URL:+milvus}
|
- provider_id: ${env.MILVUS_URL:+milvus}
|
||||||
provider_type: inline::milvus
|
provider_type: inline::milvus
|
||||||
config:
|
config:
|
||||||
db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter}/milvus.db
|
db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter-gpu}/milvus.db
|
||||||
kvstore:
|
kvstore:
|
||||||
type: sqlite
|
type: sqlite
|
||||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/milvus_registry.db
|
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/milvus_registry.db
|
||||||
- provider_id: ${env.CHROMADB_URL:+chromadb}
|
- provider_id: ${env.CHROMADB_URL:+chromadb}
|
||||||
provider_type: remote::chromadb
|
provider_type: remote::chromadb
|
||||||
config:
|
config:
|
||||||
url: ${env.CHROMADB_URL:=}
|
url: ${env.CHROMADB_URL:=}
|
||||||
kvstore:
|
kvstore:
|
||||||
type: sqlite
|
type: sqlite
|
||||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter/}/chroma_remote_registry.db
|
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu/}/chroma_remote_registry.db
|
||||||
- provider_id: ${env.PGVECTOR_DB:+pgvector}
|
- provider_id: ${env.PGVECTOR_DB:+pgvector}
|
||||||
provider_type: remote::pgvector
|
provider_type: remote::pgvector
|
||||||
config:
|
config:
|
||||||
|
@ -121,15 +121,15 @@ providers:
|
||||||
password: ${env.PGVECTOR_PASSWORD:=}
|
password: ${env.PGVECTOR_PASSWORD:=}
|
||||||
kvstore:
|
kvstore:
|
||||||
type: sqlite
|
type: sqlite
|
||||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/pgvector_registry.db
|
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/pgvector_registry.db
|
||||||
files:
|
files:
|
||||||
- provider_id: meta-reference-files
|
- provider_id: meta-reference-files
|
||||||
provider_type: inline::localfs
|
provider_type: inline::localfs
|
||||||
config:
|
config:
|
||||||
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
|
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter-gpu/files}
|
||||||
metadata_store:
|
metadata_store:
|
||||||
type: sqlite
|
type: sqlite
|
||||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
|
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/files_metadata.db
|
||||||
safety:
|
safety:
|
||||||
- provider_id: llama-guard
|
- provider_id: llama-guard
|
||||||
provider_type: inline::llama-guard
|
provider_type: inline::llama-guard
|
||||||
|
|
|
@ -11,9 +11,7 @@ from ..starter.starter import get_distribution_template as get_starter_distribut
|
||||||
|
|
||||||
|
|
||||||
def get_distribution_template() -> DistributionTemplate:
|
def get_distribution_template() -> DistributionTemplate:
|
||||||
template = get_starter_distribution_template()
|
template = get_starter_distribution_template(name="starter-gpu")
|
||||||
name = "starter-gpu"
|
|
||||||
template.name = name
|
|
||||||
template.description = "Quick start template for running Llama Stack with several popular providers. This distribution is intended for GPU-enabled environments."
|
template.description = "Quick start template for running Llama Stack with several popular providers. This distribution is intended for GPU-enabled environments."
|
||||||
|
|
||||||
template.providers["post_training"] = [
|
template.providers["post_training"] = [
|
||||||
|
|
|
@ -99,9 +99,8 @@ def get_remote_inference_providers() -> list[Provider]:
|
||||||
return inference_providers
|
return inference_providers
|
||||||
|
|
||||||
|
|
||||||
def get_distribution_template() -> DistributionTemplate:
|
def get_distribution_template(name: str = "starter") -> DistributionTemplate:
|
||||||
remote_inference_providers = get_remote_inference_providers()
|
remote_inference_providers = get_remote_inference_providers()
|
||||||
name = "starter"
|
|
||||||
|
|
||||||
providers = {
|
providers = {
|
||||||
"inference": [BuildProvider(provider_type=p.provider_type, module=p.module) for p in remote_inference_providers]
|
"inference": [BuildProvider(provider_type=p.provider_type, module=p.module) for p in remote_inference_providers]
|
||||||
|
|
|
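The starter template now takes the distribution name as a parameter, so derived templates (ci-tests, starter-gpu) get correctly named storage paths at build time instead of patching `template.name` afterwards, which is also why the generated run.yaml paths above change from `starter` to the derived name. A standalone sketch of the pattern (toy function, not the real template code):

```python
# Sketch only: "pass the name down" instead of "build with a default and mutate afterwards".
def get_starter_template(name: str = "starter") -> dict:
    # storage paths and other defaults are derived from `name` when the template is built
    return {"name": name, "faiss_store": f"~/.llama/distributions/{name}/faiss_store.db"}

ci_tests = get_starter_template(name="ci-tests")
assert ci_tests["faiss_store"].endswith("/ci-tests/faiss_store.db")
```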
@@ -178,9 +178,9 @@ class ReferenceBatchesImpl(Batches):

         # TODO: set expiration time for garbage collection

-        if endpoint not in ["/v1/chat/completions"]:
+        if endpoint not in ["/v1/chat/completions", "/v1/completions"]:
             raise ValueError(
-                f"Invalid endpoint: {endpoint}. Supported values: /v1/chat/completions. Code: invalid_value. Param: endpoint",
+                f"Invalid endpoint: {endpoint}. Supported values: /v1/chat/completions, /v1/completions. Code: invalid_value. Param: endpoint",
             )

         if completion_window != "24h":
@@ -424,13 +424,21 @@ class ReferenceBatchesImpl(Batches):
                 )
                 valid = False

-            for param, expected_type, type_string in [
-                ("model", str, "a string"),
-                # messages is specific to /v1/chat/completions
-                # we could skip validating messages here and let inference fail. however,
-                # that would be a very expensive way to find out messages is wrong.
-                ("messages", list, "an array"),  # TODO: allow messages to be a string?
-            ]:
+            if batch.endpoint == "/v1/chat/completions":
+                required_params = [
+                    ("model", str, "a string"),
+                    # messages is specific to /v1/chat/completions
+                    # we could skip validating messages here and let inference fail. however,
+                    # that would be a very expensive way to find out messages is wrong.
+                    ("messages", list, "an array"),  # TODO: allow messages to be a string?
+                ]
+            else:  # /v1/completions
+                required_params = [
+                    ("model", str, "a string"),
+                    ("prompt", str, "a string"),  # TODO: allow prompt to be a list of strings??
+                ]
+
+            for param, expected_type, type_string in required_params:
                 if param not in body:
                     errors.append(
                         BatchError(
@@ -591,6 +599,7 @@ class ReferenceBatchesImpl(Batches):

         try:
             # TODO(SECURITY): review body for security issues
+            if request.url == "/v1/chat/completions":
                 request.body["messages"] = [convert_to_openai_message_param(msg) for msg in request.body["messages"]]
                 chat_response = await self.inference_api.openai_chat_completion(**request.body)

@@ -605,6 +614,22 @@ class ReferenceBatchesImpl(Batches):
                         "body": chat_response.model_dump_json(),
                     },
                 }
+            else:  # /v1/completions
+                completion_response = await self.inference_api.openai_completion(**request.body)
+
+                # this is for mypy, we don't allow streaming so we'll get the right type
+                assert hasattr(completion_response, "model_dump_json"), (
+                    "Completion response must have model_dump_json method"
+                )
+                return {
+                    "id": request_id,
+                    "custom_id": request.custom_id,
+                    "response": {
+                        "status_code": 200,
+                        "request_id": request_id,
+                        "body": completion_response.model_dump_json(),
+                    },
+                }
         except Exception as e:
             logger.info(f"Error processing request {request.custom_id} in batch {batch_id}: {e}")
             return {
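With these hunks the reference batches provider accepts `/v1/completions` alongside `/v1/chat/completions`. A sketch of what one batch-input JSONL line for the new endpoint might look like; the layout is assumed to mirror the OpenAI batch-input format (`custom_id`, `method`, `url`, `body`), and only `body.model` and `body.prompt` are taken from the validation above. The model id is a placeholder.

```python
# Sketch only: a plausible /v1/completions batch request line; "method" and the overall
# JSONL layout are assumptions, the model name is a placeholder.
import json

line = {
    "custom_id": "request-1",
    "method": "POST",
    "url": "/v1/completions",
    "body": {
        "model": "llama3.2:3b",               # placeholder model id
        "prompt": "Say hello in one word.",   # `prompt` must be a string for this endpoint
    },
}
print(json.dumps(line))
```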
@@ -14,6 +14,6 @@ from .config import RagToolRuntimeConfig
 async def get_provider_impl(config: RagToolRuntimeConfig, deps: dict[Api, Any]):
     from .memory import MemoryToolRuntimeImpl

-    impl = MemoryToolRuntimeImpl(config, deps[Api.vector_io], deps[Api.inference])
+    impl = MemoryToolRuntimeImpl(config, deps[Api.vector_io], deps[Api.inference], deps[Api.files])
     await impl.initialize()
     return impl

@@ -5,10 +5,15 @@
 # the root directory of this source tree.

 import asyncio
+import base64
+import io
+import mimetypes
 import secrets
 import string
 from typing import Any

+import httpx
+from fastapi import UploadFile
 from pydantic import TypeAdapter

 from llama_stack.apis.common.content_types import (
@@ -17,6 +22,7 @@ from llama_stack.apis.common.content_types import (
     InterleavedContentItem,
     TextContentItem,
 )
+from llama_stack.apis.files import Files, OpenAIFilePurpose
 from llama_stack.apis.inference import Inference
 from llama_stack.apis.tools import (
     ListToolDefsResponse,
@@ -30,13 +36,18 @@ from llama_stack.apis.tools import (
     ToolParameter,
     ToolRuntime,
 )
-from llama_stack.apis.vector_io import QueryChunksResponse, VectorIO
+from llama_stack.apis.vector_io import (
+    QueryChunksResponse,
+    VectorIO,
+    VectorStoreChunkingStrategyStatic,
+    VectorStoreChunkingStrategyStaticConfig,
+)
 from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import ToolGroupsProtocolPrivate
 from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str
 from llama_stack.providers.utils.memory.vector_store import (
     content_from_doc,
-    make_overlapped_chunks,
+    parse_data_url,
 )

 from .config import RagToolRuntimeConfig
@@ -55,10 +66,12 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRuntime):
         config: RagToolRuntimeConfig,
         vector_io_api: VectorIO,
         inference_api: Inference,
+        files_api: Files,
     ):
         self.config = config
         self.vector_io_api = vector_io_api
         self.inference_api = inference_api
+        self.files_api = files_api

     async def initialize(self):
         pass
@@ -78,26 +91,49 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRuntime):
         vector_db_id: str,
         chunk_size_in_tokens: int = 512,
     ) -> None:
-        chunks = []
-        for doc in documents:
-            content = await content_from_doc(doc)
-            # TODO: we should add enrichment here as URLs won't be added to the metadata by default
-            chunks.extend(
-                make_overlapped_chunks(
-                    doc.document_id,
-                    content,
-                    chunk_size_in_tokens,
-                    chunk_size_in_tokens // 4,
-                    doc.metadata,
-                )
-            )
-
-        if not chunks:
+        if not documents:
             return

-        await self.vector_io_api.insert_chunks(
-            chunks=chunks,
-            vector_db_id=vector_db_id,
+        for doc in documents:
+            if isinstance(doc.content, URL):
+                if doc.content.uri.startswith("data:"):
+                    parts = parse_data_url(doc.content.uri)
+                    file_data = base64.b64decode(parts["data"]) if parts["is_base64"] else parts["data"].encode()
+                    mime_type = parts["mimetype"]
+                else:
+                    async with httpx.AsyncClient() as client:
+                        response = await client.get(doc.content.uri)
+                    file_data = response.content
+                    mime_type = doc.mime_type or response.headers.get("content-type", "application/octet-stream")
+            else:
+                content_str = await content_from_doc(doc)
+                file_data = content_str.encode("utf-8")
+                mime_type = doc.mime_type or "text/plain"
+
+            file_extension = mimetypes.guess_extension(mime_type) or ".txt"
+            filename = doc.metadata.get("filename", f"{doc.document_id}{file_extension}")
+
+            file_obj = io.BytesIO(file_data)
+            file_obj.name = filename
+
+            upload_file = UploadFile(file=file_obj, filename=filename)
+
+            created_file = await self.files_api.openai_upload_file(
+                file=upload_file, purpose=OpenAIFilePurpose.ASSISTANTS
+            )
+
+            chunking_strategy = VectorStoreChunkingStrategyStatic(
+                static=VectorStoreChunkingStrategyStaticConfig(
+                    max_chunk_size_tokens=chunk_size_in_tokens,
+                    chunk_overlap_tokens=chunk_size_in_tokens // 4,
+                )
+            )
+
+            await self.vector_io_api.openai_attach_file_to_vector_store(
+                vector_store_id=vector_db_id,
+                file_id=created_file.id,
+                attributes=doc.metadata,
+                chunking_strategy=chunking_strategy,
             )

     async def query(
@@ -131,8 +167,18 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRuntime):
             for vector_db_id in vector_db_ids
         ]
         results: list[QueryChunksResponse] = await asyncio.gather(*tasks)
-        chunks = [c for r in results for c in r.chunks]
-        scores = [s for r in results for s in r.scores]
+
+        chunks = []
+        scores = []
+
+        for vector_db_id, result in zip(vector_db_ids, results, strict=False):
+            for chunk, score in zip(result.chunks, result.scores, strict=False):
+                if not hasattr(chunk, "metadata") or chunk.metadata is None:
+                    chunk.metadata = {}
+                chunk.metadata["vector_db_id"] = vector_db_id
+
+                chunks.append(chunk)
+                scores.append(score)
+
         if not chunks:
             return RAGQueryResult(content=None)
@@ -167,6 +213,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRuntime):
         metadata_keys_to_exclude_from_context = [
             "token_count",
             "metadata_token_count",
+            "vector_db_id",
         ]
         metadata_for_context = {}
         for k in chunk_metadata_keys_to_include_from_context:
@@ -191,6 +238,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRuntime):
                 "document_ids": [c.metadata["document_id"] for c in chunks[: len(picked)]],
                 "chunks": [c.content for c in chunks[: len(picked)]],
                 "scores": scores[: len(picked)],
+                "vector_db_ids": [c.metadata["vector_db_id"] for c in chunks[: len(picked)]],
             },
         )
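After this change the RAG runtime no longer chunks documents itself: each document is uploaded through the Files API and attached to the vector store with a static chunking strategy, and query results carry the originating `vector_db_id` in chunk metadata. A sketch of a document that would take the `data:` URL branch above; the `RAGDocument` import location and field names are assumptions inferred from this hunk, not verified against the full tree:

```python
# Sketch only: a document whose content is a base64 data: URL, which the new
# insert path decodes via parse_data_url() and uploads via the Files API.
import base64

from llama_stack.apis.common.content_types import URL
from llama_stack.apis.tools import RAGDocument  # assumed location of RAGDocument

text = "Llama Stack ships a files-backed RAG ingestion path."
doc = RAGDocument(
    document_id="doc-1",
    content=URL(uri="data:text/plain;base64," + base64.b64encode(text.encode()).decode()),
    mime_type="text/plain",
    metadata={"filename": "notes.txt"},
)
# documents=[doc] would then be passed to the insert method shown in the hunk above,
# together with vector_db_id and chunk_size_in_tokens.
```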
@@ -30,11 +30,11 @@ from llama_stack.providers.utils.kvstore.api import KVStore
 from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin
 from llama_stack.providers.utils.memory.vector_store import (
     RERANKER_TYPE_RRF,
-    RERANKER_TYPE_WEIGHTED,
     ChunkForDeletion,
     EmbeddingIndex,
     VectorDBWithIndex,
 )
+from llama_stack.providers.utils.vector_io.vector_utils import WeightedInMemoryAggregator

 logger = get_logger(name=__name__, category="vector_io")

@@ -66,59 +66,6 @@ def _create_sqlite_connection(db_path):
     return connection


-def _normalize_scores(scores: dict[str, float]) -> dict[str, float]:
-    """Normalize scores to [0,1] range using min-max normalization."""
-    if not scores:
-        return {}
-    min_score = min(scores.values())
-    max_score = max(scores.values())
-    score_range = max_score - min_score
-    if score_range > 0:
-        return {doc_id: (score - min_score) / score_range for doc_id, score in scores.items()}
-    return dict.fromkeys(scores, 1.0)
-
-
-def _weighted_rerank(
-    vector_scores: dict[str, float],
-    keyword_scores: dict[str, float],
-    alpha: float = 0.5,
-) -> dict[str, float]:
-    """ReRanker that uses weighted average of scores."""
-    all_ids = set(vector_scores.keys()) | set(keyword_scores.keys())
-    normalized_vector_scores = _normalize_scores(vector_scores)
-    normalized_keyword_scores = _normalize_scores(keyword_scores)
-
-    return {
-        doc_id: (alpha * normalized_keyword_scores.get(doc_id, 0.0))
-        + ((1 - alpha) * normalized_vector_scores.get(doc_id, 0.0))
-        for doc_id in all_ids
-    }
-
-
-def _rrf_rerank(
-    vector_scores: dict[str, float],
-    keyword_scores: dict[str, float],
-    impact_factor: float = 60.0,
-) -> dict[str, float]:
-    """ReRanker that uses Reciprocal Rank Fusion."""
-    # Convert scores to ranks
-    vector_ranks = {
-        doc_id: i + 1 for i, (doc_id, _) in enumerate(sorted(vector_scores.items(), key=lambda x: x[1], reverse=True))
-    }
-    keyword_ranks = {
-        doc_id: i + 1 for i, (doc_id, _) in enumerate(sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True))
-    }
-
-    all_ids = set(vector_scores.keys()) | set(keyword_scores.keys())
-    rrf_scores = {}
-    for doc_id in all_ids:
-        vector_rank = vector_ranks.get(doc_id, float("inf"))
-        keyword_rank = keyword_ranks.get(doc_id, float("inf"))
-        # RRF formula: score = 1/(k + r) where k is impact_factor and r is the rank
-        rrf_scores[doc_id] = (1.0 / (impact_factor + vector_rank)) + (1.0 / (impact_factor + keyword_rank))
-    return rrf_scores
-
-
 def _make_sql_identifier(name: str) -> str:
     return re.sub(r"[^a-zA-Z0-9_]", "_", name)

@@ -398,14 +345,10 @@ class SQLiteVecIndex(EmbeddingIndex):
             for chunk, score in zip(keyword_response.chunks, keyword_response.scores, strict=False)
         }

-        # Combine scores using the specified reranker
-        if reranker_type == RERANKER_TYPE_WEIGHTED:
-            alpha = reranker_params.get("alpha", 0.5)
-            combined_scores = _weighted_rerank(vector_scores, keyword_scores, alpha)
-        else:
-            # Default to RRF for None, RRF, or any unknown types
-            impact_factor = reranker_params.get("impact_factor", 60.0)
-            combined_scores = _rrf_rerank(vector_scores, keyword_scores, impact_factor)
+        # Combine scores using the reranking utility
+        combined_scores = WeightedInMemoryAggregator.combine_search_results(
+            vector_scores, keyword_scores, reranker_type, reranker_params
+        )

         # Sort by combined score and get top k results
         sorted_items = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
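The provider-local rerankers are removed in favor of the shared `WeightedInMemoryAggregator`, so the math itself does not change. A small worked example of the Reciprocal Rank Fusion formula the deleted `_rrf_rerank()` implemented (standalone sketch, not calling the new utility):

```python
# Sketch only: RRF score for one document, as in the removed helper.
# With impact factor k = 60, a document ranked 1st by vector search and
# 3rd by keyword search combines to 1/(k + 1) + 1/(k + 3).
k = 60.0
vector_rank, keyword_rank = 1, 3
score = 1.0 / (k + vector_rank) + 1.0 / (k + keyword_rank)
print(round(score, 6))  # 1/61 + 1/63 ≈ 0.032266
```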
@@ -13,7 +13,7 @@ def available_providers() -> list[ProviderSpec]:
         InlineProviderSpec(
             api=Api.batches,
             provider_type="inline::reference",
-            pip_packages=["openai"],
+            pip_packages=[],
             module="llama_stack.providers.inline.batches.reference",
             config_class="llama_stack.providers.inline.batches.reference.config.ReferenceBatchesImplConfig",
             api_dependencies=[

@@ -30,7 +30,7 @@ def available_providers() -> list[ProviderSpec]:
             adapter=AdapterSpec(
                 adapter_type="huggingface",
                 pip_packages=[
-                    "datasets",
+                    "datasets>=4.0.0",
                 ],
                 module="llama_stack.providers.remote.datasetio.huggingface",
                 config_class="llama_stack.providers.remote.datasetio.huggingface.HuggingfaceDatasetIOConfig",
@@ -42,7 +42,7 @@ def available_providers() -> list[ProviderSpec]:
             adapter=AdapterSpec(
                 adapter_type="nvidia",
                 pip_packages=[
-                    "datasets",
+                    "datasets>=4.0.0",
                 ],
                 module="llama_stack.providers.remote.datasetio.nvidia",
                 config_class="llama_stack.providers.remote.datasetio.nvidia.NvidiaDatasetIOConfig",

@@ -75,7 +75,7 @@ def available_providers() -> list[ProviderSpec]:
         api=Api.inference,
         adapter=AdapterSpec(
             adapter_type="vllm",
-            pip_packages=["openai"],
+            pip_packages=[],
             module="llama_stack.providers.remote.inference.vllm",
             config_class="llama_stack.providers.remote.inference.vllm.VLLMInferenceAdapterConfig",
             description="Remote vLLM inference provider for connecting to vLLM servers.",
@@ -116,7 +116,7 @@ def available_providers() -> list[ProviderSpec]:
         adapter=AdapterSpec(
             adapter_type="fireworks",
             pip_packages=[
-                "fireworks-ai<=0.18.0",
+                "fireworks-ai<=0.17.16",
             ],
             module="llama_stack.providers.remote.inference.fireworks",
             config_class="llama_stack.providers.remote.inference.fireworks.FireworksImplConfig",
@@ -151,9 +151,7 @@ def available_providers() -> list[ProviderSpec]:
         api=Api.inference,
         adapter=AdapterSpec(
             adapter_type="databricks",
-            pip_packages=[
-                "openai",
-            ],
+            pip_packages=[],
             module="llama_stack.providers.remote.inference.databricks",
             config_class="llama_stack.providers.remote.inference.databricks.DatabricksImplConfig",
             description="Databricks inference provider for running models on Databricks' unified analytics platform.",
@@ -163,9 +161,7 @@ def available_providers() -> list[ProviderSpec]:
         api=Api.inference,
         adapter=AdapterSpec(
             adapter_type="nvidia",
-            pip_packages=[
-                "openai",
-            ],
+            pip_packages=[],
             module="llama_stack.providers.remote.inference.nvidia",
             config_class="llama_stack.providers.remote.inference.nvidia.NVIDIAConfig",
             description="NVIDIA inference provider for accessing NVIDIA NIM models and AI services.",
@@ -175,7 +171,7 @@ def available_providers() -> list[ProviderSpec]:
         api=Api.inference,
         adapter=AdapterSpec(
             adapter_type="runpod",
-            pip_packages=["openai"],
+            pip_packages=[],
             module="llama_stack.providers.remote.inference.runpod",
             config_class="llama_stack.providers.remote.inference.runpod.RunpodImplConfig",
             description="RunPod inference provider for running models on RunPod's cloud GPU platform.",
@@ -292,7 +288,7 @@ Available Models:
         api=Api.inference,
         adapter=AdapterSpec(
             adapter_type="watsonx",
-            pip_packages=["ibm_watson_machine_learning"],
+            pip_packages=["ibm_watsonx_ai"],
             module="llama_stack.providers.remote.inference.watsonx",
             config_class="llama_stack.providers.remote.inference.watsonx.WatsonXConfig",
             provider_data_validator="llama_stack.providers.remote.inference.watsonx.WatsonXProviderDataValidator",

@@ -48,7 +48,7 @@ def available_providers() -> list[ProviderSpec]:
         InlineProviderSpec(
             api=Api.post_training,
             provider_type="inline::huggingface-gpu",
-            pip_packages=["trl", "transformers", "peft", "datasets", "torch"],
+            pip_packages=["trl", "transformers", "peft", "datasets>=4.0.0", "torch"],
             module="llama_stack.providers.inline.post_training.huggingface",
             config_class="llama_stack.providers.inline.post_training.huggingface.HuggingFacePostTrainingConfig",
             api_dependencies=[

@@ -38,7 +38,7 @@ def available_providers() -> list[ProviderSpec]:
         InlineProviderSpec(
             api=Api.scoring,
             provider_type="inline::braintrust",
-            pip_packages=["autoevals", "openai"],
+            pip_packages=["autoevals"],
             module="llama_stack.providers.inline.scoring.braintrust",
             config_class="llama_stack.providers.inline.scoring.braintrust.BraintrustScoringConfig",
             api_dependencies=[

@@ -32,7 +32,7 @@ def available_providers() -> list[ProviderSpec]:
             ],
             module="llama_stack.providers.inline.tool_runtime.rag",
             config_class="llama_stack.providers.inline.tool_runtime.rag.config.RagToolRuntimeConfig",
-            api_dependencies=[Api.vector_io, Api.inference],
+            api_dependencies=[Api.vector_io, Api.inference, Api.files],
            description="RAG (Retrieval-Augmented Generation) tool runtime for document ingestion, chunking, and semantic search.",
         ),
         remote_provider_spec(

@@ -5,12 +5,13 @@
 # the root directory of this source tree.

 from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
+from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin

 from .config import AnthropicConfig
 from .models import MODEL_ENTRIES


-class AnthropicInferenceAdapter(LiteLLMOpenAIMixin):
+class AnthropicInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
     def __init__(self, config: AnthropicConfig) -> None:
         LiteLLMOpenAIMixin.__init__(
             self,
@@ -26,3 +27,8 @@ class AnthropicInferenceAdapter(LiteLLMOpenAIMixin):

     async def shutdown(self) -> None:
         await super().shutdown()
+
+    get_api_key = LiteLLMOpenAIMixin.get_api_key
+
+    def get_base_url(self):
+        return "https://api.anthropic.com/v1"

@@ -5,12 +5,13 @@
 # the root directory of this source tree.

 from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
+from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin

 from .config import GeminiConfig
 from .models import MODEL_ENTRIES


-class GeminiInferenceAdapter(LiteLLMOpenAIMixin):
+class GeminiInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
     def __init__(self, config: GeminiConfig) -> None:
         LiteLLMOpenAIMixin.__init__(
             self,
@@ -21,6 +22,11 @@ class GeminiInferenceAdapter(LiteLLMOpenAIMixin):
         )
         self.config = config

+    get_api_key = LiteLLMOpenAIMixin.get_api_key
+
+    def get_base_url(self):
+        return "https://generativelanguage.googleapis.com/v1beta/openai/"
+
     async def initialize(self) -> None:
         await super().initialize()

@@ -4,30 +4,15 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from collections.abc import AsyncIterator
-from typing import Any
-
-from openai import AsyncOpenAI
-
-from llama_stack.apis.inference import (
-    OpenAIChatCompletion,
-    OpenAIChatCompletionChunk,
-    OpenAIChoiceDelta,
-    OpenAIChunkChoice,
-    OpenAIMessageParam,
-    OpenAIResponseFormatParam,
-    OpenAISystemMessageParam,
-)
 from llama_stack.providers.remote.inference.groq.config import GroqConfig
 from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
-from llama_stack.providers.utils.inference.openai_compat import (
-    prepare_openai_completion_params,
-)
+from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin

 from .models import MODEL_ENTRIES


-class GroqInferenceAdapter(LiteLLMOpenAIMixin):
+class GroqInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
     _config: GroqConfig

     def __init__(self, config: GroqConfig):
@@ -40,122 +25,14 @@ class GroqInferenceAdapter(LiteLLMOpenAIMixin):
         )
         self.config = config

+    # Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin
+    get_api_key = LiteLLMOpenAIMixin.get_api_key
+
+    def get_base_url(self) -> str:
+        return f"{self.config.url}/openai/v1"
+
     async def initialize(self):
         await super().initialize()

     async def shutdown(self):
         await super().shutdown()
-
-    def _get_openai_client(self) -> AsyncOpenAI:
-        return AsyncOpenAI(
-            base_url=f"{self.config.url}/openai/v1",
-            api_key=self.get_api_key(),
-        )
-
-    async def openai_chat_completion(
-        self,
-        model: str,
-        messages: list[OpenAIMessageParam],
-        frequency_penalty: float | None = None,
-        function_call: str | dict[str, Any] | None = None,
-        functions: list[dict[str, Any]] | None = None,
-        logit_bias: dict[str, float] | None = None,
-        logprobs: bool | None = None,
-        max_completion_tokens: int | None = None,
-        max_tokens: int | None = None,
-        n: int | None = None,
-        parallel_tool_calls: bool | None = None,
-        presence_penalty: float | None = None,
-        response_format: OpenAIResponseFormatParam | None = None,
-        seed: int | None = None,
-        stop: str | list[str] | None = None,
-        stream: bool | None = None,
-        stream_options: dict[str, Any] | None = None,
-        temperature: float | None = None,
-        tool_choice: str | dict[str, Any] | None = None,
-        tools: list[dict[str, Any]] | None = None,
-        top_logprobs: int | None = None,
-        top_p: float | None = None,
-        user: str | None = None,
-    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
-        model_obj = await self.model_store.get_model(model)
-
-        # Groq does not support json_schema response format, so we need to convert it to json_object
-        if response_format and response_format.type == "json_schema":
-            response_format.type = "json_object"
-            schema = response_format.json_schema.get("schema", {})
-            response_format.json_schema = None
-            json_instructions = f"\nYour response should be a JSON object that matches the following schema: {schema}"
-            if messages and messages[0].role == "system":
-                messages[0].content = messages[0].content + json_instructions
-            else:
-                messages.insert(0, OpenAISystemMessageParam(content=json_instructions))
-
-        # Groq returns a 400 error if tools are provided but none are called
-        # So, set tool_choice to "required" to attempt to force a call
-        if tools and (not tool_choice or tool_choice == "auto"):
-            tool_choice = "required"
-
-        params = await prepare_openai_completion_params(
-            model=model_obj.provider_resource_id,
-            messages=messages,
-            frequency_penalty=frequency_penalty,
-            function_call=function_call,
-            functions=functions,
-            logit_bias=logit_bias,
-            logprobs=logprobs,
-            max_completion_tokens=max_completion_tokens,
-            max_tokens=max_tokens,
-            n=n,
-            parallel_tool_calls=parallel_tool_calls,
-            presence_penalty=presence_penalty,
-            response_format=response_format,
-            seed=seed,
-            stop=stop,
-            stream=stream,
-            stream_options=stream_options,
-            temperature=temperature,
-            tool_choice=tool_choice,
-            tools=tools,
-            top_logprobs=top_logprobs,
-            top_p=top_p,
-            user=user,
-        )
-
-        # Groq does not support streaming requests that set response_format
-        fake_stream = False
-        if stream and response_format:
-            params["stream"] = False
-            fake_stream = True
-
-        response = await self._get_openai_client().chat.completions.create(**params)
-
-        if fake_stream:
-            chunk_choices = []
-            for choice in response.choices:
-                delta = OpenAIChoiceDelta(
-                    content=choice.message.content,
-                    role=choice.message.role,
-                    tool_calls=choice.message.tool_calls,
-                )
-                chunk_choice = OpenAIChunkChoice(
-                    delta=delta,
-                    finish_reason=choice.finish_reason,
-                    index=choice.index,
-                    logprobs=None,
-                )
-                chunk_choices.append(chunk_choice)
-            chunk = OpenAIChatCompletionChunk(
-                id=response.id,
-                choices=chunk_choices,
-                object="chat.completion.chunk",
-                created=response.created,
-                model=response.model,
-            )
-
-            async def _fake_stream_generator():
-                yield chunk
-
-            return _fake_stream_generator()
-        else:
-            return response

@@ -118,10 +118,10 @@ class OllamaInferenceAdapter(

     async def initialize(self) -> None:
         logger.info(f"checking connectivity to Ollama at `{self.config.url}`...")
-        health_response = await self.health()
-        if health_response["status"] == HealthStatus.ERROR:
+        r = await self.health()
+        if r["status"] == HealthStatus.ERROR:
             logger.warning(
-                "Ollama Server is not running, make sure to start it using `ollama serve` in a separate terminal"
+                f"Ollama Server is not running (message: {r['message']}). Make sure to start it using `ollama serve` in a separate terminal"
             )

     async def should_refresh_models(self) -> bool:
@@ -156,7 +156,7 @@ class OllamaInferenceAdapter(
             ),
             Model(
                 identifier="nomic-embed-text",
-                provider_resource_id="nomic-embed-text",
+                provider_resource_id="nomic-embed-text:latest",
                 provider_id=provider_id,
                 metadata={
                     "embedding_dimension": 768,

@@ -4,13 +4,26 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.


 from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
+from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin

 from .config import SambaNovaImplConfig
 from .models import MODEL_ENTRIES


-class SambaNovaInferenceAdapter(LiteLLMOpenAIMixin):
+class SambaNovaInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
+    """
+    SambaNova Inference Adapter for Llama Stack.
+
+    Note: The inheritance order is important here. OpenAIMixin must come before
+    LiteLLMOpenAIMixin to ensure that OpenAIMixin.check_model_availability()
+    is used instead of LiteLLMOpenAIMixin.check_model_availability().
+
+    - OpenAIMixin.check_model_availability() queries the /v1/models to check if a model exists
+    - LiteLLMOpenAIMixin.check_model_availability() checks the static registry within LiteLLM
+    """
+
     def __init__(self, config: SambaNovaImplConfig):
         self.config = config
         self.environment_available_models = []
@@ -24,3 +37,14 @@ class SambaNovaInferenceAdapter(LiteLLMOpenAIMixin):
             download_images=True,  # SambaNova requires base64 image encoding
             json_schema_strict=False,  # SambaNova doesn't support strict=True yet
         )
+
+    # Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin
+    get_api_key = LiteLLMOpenAIMixin.get_api_key
+
+    def get_base_url(self) -> str:
+        """
+        Get the base URL for OpenAI mixin.
+
+        :return: The SambaNova base URL
+        """
+        return self.config.url
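Several adapters in this commit (Anthropic, Gemini, Groq, SambaNova, and Vertex AI below) now list `OpenAIMixin` ahead of `LiteLLMOpenAIMixin`; as the SambaNova docstring notes, Python's method resolution order makes the first base win when both define the same method. A minimal standalone sketch of that behavior with toy classes (not the real mixins):

```python
# Sketch only: toy classes demonstrating why the base-class order matters.
class OpenAIMixinToy:
    def check_model_availability(self) -> str:
        return "query /v1/models"

class LiteLLMMixinToy:
    def check_model_availability(self) -> str:
        return "check LiteLLM's static registry"

class Adapter(OpenAIMixinToy, LiteLLMMixinToy):  # OpenAIMixinToy listed first
    pass

print(Adapter().check_model_availability())      # -> "query /v1/models"
print([c.__name__ for c in Adapter.__mro__])     # Adapter, OpenAIMixinToy, LiteLLMMixinToy, object
```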
@ -6,16 +6,20 @@
|
||||||
|
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
import google.auth.transport.requests
|
||||||
|
from google.auth import default
|
||||||
|
|
||||||
from llama_stack.apis.inference import ChatCompletionRequest
|
from llama_stack.apis.inference import ChatCompletionRequest
|
||||||
from llama_stack.providers.utils.inference.litellm_openai_mixin import (
|
from llama_stack.providers.utils.inference.litellm_openai_mixin import (
|
||||||
LiteLLMOpenAIMixin,
|
LiteLLMOpenAIMixin,
|
||||||
)
|
)
|
||||||
|
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
|
||||||
|
|
||||||
from .config import VertexAIConfig
|
from .config import VertexAIConfig
|
||||||
from .models import MODEL_ENTRIES
|
from .models import MODEL_ENTRIES
|
||||||
|
|
||||||
|
|
||||||
class VertexAIInferenceAdapter(LiteLLMOpenAIMixin):
|
class VertexAIInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
|
||||||
def __init__(self, config: VertexAIConfig) -> None:
|
def __init__(self, config: VertexAIConfig) -> None:
|
||||||
LiteLLMOpenAIMixin.__init__(
|
LiteLLMOpenAIMixin.__init__(
|
||||||
self,
|
self,
|
||||||
|
@ -27,10 +31,31 @@ class VertexAIInferenceAdapter(LiteLLMOpenAIMixin):
|
||||||
self.config = config
|
self.config = config
|
||||||
|
|
||||||
def get_api_key(self) -> str:
|
def get_api_key(self) -> str:
|
||||||
# Vertex AI doesn't use API keys, it uses Application Default Credentials
|
"""
|
||||||
# Return empty string to let litellm handle authentication via ADC
|
Get an access token for Vertex AI using Application Default Credentials.
|
||||||
|
|
||||||
|
Vertex AI uses ADC instead of API keys. This method obtains an access token
|
||||||
|
from the default credentials and returns it for use with the OpenAI-compatible client.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# Get default credentials - will read from GOOGLE_APPLICATION_CREDENTIALS
|
||||||
|
credentials, _ = default(scopes=["https://www.googleapis.com/auth/cloud-platform"])
|
||||||
|
credentials.refresh(google.auth.transport.requests.Request())
|
||||||
|
return str(credentials.token)
|
||||||
|
except Exception:
|
||||||
|
# If we can't get credentials, return empty string to let LiteLLM handle it
|
||||||
|
# This allows the LiteLLM mixin to work with ADC directly
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
def get_base_url(self) -> str:
|
||||||
|
"""
|
||||||
|
Get the Vertex AI OpenAI-compatible API base URL.
|
||||||
|
|
||||||
|
Returns the Vertex AI OpenAI-compatible endpoint URL.
|
||||||
|
Source: https://cloud.google.com/vertex-ai/generative-ai/docs/start/openai
|
||||||
|
"""
|
||||||
|
return f"https://{self.config.location}-aiplatform.googleapis.com/v1/projects/{self.config.project}/locations/{self.config.location}/endpoints/openapi"
|
||||||
|
|
||||||
async def _get_params(self, request: ChatCompletionRequest) -> dict[str, Any]:
|
async def _get_params(self, request: ChatCompletionRequest) -> dict[str, Any]:
|
||||||
# Get base parameters from parent
|
# Get base parameters from parent
|
||||||
params = await super()._get_params(request)
|
params = await super()._get_params(request)
|
||||||
|
|
|
@ -7,8 +7,8 @@
|
||||||
from collections.abc import AsyncGenerator, AsyncIterator
|
from collections.abc import AsyncGenerator, AsyncIterator
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
from ibm_watson_machine_learning.foundation_models import Model
|
from ibm_watsonx_ai.foundation_models import Model
|
||||||
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
|
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
|
||||||
from openai import AsyncOpenAI
|
from openai import AsyncOpenAI
|
||||||
|
|
||||||
from llama_stack.apis.common.content_types import InterleavedContent, InterleavedContentItem
|
from llama_stack.apis.common.content_types import InterleavedContent, InterleavedContentItem
|
||||||
|
|
|
@ -4,53 +4,55 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import os

from pydantic import BaseModel, Field


class BedrockBaseConfig(BaseModel):
    aws_access_key_id: str | None = Field(
        default=None,
        default_factory=lambda: os.getenv("AWS_ACCESS_KEY_ID"),
        description="The AWS access key to use. Default use environment variable: AWS_ACCESS_KEY_ID",
    )
    aws_secret_access_key: str | None = Field(
        default=None,
        default_factory=lambda: os.getenv("AWS_SECRET_ACCESS_KEY"),
        description="The AWS secret access key to use. Default use environment variable: AWS_SECRET_ACCESS_KEY",
    )
    aws_session_token: str | None = Field(
        default=None,
        default_factory=lambda: os.getenv("AWS_SESSION_TOKEN"),
        description="The AWS session token to use. Default use environment variable: AWS_SESSION_TOKEN",
    )
    region_name: str | None = Field(
        default=None,
        default_factory=lambda: os.getenv("AWS_DEFAULT_REGION"),
        description="The default AWS Region to use, for example, us-west-1 or us-west-2."
        "Default use environment variable: AWS_DEFAULT_REGION",
    )
    profile_name: str | None = Field(
        default=None,
        default_factory=lambda: os.getenv("AWS_PROFILE"),
        description="The profile name that contains credentials to use.Default use environment variable: AWS_PROFILE",
    )
    total_max_attempts: int | None = Field(
        default=None,
        default_factory=lambda: int(val) if (val := os.getenv("AWS_MAX_ATTEMPTS")) else None,
        description="An integer representing the maximum number of attempts that will be made for a single request, "
        "including the initial attempt. Default use environment variable: AWS_MAX_ATTEMPTS",
    )
    retry_mode: str | None = Field(
        default=None,
        default_factory=lambda: os.getenv("AWS_RETRY_MODE"),
        description="A string representing the type of retries Boto3 will perform."
        "Default use environment variable: AWS_RETRY_MODE",
    )
    connect_timeout: float | None = Field(
        default=60,
        default_factory=lambda: float(os.getenv("AWS_CONNECT_TIMEOUT", "60")),
        description="The time in seconds till a timeout exception is thrown when attempting to make a connection. "
        "The default is 60 seconds.",
    )
    read_timeout: float | None = Field(
        default=60,
        default_factory=lambda: float(os.getenv("AWS_READ_TIMEOUT", "60")),
        description="The time in seconds till a timeout exception is thrown when attempting to read from a connection."
        "The default is 60 seconds.",
    )
    session_ttl: int | None = Field(
        default=3600,
        default_factory=lambda: int(os.getenv("AWS_SESSION_TTL", "3600")),
        description="The time in seconds till a session expires. The default is 3600 seconds (1 hour).",
    )
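
As a rough illustration of why these fields move from default=None to default_factory: a default_factory is evaluated each time the model is instantiated, so environment variables set after import are still picked up. A small sketch with a hypothetical model name:

# Sketch: default_factory re-reads the environment at instantiation time.
import os

from pydantic import BaseModel, Field


class ExampleAwsConfig(BaseModel):  # hypothetical, for illustration only
    region_name: str | None = Field(default_factory=lambda: os.getenv("AWS_DEFAULT_REGION"))


os.environ["AWS_DEFAULT_REGION"] = "us-west-2"
print(ExampleAwsConfig().region_name)  # -> us-west-2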
@ -4,6 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import asyncio
import base64
import struct
from typing import TYPE_CHECKING

@ -43,9 +44,11 @@ class SentenceTransformerEmbeddingMixin:
        task_type: EmbeddingTaskType | None = None,
    ) -> EmbeddingsResponse:
        model = await self.model_store.get_model(model_id)
        embedding_model = self._load_sentence_transformer_model(model.provider_resource_id)
        embedding_model = await self._load_sentence_transformer_model(model.provider_resource_id)
        embeddings = embedding_model.encode(
            [interleaved_content_as_str(content) for content in contents], show_progress_bar=False
        embeddings = await asyncio.to_thread(
            embedding_model.encode,
            [interleaved_content_as_str(content) for content in contents],
            show_progress_bar=False,
        )
        return EmbeddingsResponse(embeddings=embeddings)

@ -64,8 +67,8 @@ class SentenceTransformerEmbeddingMixin:

        # Get the model and generate embeddings
        model_obj = await self.model_store.get_model(model)
        embedding_model = self._load_sentence_transformer_model(model_obj.provider_resource_id)
        embedding_model = await self._load_sentence_transformer_model(model_obj.provider_resource_id)
        embeddings = embedding_model.encode(input_list, show_progress_bar=False)
        embeddings = await asyncio.to_thread(embedding_model.encode, input_list, show_progress_bar=False)

        # Convert embeddings to the requested format
        data = []

@ -93,7 +96,7 @@ class SentenceTransformerEmbeddingMixin:
            usage=usage,
        )

    def _load_sentence_transformer_model(self, model: str) -> "SentenceTransformer":
    async def _load_sentence_transformer_model(self, model: str) -> "SentenceTransformer":
        global EMBEDDING_MODELS

        loaded_model = EMBEDDING_MODELS.get(model)

@ -101,8 +104,12 @@ class SentenceTransformerEmbeddingMixin:
            return loaded_model

        log.info(f"Loading sentence transformer for {model}...")

        def _load_model():
            from sentence_transformers import SentenceTransformer

            loaded_model = SentenceTransformer(model)
            return SentenceTransformer(model)

        loaded_model = await asyncio.to_thread(_load_model)
        EMBEDDING_MODELS[model] = loaded_model
        return loaded_model
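
The change above moves blocking model loading and encode calls off the event loop. A minimal standalone sketch of that pattern, with illustrative function names and no sentence-transformers dependency:

# Sketch: run a blocking function in a worker thread so the asyncio loop stays responsive.
import asyncio
import time


def blocking_encode(texts: list[str]) -> list[int]:
    time.sleep(0.1)  # stand-in for a slow model.encode() call
    return [len(t) for t in texts]


async def main() -> None:
    embeddings = await asyncio.to_thread(blocking_encode, ["hello", "world"])
    print(embeddings)


asyncio.run(main())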
@ -3,6 +3,11 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import asyncio
from typing import Any

from sqlalchemy.exc import IntegrityError

from llama_stack.apis.inference import (
    ListOpenAIChatCompletionResponse,
    OpenAIChatCompletion,

@ -10,24 +15,43 @@ from llama_stack.apis.inference import (
    OpenAIMessageParam,
    Order,
)
from llama_stack.core.datatypes import AccessRule
from llama_stack.core.datatypes import AccessRule, InferenceStoreConfig
from llama_stack.core.utils.config_dirs import RUNTIME_BASE_DIR
from llama_stack.log import get_logger

from ..sqlstore.api import ColumnDefinition, ColumnType
from ..sqlstore.authorized_sqlstore import AuthorizedSqlStore
from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, sqlstore_impl
from ..sqlstore.sqlstore import SqlStoreConfig, SqlStoreType, sqlstore_impl

logger = get_logger(name=__name__, category="inference_store")


class InferenceStore:
    def __init__(self, sql_store_config: SqlStoreConfig, policy: list[AccessRule]):
        if not sql_store_config:
            sql_store_config = SqliteSqlStoreConfig(
                db_path=(RUNTIME_BASE_DIR / "sqlstore.db").as_posix(),
            )
        self.sql_store_config = sql_store_config
    def __init__(
        self,
        config: InferenceStoreConfig | SqlStoreConfig,
        policy: list[AccessRule],
    ):
        # Handle backward compatibility
        if not isinstance(config, InferenceStoreConfig):
            # Legacy: SqlStoreConfig passed directly as config
            config = InferenceStoreConfig(
                sql_store_config=config,
            )

        self.config = config
        self.sql_store_config = config.sql_store_config
        self.sql_store = None
        self.policy = policy

        # Disable write queue for SQLite to avoid concurrency issues
        self.enable_write_queue = self.sql_store_config.type != SqlStoreType.sqlite

        # Async write queue and worker control
        self._queue: asyncio.Queue[tuple[OpenAIChatCompletion, list[OpenAIMessageParam]]] | None = None
        self._worker_tasks: list[asyncio.Task[Any]] = []
        self._max_write_queue_size: int = config.max_write_queue_size
        self._num_writers: int = max(1, config.num_writers)

    async def initialize(self):
        """Create the necessary tables if they don't exist."""
        self.sql_store = AuthorizedSqlStore(sqlstore_impl(self.sql_store_config))

@ -42,23 +66,109 @@ class InferenceStore:
            },
        )

        if self.enable_write_queue:
            self._queue = asyncio.Queue(maxsize=self._max_write_queue_size)
            for _ in range(self._num_writers):
                self._worker_tasks.append(asyncio.create_task(self._worker_loop()))
        else:
            logger.info("Write queue disabled for SQLite to avoid concurrency issues")

    async def shutdown(self) -> None:
        if not self._worker_tasks:
            return
        if self._queue is not None:
            await self._queue.join()
        for t in self._worker_tasks:
            if not t.done():
                t.cancel()
        for t in self._worker_tasks:
            try:
                await t
            except asyncio.CancelledError:
                pass
        self._worker_tasks.clear()

    async def flush(self) -> None:
        """Wait for all queued writes to complete. Useful for testing."""
        if self.enable_write_queue and self._queue is not None:
            await self._queue.join()

    async def store_chat_completion(
        self, chat_completion: OpenAIChatCompletion, input_messages: list[OpenAIMessageParam]
    ) -> None:
        if not self.sql_store:
        if self.enable_write_queue:
            if self._queue is None:
                raise ValueError("Inference store is not initialized")
            try:
                self._queue.put_nowait((chat_completion, input_messages))
            except asyncio.QueueFull:
                logger.warning(
                    f"Write queue full; adding chat completion id={getattr(chat_completion, 'id', '<unknown>')}"
                )
                await self._queue.put((chat_completion, input_messages))
        else:
            await self._write_chat_completion(chat_completion, input_messages)

    async def _worker_loop(self) -> None:
        assert self._queue is not None
        while True:
            try:
                item = await self._queue.get()
            except asyncio.CancelledError:
                break
            chat_completion, input_messages = item
            try:
                await self._write_chat_completion(chat_completion, input_messages)
            except Exception as e:  # noqa: BLE001
                logger.error(f"Error writing chat completion: {e}")
            finally:
                self._queue.task_done()

    async def _write_chat_completion(
        self, chat_completion: OpenAIChatCompletion, input_messages: list[OpenAIMessageParam]
    ) -> None:
        if self.sql_store is None:
            raise ValueError("Inference store is not initialized")

        data = chat_completion.model_dump()
        await self.sql_store.insert(
            table="chat_completions",
            data={
        record_data = {
            "id": data["id"],
            "created": data["created"],
            "model": data["model"],
            "choices": data["choices"],
            "input_messages": [message.model_dump() for message in input_messages],
            },
        }

        try:
            await self.sql_store.insert(
                table="chat_completions",
                data=record_data,
            )
        except IntegrityError as e:
            # Duplicate chat completion IDs can be generated during tests especially if they are replaying
            # recorded responses across different tests. No need to warn or error under those circumstances.
            # In the wild, this is not likely to happen at all (no evidence) so we aren't really hiding any problem.

            # Check if it's a unique constraint violation
            error_message = str(e.orig) if e.orig else str(e)
            if self._is_unique_constraint_error(error_message):
                # Update the existing record instead
                await self.sql_store.update(table="chat_completions", data=record_data, where={"id": data["id"]})
            else:
                # Re-raise if it's not a unique constraint error
                raise

    def _is_unique_constraint_error(self, error_message: str) -> bool:
        """Check if the error is specifically a unique constraint violation."""
        error_lower = error_message.lower()
        return any(
            indicator in error_lower
            for indicator in [
                "unique constraint failed",  # SQLite
                "duplicate key",  # PostgreSQL
                "unique violation",  # PostgreSQL alternative
                "duplicate entry",  # MySQL
            ]
        )

    async def list_chat_completions(
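
A condensed sketch of the bounded write-queue pattern introduced above, with generic names rather than the InferenceStore API: producers enqueue without blocking when possible, background worker tasks drain the queue, and shutdown joins the queue before cancelling the workers.

# Sketch: bounded asyncio queue with background writer tasks.
import asyncio


async def writer(queue: asyncio.Queue) -> None:
    while True:
        item = await queue.get()
        try:
            print(f"persisting {item}")  # stand-in for the actual DB insert
        finally:
            queue.task_done()


async def main() -> None:
    queue: asyncio.Queue[int] = asyncio.Queue(maxsize=100)
    workers = [asyncio.create_task(writer(queue)) for _ in range(2)]

    for i in range(5):
        try:
            queue.put_nowait(i)  # fast, non-blocking path
        except asyncio.QueueFull:
            await queue.put(i)   # back-pressure path when the queue is full

    await queue.join()           # wait for all pending writes
    for w in workers:
        w.cancel()


asyncio.run(main())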
@ -172,6 +172,20 @@ class AuthorizedSqlStore:

        return results.data[0] if results.data else None

    async def update(self, table: str, data: Mapping[str, Any], where: Mapping[str, Any]) -> None:
        """Update rows with automatic access control attribute capture."""
        enhanced_data = dict(data)

        current_user = get_authenticated_user()
        if current_user:
            enhanced_data["owner_principal"] = current_user.principal
            enhanced_data["access_attributes"] = current_user.attributes
        else:
            enhanced_data["owner_principal"] = None
            enhanced_data["access_attributes"] = None

        await self.sql_store.update(table, enhanced_data, where)

    async def delete(self, table: str, where: Mapping[str, Any]) -> None:
        """Delete rows with automatic access control filtering."""
        await self.sql_store.delete(table, where)
@ -18,6 +18,7 @@ from functools import wraps
from typing import Any

from llama_stack.apis.telemetry import (
    Event,
    LogSeverity,
    Span,
    SpanEndPayload,

@ -98,7 +99,7 @@ class BackgroundLogger:
    def __init__(self, api: Telemetry, capacity: int = 100000):
        self.api = api
        self.log_queue: queue.Queue[Any] = queue.Queue(maxsize=capacity)
        self.worker_thread = threading.Thread(target=self._process_logs, daemon=True)
        self.worker_thread = threading.Thread(target=self._worker, daemon=True)
        self.worker_thread.start()
        self._last_queue_full_log_time: float = 0.0
        self._dropped_since_last_notice: int = 0

@ -118,12 +119,16 @@ class BackgroundLogger:
            self._last_queue_full_log_time = current_time
            self._dropped_since_last_notice = 0

    def _process_logs(self):
    def _worker(self):
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(self._process_logs())

    async def _process_logs(self):
        while True:
            try:
                event = self.log_queue.get()
                # figure out how to use a thread's native loop
                asyncio.run(self.api.log_event(event))
                await self.api.log_event(event)
            except Exception:
                import traceback

@ -136,6 +141,19 @@ class BackgroundLogger:
        self.log_queue.join()


def enqueue_event(event: Event) -> None:
    """Enqueue a telemetry event to the background logger if available.

    This provides a non-blocking path for routers and other hot paths to
    submit telemetry without awaiting the Telemetry API, reducing contention
    with the main event loop.
    """
    global BACKGROUND_LOGGER
    if BACKGROUND_LOGGER is None:
        raise RuntimeError("Telemetry API not initialized")
    BACKGROUND_LOGGER.log_event(event)


class TraceContext:
    spans: list[Span] = []

@ -256,11 +274,7 @@ class TelemetryHandler(logging.Handler):
        if record.module in ("asyncio", "selector_events"):
            return

        global CURRENT_TRACE_CONTEXT, BACKGROUND_LOGGER
        global CURRENT_TRACE_CONTEXT

        if BACKGROUND_LOGGER is None:
            raise RuntimeError("Telemetry API not initialized")

        context = CURRENT_TRACE_CONTEXT.get()
        if context is None:
            return

@ -269,7 +283,7 @@ class TelemetryHandler(logging.Handler):
        if span is None:
            return

        BACKGROUND_LOGGER.log_event(
        enqueue_event(
            UnstructuredLogEvent(
                trace_id=span.trace_id,
                span_id=span.span_id,
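
A small sketch of the worker-thread pattern BackgroundLogger uses above: the daemon thread owns its own event loop and drains a thread-safe queue, so callers never await on the hot path. Names here are illustrative.

# Sketch: a daemon thread running its own asyncio loop to process queued events.
import asyncio
import queue
import threading


async def handle(event: str) -> None:
    print("logged:", event)


def worker(q: "queue.Queue[str]") -> None:
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    async def process() -> None:
        while True:
            event = q.get()
            if event == "__stop__":
                q.task_done()
                return
            await handle(event)
            q.task_done()

    loop.run_until_complete(process())


q: "queue.Queue[str]" = queue.Queue(maxsize=1000)
threading.Thread(target=worker, args=(q,), daemon=True).start()
q.put("hello")
q.put("__stop__")
q.join()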
@ -67,6 +67,38 @@ async def client_wrapper(endpoint: str, headers: dict[str, str]) -> AsyncGenerat
                    raise AuthenticationRequiredError(exc) from exc
                if i == len(connection_strategies) - 1:
                    raise
            except* httpx.ConnectError as eg:
                # Connection refused, server down, network unreachable
                if i == len(connection_strategies) - 1:
                    error_msg = f"Failed to connect to MCP server at {endpoint}: Connection refused"
                    logger.error(f"MCP connection error: {error_msg}")
                    raise ConnectionError(error_msg) from eg
                else:
                    logger.warning(
                        f"failed to connect to MCP server at {endpoint} via {strategy.name}, falling back to {connection_strategies[i + 1].name}"
                    )
            except* httpx.TimeoutException as eg:
                # Request timeout, server too slow
                if i == len(connection_strategies) - 1:
                    error_msg = f"MCP server at {endpoint} timed out"
                    logger.error(f"MCP timeout error: {error_msg}")
                    raise TimeoutError(error_msg) from eg
                else:
                    logger.warning(
                        f"MCP server at {endpoint} timed out via {strategy.name}, falling back to {connection_strategies[i + 1].name}"
                    )
            except* httpx.RequestError as eg:
                # DNS resolution failures, network errors, invalid URLs
                if i == len(connection_strategies) - 1:
                    # Get the first exception's message for the error string
                    exc_msg = str(eg.exceptions[0]) if eg.exceptions else "Unknown error"
                    error_msg = f"Network error connecting to MCP server at {endpoint}: {exc_msg}"
                    logger.error(f"MCP network error: {error_msg}")
                    raise ConnectionError(error_msg) from eg
                else:
                    logger.warning(
                        f"network error connecting to MCP server at {endpoint} via {strategy.name}, falling back to {connection_strategies[i + 1].name}"
                    )
            except* McpError:
                if i < len(connection_strategies) - 1:
                    logger.warning(
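
The handlers above use except* because failures from the MCP client surface as ExceptionGroups. A toy example of that syntax (Python 3.11+, no httpx required; the function and messages are made up for illustration):

# Sketch: except* matches exceptions contained in an ExceptionGroup (Python 3.11+).
def connect() -> None:
    raise ExceptionGroup("connection attempt failed", [ConnectionError("refused"), TimeoutError("slow")])


try:
    connect()
except* ConnectionError as eg:
    print("connection errors:", [str(e) for e in eg.exceptions])
except* TimeoutError as eg:
    print("timeouts:", [str(e) for e in eg.exceptions])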
@ -30,6 +30,9 @@ from openai.types.completion_choice import CompletionChoice
CompletionChoice.model_fields["finish_reason"].annotation = Literal["stop", "length", "content_filter"] | None
CompletionChoice.model_rebuild()

REPO_ROOT = Path(__file__).parent.parent.parent
DEFAULT_STORAGE_DIR = REPO_ROOT / "tests/integration/recordings"


class InferenceMode(StrEnum):
    LIVE = "live"

@ -51,7 +54,7 @@ def normalize_request(method: str, url: str, headers: dict[str, Any], body: dict


def get_inference_mode() -> InferenceMode:
    return InferenceMode(os.environ.get("LLAMA_STACK_TEST_INFERENCE_MODE", "live").lower())
    return InferenceMode(os.environ.get("LLAMA_STACK_TEST_INFERENCE_MODE", "replay").lower())


def setup_inference_recording():

@ -60,28 +63,18 @@ def setup_inference_recording():
    to increase their reliability and reduce reliance on expensive, external services.

    Currently, this is only supported for OpenAI and Ollama clients. These should cover the vast majority of use cases.
    Calls to the /models endpoint are not currently trapped. We probably need to add support for this.

    Two environment variables are required:
    - LLAMA_STACK_TEST_INFERENCE_MODE: The mode to run in. Must be 'live', 'record', or 'replay'.
    - LLAMA_STACK_TEST_RECORDING_DIR: The directory to store the recordings in.
    Two environment variables are supported:
    - LLAMA_STACK_TEST_INFERENCE_MODE: The mode to run in. Must be 'live', 'record', or 'replay'. Default is 'replay'.
    - LLAMA_STACK_TEST_RECORDING_DIR: The directory to store the recordings in. Default is 'tests/integration/recordings'.

    The recordings are stored in a SQLite database and a JSON file for each request. The SQLite database is used to
    quickly find the correct recording for a given request. The JSON files are used to store the request and response
    bodies.
    The recordings are stored as JSON files.
    """
    mode = get_inference_mode()

    if mode not in InferenceMode:
        raise ValueError(f"Invalid LLAMA_STACK_TEST_INFERENCE_MODE: {mode}. Must be 'live', 'record', or 'replay'")

    if mode == InferenceMode.LIVE:
        return None

    if "LLAMA_STACK_TEST_RECORDING_DIR" not in os.environ:
        raise ValueError("LLAMA_STACK_TEST_RECORDING_DIR must be set for recording or replaying")
    storage_dir = os.environ["LLAMA_STACK_TEST_RECORDING_DIR"]
    storage_dir = os.environ.get("LLAMA_STACK_TEST_RECORDING_DIR", DEFAULT_STORAGE_DIR)

    return inference_recording(mode=mode, storage_dir=storage_dir)


@ -134,8 +127,8 @@ class ResponseStorage:
    def store_recording(self, request_hash: str, request: dict[str, Any], response: dict[str, Any]):
        """Store a request/response pair."""
        # Generate unique response filename
        response_file = f"{request_hash[:12]}.json"
        response_path = self.responses_dir / response_file
        short_hash = request_hash[:12]
        response_file = f"{short_hash}.json"

        # Serialize response body if needed
        serialized_response = dict(response)

@ -147,6 +140,14 @@ class ResponseStorage:
            # Handle single response
            serialized_response["body"] = _serialize_response(serialized_response["body"])

        # If this is an Ollama /api/tags recording, include models digest in filename to distinguish variants
        endpoint = request.get("endpoint")
        if endpoint in ("/api/tags", "/v1/models"):
            digest = _model_identifiers_digest(endpoint, response)
            response_file = f"models-{short_hash}-{digest}.json"

        response_path = self.responses_dir / response_file

        # Save response to JSON file
        with open(response_path, "w") as f:
            json.dump({"request": request, "response": serialized_response}, f, indent=2)

@ -161,6 +162,17 @@ class ResponseStorage:
        if not response_path.exists():
            return None

        return _recording_from_file(response_path)

    def _model_list_responses(self, short_hash: str) -> list[dict[str, Any]]:
        results: list[dict[str, Any]] = []
        for path in self.responses_dir.glob(f"models-{short_hash}-*.json"):
            data = _recording_from_file(path)
            results.append(data)
        return results


def _recording_from_file(response_path) -> dict[str, Any]:
    with open(response_path) as f:
        data = json.load(f)

@ -176,6 +188,61 @@ class ResponseStorage:
    return cast(dict[str, Any], data)


def _model_identifiers_digest(endpoint: str, response: dict[str, Any]) -> str:
    def _extract_model_identifiers():
        """Extract a stable set of identifiers for model-list endpoints.

        Supported endpoints:
        - '/api/tags' (Ollama): response body has 'models': [ { name/model/digest/id/... }, ... ]
        - '/v1/models' (OpenAI): response body has 'data': [ { id: ... }, ... ]
        Returns a list of unique identifiers or None if structure doesn't match.
        """
        body = response["body"]
        if endpoint == "/api/tags":
            items = body.get("models")
            idents = [m.model for m in items]
        else:
            items = body.get("data")
            idents = [m.id for m in items]
        return sorted(set(idents))

    identifiers = _extract_model_identifiers()
    return hashlib.sha1(("|".join(identifiers)).encode("utf-8")).hexdigest()[:8]


def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]]) -> dict[str, Any] | None:
    """Return a single, unioned recording for supported model-list endpoints."""
    seen: dict[str, dict[str, Any]] = {}
    for rec in records:
        body = rec["response"]["body"]
        if endpoint == "/api/tags":
            items = body.models
        elif endpoint == "/v1/models":
            items = body.data
        else:
            items = []

        for m in items:
            if endpoint == "/v1/models":
                key = m.id
            else:
                key = m.model
            seen[key] = m

    ordered = [seen[k] for k in sorted(seen.keys())]
    canonical = records[0]
    canonical_req = canonical.get("request", {})
    if isinstance(canonical_req, dict):
        canonical_req["endpoint"] = endpoint
    if endpoint == "/v1/models":
        body = {"data": ordered, "object": "list"}
    else:
        from ollama import ListResponse

        body = ListResponse(models=ordered)
    return {"request": canonical_req, "response": {"body": body, "is_streaming": False}}


async def _patched_inference_method(original_method, self, client_type, endpoint, *args, **kwargs):
    global _current_mode, _current_storage

@ -195,8 +262,6 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
        raise ValueError(f"Unknown client type: {client_type}")

    url = base_url.rstrip("/") + endpoint

    # Normalize request for matching
    method = "POST"
    headers = {}
    body = kwargs

@ -204,6 +269,11 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
    request_hash = normalize_request(method, url, headers, body)

    if _current_mode == InferenceMode.REPLAY:
        # Special handling for model-list endpoints: return union of all responses
        if endpoint in ("/api/tags", "/v1/models"):
            records = _current_storage._model_list_responses(request_hash[:12])
            recording = _combine_model_list_responses(endpoint, records)
        else:
            recording = _current_storage.find_recording(request_hash)
        if recording:
            response_body = recording["response"]["body"]

@ -222,7 +292,7 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
                f"No recorded response found for request hash: {request_hash}\n"
                f"Request: {method} {url} {body}\n"
                f"Model: {body.get('model', 'unknown')}\n"
                f"To record this response, run with LLAMA_STACK_INFERENCE_MODE=record"
                f"To record this response, run with LLAMA_STACK_TEST_INFERENCE_MODE=record"
            )

    elif _current_mode == InferenceMode.RECORD:

@ -274,12 +344,14 @@ def patch_inference_clients():
    from openai.resources.chat.completions import AsyncCompletions as AsyncChatCompletions
    from openai.resources.completions import AsyncCompletions
    from openai.resources.embeddings import AsyncEmbeddings
    from openai.resources.models import AsyncModels

    # Store original methods for both OpenAI and Ollama clients
    _original_methods = {
        "chat_completions_create": AsyncChatCompletions.create,
        "completions_create": AsyncCompletions.create,
        "embeddings_create": AsyncEmbeddings.create,
        "models_list": AsyncModels.list,
        "ollama_generate": OllamaAsyncClient.generate,
        "ollama_chat": OllamaAsyncClient.chat,
        "ollama_embed": OllamaAsyncClient.embed,

@ -304,10 +376,16 @@ def patch_inference_clients():
            _original_methods["embeddings_create"], self, "openai", "/v1/embeddings", *args, **kwargs
        )

    async def patched_models_list(self, *args, **kwargs):
        return await _patched_inference_method(
            _original_methods["models_list"], self, "openai", "/v1/models", *args, **kwargs
        )

    # Apply OpenAI patches
    AsyncChatCompletions.create = patched_chat_completions_create
    AsyncCompletions.create = patched_completions_create
    AsyncEmbeddings.create = patched_embeddings_create
    AsyncModels.list = patched_models_list

    # Create patched methods for Ollama client
    async def patched_ollama_generate(self, *args, **kwargs):

@ -361,11 +439,13 @@ def unpatch_inference_clients():
    from openai.resources.chat.completions import AsyncCompletions as AsyncChatCompletions
    from openai.resources.completions import AsyncCompletions
    from openai.resources.embeddings import AsyncEmbeddings
    from openai.resources.models import AsyncModels

    # Restore OpenAI client methods
    AsyncChatCompletions.create = _original_methods["chat_completions_create"]
    AsyncCompletions.create = _original_methods["completions_create"]
    AsyncEmbeddings.create = _original_methods["embeddings_create"]
    AsyncModels.list = _original_methods["models_list"]

    # Restore Ollama client methods if they were patched
    OllamaAsyncClient.generate = _original_methods["ollama_generate"]

@ -379,16 +459,10 @@ def unpatch_inference_clients():


@contextmanager
def inference_recording(mode: str = "live", storage_dir: str | Path | None = None) -> Generator[None, None, None]:
def inference_recording(mode: str, storage_dir: str | Path | None = None) -> Generator[None, None, None]:
    """Context manager for inference recording/replaying."""
    global _current_mode, _current_storage

    # Set defaults
    if storage_dir is None:
        storage_dir_path = Path.home() / ".llama" / "recordings"
    else:
        storage_dir_path = Path(storage_dir)

    # Store previous state
    prev_mode = _current_mode
    prev_storage = _current_storage

@ -397,7 +471,9 @@ def inference_recording(mode: str = "live", storage_dir: str | Path | None = Non
    _current_mode = mode

    if mode in ["record", "replay"]:
        _current_storage = ResponseStorage(storage_dir_path)
        if storage_dir is None:
            raise ValueError("storage_dir is required for record and replay modes")
        _current_storage = ResponseStorage(Path(storage_dir))
        patch_inference_clients()

    yield
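
For reference, the digest embedded in the model-list recording filenames above is just a short hash of the sorted, de-duplicated model identifiers; a standalone sketch with example identifiers:

# Sketch: derive a short, order-independent digest from a list of model IDs.
import hashlib


def model_digest(identifiers: list[str]) -> str:
    return hashlib.sha1("|".join(sorted(set(identifiers))).encode("utf-8")).hexdigest()[:8]


print(model_digest(["llama3.2:3b", "all-minilm:l6-v2"]))  # same digest regardless of input order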
509
llama_stack/ui/package-lock.json
generated
@ -10,7 +10,7 @@
    "dependencies": {
      "@radix-ui/react-collapsible": "^1.1.12",
      "@radix-ui/react-dialog": "^1.1.13",
      "@radix-ui/react-dropdown-menu": "^2.1.14",
      "@radix-ui/react-dropdown-menu": "^2.1.16",
      "@radix-ui/react-select": "^2.2.5",
      "@radix-ui/react-separator": "^1.1.7",
      "@radix-ui/react-slot": "^1.2.3",
@ -18,18 +18,18 @@
      "class-variance-authority": "^0.7.1",
      "clsx": "^2.1.1",
      "framer-motion": "^12.23.12",
      "llama-stack-client": "^0.2.20",
      "llama-stack-client": "^0.2.21",
      "lucide-react": "^0.510.0",
      "lucide-react": "^0.542.0",
      "next": "15.3.3",
      "next-auth": "^4.24.11",
      "next-themes": "^0.4.6",
      "react": "^19.0.0",
      "react-dom": "^19.0.0",
      "react-dom": "^19.1.1",
      "react-markdown": "^10.1.0",
      "remark-gfm": "^4.0.1",
      "remeda": "^2.30.0",
      "shiki": "^1.29.2",
      "sonner": "^2.0.6",
      "sonner": "^2.0.7",
      "tailwind-merge": "^3.3.1"
    },
    "devDependencies": {
@ -2066,12 +2066,35 @@
      "license": "MIT"
    },
    "node_modules/@radix-ui/react-arrow": {
      "version": "1.1.6",
      "version": "1.1.7",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-arrow/-/react-arrow-1.1.6.tgz",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-arrow/-/react-arrow-1.1.7.tgz",
      "integrity": "sha512-2JMfHJf/eVnwq+2dewT3C0acmCWD3XiVA1Da+jTDqo342UlU13WvXtqHhG+yJw5JeQmu4ue2eMy6gcEArLBlcw==",
      "integrity": "sha512-F+M1tLhO+mlQaOWspE8Wstg+z6PwxwRd8oQ8IXceWz92kfAmalTRf0EjrouQeo7QssEPfCn05B4Ihs1K9WQ/7w==",
      "license": "MIT",
      "dependencies": {
        "@radix-ui/react-primitive": "2.1.2"
        "@radix-ui/react-primitive": "2.1.3"
      },
      "peerDependencies": {
        "@types/react": "*",
        "@types/react-dom": "*",
        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
        "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
      },
      "peerDependenciesMeta": {
        "@types/react": {
          "optional": true
        },
        "@types/react-dom": {
          "optional": true
        }
      }
    },
    "node_modules/@radix-ui/react-arrow/node_modules/@radix-ui/react-primitive": {
      "version": "2.1.3",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-2.1.3.tgz",
      "integrity": "sha512-m9gTwRkhy2lvCPe6QJp4d3G1TYEUHn/FzJUtq9MjH46an1wJU+GdoGC5VLof8RX8Ft/DlpshApkhswDLZzHIcQ==",
      "license": "MIT",
      "dependencies": {
        "@radix-ui/react-slot": "1.2.3"
      },
      "peerDependencies": {
        "@types/react": "*",
@ -2172,15 +2195,15 @@
      }
    },
    "node_modules/@radix-ui/react-collection": {
      "version": "1.1.6",
      "version": "1.1.7",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-collection/-/react-collection-1.1.6.tgz",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-collection/-/react-collection-1.1.7.tgz",
      "integrity": "sha512-PbhRFK4lIEw9ADonj48tiYWzkllz81TM7KVYyyMMw2cwHO7D5h4XKEblL8NlaRisTK3QTe6tBEhDccFUryxHBQ==",
      "integrity": "sha512-Fh9rGN0MoI4ZFUNyfFVNU4y9LUz93u9/0K+yLgA2bwRojxM8JU1DyvvMBabnZPBgMWREAJvU2jjVzq+LrFUglw==",
      "license": "MIT",
      "dependencies": {
        "@radix-ui/react-compose-refs": "1.1.2",
        "@radix-ui/react-context": "1.1.2",
        "@radix-ui/react-primitive": "2.1.2",
        "@radix-ui/react-primitive": "2.1.3",
        "@radix-ui/react-slot": "1.2.2"
        "@radix-ui/react-slot": "1.2.3"
      },
      "peerDependencies": {
        "@types/react": "*",
@ -2197,21 +2220,26 @@
      }
    },
    "node_modules/@radix-ui/react-collection/node_modules/@radix-ui/react-slot": {
    "node_modules/@radix-ui/react-collection/node_modules/@radix-ui/react-primitive": {
      "version": "1.2.2",
      "version": "2.1.3",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-slot/-/react-slot-1.2.2.tgz",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-2.1.3.tgz",
      "integrity": "sha512-y7TBO4xN4Y94FvcWIOIh18fM4R1A8S4q1jhoz4PNzOoHsFcN8pogcFmZrTYAm4F9VRUrWP/Mw7xSKybIeRI+CQ==",
      "integrity": "sha512-m9gTwRkhy2lvCPe6QJp4d3G1TYEUHn/FzJUtq9MjH46an1wJU+GdoGC5VLof8RX8Ft/DlpshApkhswDLZzHIcQ==",
      "license": "MIT",
      "dependencies": {
        "@radix-ui/react-compose-refs": "1.1.2"
        "@radix-ui/react-slot": "1.2.3"
      },
      "peerDependencies": {
        "@types/react": "*",
        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
        "@types/react-dom": "*",
        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
        "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
      },
      "peerDependenciesMeta": {
        "@types/react": {
          "optional": true
        },
        "@types/react-dom": {
          "optional": true
        }
      }
    },
@ -2342,17 +2370,17 @@
      }
    },
    "node_modules/@radix-ui/react-dropdown-menu": {
      "version": "2.1.14",
      "version": "2.1.16",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-dropdown-menu/-/react-dropdown-menu-2.1.14.tgz",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-dropdown-menu/-/react-dropdown-menu-2.1.16.tgz",
      "integrity": "sha512-lzuyNjoWOoaMFE/VC5FnAAYM16JmQA8ZmucOXtlhm2kKR5TSU95YLAueQ4JYuRmUJmBvSqXaVFGIfuukybwZJQ==",
      "integrity": "sha512-1PLGQEynI/3OX/ftV54COn+3Sud/Mn8vALg2rWnBLnRaGtJDduNW/22XjlGgPdpcIbiQxjKtb7BkcjP00nqfJw==",
      "license": "MIT",
      "dependencies": {
        "@radix-ui/primitive": "1.1.2",
        "@radix-ui/primitive": "1.1.3",
        "@radix-ui/react-compose-refs": "1.1.2",
        "@radix-ui/react-context": "1.1.2",
        "@radix-ui/react-id": "1.1.1",
        "@radix-ui/react-menu": "2.1.14",
        "@radix-ui/react-menu": "2.1.16",
        "@radix-ui/react-primitive": "2.1.2",
        "@radix-ui/react-primitive": "2.1.3",
        "@radix-ui/react-use-controllable-state": "1.2.2"
      },
      "peerDependencies": {
@ -2370,6 +2398,35 @@
      }
    },
    "node_modules/@radix-ui/react-dropdown-menu/node_modules/@radix-ui/primitive": {
      "version": "1.1.3",
      "resolved": "https://registry.npmjs.org/@radix-ui/primitive/-/primitive-1.1.3.tgz",
      "integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==",
      "license": "MIT"
    },
    "node_modules/@radix-ui/react-dropdown-menu/node_modules/@radix-ui/react-primitive": {
      "version": "2.1.3",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-2.1.3.tgz",
      "integrity": "sha512-m9gTwRkhy2lvCPe6QJp4d3G1TYEUHn/FzJUtq9MjH46an1wJU+GdoGC5VLof8RX8Ft/DlpshApkhswDLZzHIcQ==",
      "license": "MIT",
      "dependencies": {
        "@radix-ui/react-slot": "1.2.3"
      },
      "peerDependencies": {
        "@types/react": "*",
        "@types/react-dom": "*",
        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
        "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
      },
      "peerDependenciesMeta": {
        "@types/react": {
          "optional": true
        },
        "@types/react-dom": {
          "optional": true
        }
      }
    },
    "node_modules/@radix-ui/react-focus-guards": {
      "version": "1.1.2",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-focus-guards/-/react-focus-guards-1.1.2.tgz",
@ -2429,26 +2486,26 @@
      }
    },
    "node_modules/@radix-ui/react-menu": {
      "version": "2.1.14",
      "version": "2.1.16",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-menu/-/react-menu-2.1.14.tgz",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-menu/-/react-menu-2.1.16.tgz",
      "integrity": "sha512-0zSiBAIFq9GSKoSH5PdEaQeRB3RnEGxC+H2P0egtnKoKKLNBH8VBHyVO6/jskhjAezhOIplyRUj7U2lds9A+Yg==",
      "integrity": "sha512-72F2T+PLlphrqLcAotYPp0uJMr5SjP5SL01wfEspJbru5Zs5vQaSHb4VB3ZMJPimgHHCHG7gMOeOB9H3Hdmtxg==",
      "license": "MIT",
      "dependencies": {
        "@radix-ui/primitive": "1.1.2",
        "@radix-ui/primitive": "1.1.3",
        "@radix-ui/react-collection": "1.1.6",
        "@radix-ui/react-collection": "1.1.7",
        "@radix-ui/react-compose-refs": "1.1.2",
        "@radix-ui/react-context": "1.1.2",
        "@radix-ui/react-direction": "1.1.1",
        "@radix-ui/react-dismissable-layer": "1.1.9",
        "@radix-ui/react-dismissable-layer": "1.1.11",
        "@radix-ui/react-focus-guards": "1.1.2",
        "@radix-ui/react-focus-guards": "1.1.3",
        "@radix-ui/react-focus-scope": "1.1.6",
        "@radix-ui/react-focus-scope": "1.1.7",
        "@radix-ui/react-id": "1.1.1",
        "@radix-ui/react-popper": "1.2.6",
        "@radix-ui/react-popper": "1.2.8",
        "@radix-ui/react-portal": "1.1.8",
        "@radix-ui/react-portal": "1.1.9",
        "@radix-ui/react-presence": "1.1.4",
        "@radix-ui/react-presence": "1.1.5",
        "@radix-ui/react-primitive": "2.1.2",
        "@radix-ui/react-primitive": "2.1.3",
        "@radix-ui/react-roving-focus": "1.1.9",
        "@radix-ui/react-roving-focus": "1.1.11",
        "@radix-ui/react-slot": "1.2.2",
        "@radix-ui/react-slot": "1.2.3",
        "@radix-ui/react-use-callback-ref": "1.1.1",
        "aria-hidden": "^1.2.4",
        "react-remove-scroll": "^2.6.3"
@ -2468,14 +2525,44 @@
      }
    },
    "node_modules/@radix-ui/react-menu/node_modules/@radix-ui/react-slot": {
    "node_modules/@radix-ui/react-menu/node_modules/@radix-ui/primitive": {
      "version": "1.2.2",
      "version": "1.1.3",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-slot/-/react-slot-1.2.2.tgz",
      "resolved": "https://registry.npmjs.org/@radix-ui/primitive/-/primitive-1.1.3.tgz",
      "integrity": "sha512-y7TBO4xN4Y94FvcWIOIh18fM4R1A8S4q1jhoz4PNzOoHsFcN8pogcFmZrTYAm4F9VRUrWP/Mw7xSKybIeRI+CQ==",
      "integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==",
      "license": "MIT"
    },
    "node_modules/@radix-ui/react-menu/node_modules/@radix-ui/react-dismissable-layer": {
      "version": "1.1.11",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.11.tgz",
      "integrity": "sha512-Nqcp+t5cTB8BinFkZgXiMJniQH0PsUt2k51FUhbdfeKvc4ACcG2uQniY/8+h1Yv6Kza4Q7lD7PQV0z0oicE0Mg==",
      "license": "MIT",
      "dependencies": {
        "@radix-ui/react-compose-refs": "1.1.2"
        "@radix-ui/primitive": "1.1.3",
        "@radix-ui/react-compose-refs": "1.1.2",
        "@radix-ui/react-primitive": "2.1.3",
        "@radix-ui/react-use-callback-ref": "1.1.1",
        "@radix-ui/react-use-escape-keydown": "1.1.1"
      },
      "peerDependencies": {
        "@types/react": "*",
        "@types/react-dom": "*",
        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
        "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
      },
      "peerDependenciesMeta": {
        "@types/react": {
          "optional": true
        },
        "@types/react-dom": {
          "optional": true
        }
      }
    },
    "node_modules/@radix-ui/react-menu/node_modules/@radix-ui/react-focus-guards": {
      "version": "1.1.3",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-focus-guards/-/react-focus-guards-1.1.3.tgz",
      "integrity": "sha512-0rFg/Rj2Q62NCm62jZw0QX7a3sz6QCQU0LpZdNrJX8byRGaGVTqbrW9jAoIAHyMQqsNpeZ81YgSizOt5WXq0Pw==",
      "license": "MIT",
      "peerDependencies": {
        "@types/react": "*",
        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
@ -2486,17 +2573,113 @@
      }
    },
    "node_modules/@radix-ui/react-menu/node_modules/@radix-ui/react-focus-scope": {
      "version": "1.1.7",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-focus-scope/-/react-focus-scope-1.1.7.tgz",
      "integrity": "sha512-t2ODlkXBQyn7jkl6TNaw/MtVEVvIGelJDCG41Okq/KwUsJBwQ4XVZsHAVUkK4mBv3ewiAS3PGuUWuY2BoK4ZUw==",
      "license": "MIT",
      "dependencies": {
        "@radix-ui/react-compose-refs": "1.1.2",
        "@radix-ui/react-primitive": "2.1.3",
        "@radix-ui/react-use-callback-ref": "1.1.1"
      },
      "peerDependencies": {
        "@types/react": "*",
        "@types/react-dom": "*",
        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
        "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
      },
      "peerDependenciesMeta": {
        "@types/react": {
          "optional": true
        },
        "@types/react-dom": {
          "optional": true
        }
      }
    },
    "node_modules/@radix-ui/react-menu/node_modules/@radix-ui/react-portal": {
      "version": "1.1.9",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.9.tgz",
      "integrity": "sha512-bpIxvq03if6UNwXZ+HTK71JLh4APvnXntDc6XOX8UVq4XQOVl7lwok0AvIl+b8zgCw3fSaVTZMpAPPagXbKmHQ==",
      "license": "MIT",
      "dependencies": {
        "@radix-ui/react-primitive": "2.1.3",
        "@radix-ui/react-use-layout-effect": "1.1.1"
      },
      "peerDependencies": {
        "@types/react": "*",
        "@types/react-dom": "*",
        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
        "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
      },
      "peerDependenciesMeta": {
        "@types/react": {
          "optional": true
        },
        "@types/react-dom": {
          "optional": true
        }
      }
    },
    "node_modules/@radix-ui/react-menu/node_modules/@radix-ui/react-presence": {
      "version": "1.1.5",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-presence/-/react-presence-1.1.5.tgz",
      "integrity": "sha512-/jfEwNDdQVBCNvjkGit4h6pMOzq8bHkopq458dPt2lMjx+eBQUohZNG9A7DtO/O5ukSbxuaNGXMjHicgwy6rQQ==",
      "license": "MIT",
      "dependencies": {
        "@radix-ui/react-compose-refs": "1.1.2",
        "@radix-ui/react-use-layout-effect": "1.1.1"
      },
      "peerDependencies": {
        "@types/react": "*",
        "@types/react-dom": "*",
        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
        "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
      },
      "peerDependenciesMeta": {
        "@types/react": {
          "optional": true
        },
        "@types/react-dom": {
          "optional": true
        }
      }
    },
    "node_modules/@radix-ui/react-menu/node_modules/@radix-ui/react-primitive": {
      "version": "2.1.3",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-2.1.3.tgz",
      "integrity": "sha512-m9gTwRkhy2lvCPe6QJp4d3G1TYEUHn/FzJUtq9MjH46an1wJU+GdoGC5VLof8RX8Ft/DlpshApkhswDLZzHIcQ==",
      "license": "MIT",
      "dependencies": {
        "@radix-ui/react-slot": "1.2.3"
      },
      "peerDependencies": {
        "@types/react": "*",
        "@types/react-dom": "*",
        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
        "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
      },
      "peerDependenciesMeta": {
        "@types/react": {
          "optional": true
        },
        "@types/react-dom": {
          "optional": true
        }
      }
    },
    "node_modules/@radix-ui/react-popper": {
      "version": "1.2.6",
      "version": "1.2.8",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-popper/-/react-popper-1.2.6.tgz",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-popper/-/react-popper-1.2.8.tgz",
      "integrity": "sha512-7iqXaOWIjDBfIG7aq8CUEeCSsQMLFdn7VEE8TaFz704DtEzpPHR7w/uuzRflvKgltqSAImgcmxQ7fFX3X7wasg==",
      "integrity": "sha512-0NJQ4LFFUuWkE7Oxf0htBKS6zLkkjBH+hM1uk7Ng705ReR8m/uelduy1DBo0PyBXPKVnBA6YBlU94MBGXrSBCw==",
      "license": "MIT",
      "dependencies": {
        "@floating-ui/react-dom": "^2.0.0",
        "@radix-ui/react-arrow": "1.1.6",
        "@radix-ui/react-arrow": "1.1.7",
        "@radix-ui/react-compose-refs": "1.1.2",
        "@radix-ui/react-context": "1.1.2",
        "@radix-ui/react-primitive": "2.1.2",
        "@radix-ui/react-primitive": "2.1.3",
        "@radix-ui/react-use-callback-ref": "1.1.1",
|
||||||
"@radix-ui/react-use-layout-effect": "1.1.1",
|
"@radix-ui/react-use-layout-effect": "1.1.1",
|
||||||
"@radix-ui/react-use-rect": "1.1.1",
|
"@radix-ui/react-use-rect": "1.1.1",
|
||||||
|
@ -2518,6 +2701,29 @@
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/@radix-ui/react-popper/node_modules/@radix-ui/react-primitive": {
|
||||||
|
"version": "2.1.3",
|
||||||
|
"resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-2.1.3.tgz",
|
||||||
|
"integrity": "sha512-m9gTwRkhy2lvCPe6QJp4d3G1TYEUHn/FzJUtq9MjH46an1wJU+GdoGC5VLof8RX8Ft/DlpshApkhswDLZzHIcQ==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"@radix-ui/react-slot": "1.2.3"
|
||||||
|
},
|
||||||
|
"peerDependencies": {
|
||||||
|
"@types/react": "*",
|
||||||
|
"@types/react-dom": "*",
|
||||||
|
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
|
||||||
|
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
|
||||||
|
},
|
||||||
|
"peerDependenciesMeta": {
|
||||||
|
"@types/react": {
|
||||||
|
"optional": true
|
||||||
|
},
|
||||||
|
"@types/react-dom": {
|
||||||
|
"optional": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/@radix-ui/react-portal": {
|
"node_modules/@radix-ui/react-portal": {
|
||||||
"version": "1.1.8",
|
"version": "1.1.8",
|
||||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.8.tgz",
|
"resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.8.tgz",
|
||||||
|
@ -2608,18 +2814,18 @@
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/@radix-ui/react-roving-focus": {
|
"node_modules/@radix-ui/react-roving-focus": {
|
||||||
"version": "1.1.9",
|
"version": "1.1.11",
|
||||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-roving-focus/-/react-roving-focus-1.1.9.tgz",
|
"resolved": "https://registry.npmjs.org/@radix-ui/react-roving-focus/-/react-roving-focus-1.1.11.tgz",
|
||||||
"integrity": "sha512-ZzrIFnMYHHCNqSNCsuN6l7wlewBEq0O0BCSBkabJMFXVO51LRUTq71gLP1UxFvmrXElqmPjA5VX7IqC9VpazAQ==",
|
"integrity": "sha512-7A6S9jSgm/S+7MdtNDSb+IU859vQqJ/QAtcYQcfFC6W8RS4IxIZDldLR0xqCFZ6DCyrQLjLPsxtTNch5jVA4lA==",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@radix-ui/primitive": "1.1.2",
|
"@radix-ui/primitive": "1.1.3",
|
||||||
"@radix-ui/react-collection": "1.1.6",
|
"@radix-ui/react-collection": "1.1.7",
|
||||||
"@radix-ui/react-compose-refs": "1.1.2",
|
"@radix-ui/react-compose-refs": "1.1.2",
|
||||||
"@radix-ui/react-context": "1.1.2",
|
"@radix-ui/react-context": "1.1.2",
|
||||||
"@radix-ui/react-direction": "1.1.1",
|
"@radix-ui/react-direction": "1.1.1",
|
||||||
"@radix-ui/react-id": "1.1.1",
|
"@radix-ui/react-id": "1.1.1",
|
||||||
"@radix-ui/react-primitive": "2.1.2",
|
"@radix-ui/react-primitive": "2.1.3",
|
||||||
"@radix-ui/react-use-callback-ref": "1.1.1",
|
"@radix-ui/react-use-callback-ref": "1.1.1",
|
||||||
"@radix-ui/react-use-controllable-state": "1.2.2"
|
"@radix-ui/react-use-controllable-state": "1.2.2"
|
||||||
},
|
},
|
||||||
|
@ -2638,6 +2844,35 @@
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/@radix-ui/react-roving-focus/node_modules/@radix-ui/primitive": {
|
||||||
|
"version": "1.1.3",
|
||||||
|
"resolved": "https://registry.npmjs.org/@radix-ui/primitive/-/primitive-1.1.3.tgz",
|
||||||
|
"integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==",
|
||||||
|
"license": "MIT"
|
||||||
|
},
|
||||||
|
"node_modules/@radix-ui/react-roving-focus/node_modules/@radix-ui/react-primitive": {
|
||||||
|
"version": "2.1.3",
|
||||||
|
"resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-2.1.3.tgz",
|
||||||
|
"integrity": "sha512-m9gTwRkhy2lvCPe6QJp4d3G1TYEUHn/FzJUtq9MjH46an1wJU+GdoGC5VLof8RX8Ft/DlpshApkhswDLZzHIcQ==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"@radix-ui/react-slot": "1.2.3"
|
||||||
|
},
|
||||||
|
"peerDependencies": {
|
||||||
|
"@types/react": "*",
|
||||||
|
"@types/react-dom": "*",
|
||||||
|
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
|
||||||
|
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
|
||||||
|
},
|
||||||
|
"peerDependenciesMeta": {
|
||||||
|
"@types/react": {
|
||||||
|
"optional": true
|
||||||
|
},
|
||||||
|
"@types/react-dom": {
|
||||||
|
"optional": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/@radix-ui/react-select": {
|
"node_modules/@radix-ui/react-select": {
|
||||||
"version": "2.2.5",
|
"version": "2.2.5",
|
||||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-select/-/react-select-2.2.5.tgz",
|
"resolved": "https://registry.npmjs.org/@radix-ui/react-select/-/react-select-2.2.5.tgz",
|
||||||
|
@ -2681,55 +2916,6 @@
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-arrow": {
|
|
||||||
"version": "1.1.7",
|
|
||||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-arrow/-/react-arrow-1.1.7.tgz",
|
|
||||||
"integrity": "sha512-F+M1tLhO+mlQaOWspE8Wstg+z6PwxwRd8oQ8IXceWz92kfAmalTRf0EjrouQeo7QssEPfCn05B4Ihs1K9WQ/7w==",
|
|
||||||
"license": "MIT",
|
|
||||||
"dependencies": {
|
|
||||||
"@radix-ui/react-primitive": "2.1.3"
|
|
||||||
},
|
|
||||||
"peerDependencies": {
|
|
||||||
"@types/react": "*",
|
|
||||||
"@types/react-dom": "*",
|
|
||||||
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
|
|
||||||
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
|
|
||||||
},
|
|
||||||
"peerDependenciesMeta": {
|
|
||||||
"@types/react": {
|
|
||||||
"optional": true
|
|
||||||
},
|
|
||||||
"@types/react-dom": {
|
|
||||||
"optional": true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-collection": {
|
|
||||||
"version": "1.1.7",
|
|
||||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-collection/-/react-collection-1.1.7.tgz",
|
|
||||||
"integrity": "sha512-Fh9rGN0MoI4ZFUNyfFVNU4y9LUz93u9/0K+yLgA2bwRojxM8JU1DyvvMBabnZPBgMWREAJvU2jjVzq+LrFUglw==",
|
|
||||||
"license": "MIT",
|
|
||||||
"dependencies": {
|
|
||||||
"@radix-ui/react-compose-refs": "1.1.2",
|
|
||||||
"@radix-ui/react-context": "1.1.2",
|
|
||||||
"@radix-ui/react-primitive": "2.1.3",
|
|
||||||
"@radix-ui/react-slot": "1.2.3"
|
|
||||||
},
|
|
||||||
"peerDependencies": {
|
|
||||||
"@types/react": "*",
|
|
||||||
"@types/react-dom": "*",
|
|
||||||
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
|
|
||||||
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
|
|
||||||
},
|
|
||||||
"peerDependenciesMeta": {
|
|
||||||
"@types/react": {
|
|
||||||
"optional": true
|
|
||||||
},
|
|
||||||
"@types/react-dom": {
|
|
||||||
"optional": true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-dismissable-layer": {
|
"node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-dismissable-layer": {
|
||||||
"version": "1.1.10",
|
"version": "1.1.10",
|
||||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.10.tgz",
|
"resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.10.tgz",
|
||||||
|
@ -2965,29 +3151,6 @@
|
||||||
"integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==",
|
"integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==",
|
||||||
"license": "MIT"
|
"license": "MIT"
|
||||||
},
|
},
|
||||||
"node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-arrow": {
|
|
||||||
"version": "1.1.7",
|
|
||||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-arrow/-/react-arrow-1.1.7.tgz",
|
|
||||||
"integrity": "sha512-F+M1tLhO+mlQaOWspE8Wstg+z6PwxwRd8oQ8IXceWz92kfAmalTRf0EjrouQeo7QssEPfCn05B4Ihs1K9WQ/7w==",
|
|
||||||
"license": "MIT",
|
|
||||||
"dependencies": {
|
|
||||||
"@radix-ui/react-primitive": "2.1.3"
|
|
||||||
},
|
|
||||||
"peerDependencies": {
|
|
||||||
"@types/react": "*",
|
|
||||||
"@types/react-dom": "*",
|
|
||||||
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
|
|
||||||
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
|
|
||||||
},
|
|
||||||
"peerDependenciesMeta": {
|
|
||||||
"@types/react": {
|
|
||||||
"optional": true
|
|
||||||
},
|
|
||||||
"@types/react-dom": {
|
|
||||||
"optional": true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-dismissable-layer": {
|
"node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-dismissable-layer": {
|
||||||
"version": "1.1.11",
|
"version": "1.1.11",
|
||||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.11.tgz",
|
"resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.11.tgz",
|
||||||
|
@ -3015,38 +3178,6 @@
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-popper": {
|
|
||||||
"version": "1.2.8",
|
|
||||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-popper/-/react-popper-1.2.8.tgz",
|
|
||||||
"integrity": "sha512-0NJQ4LFFUuWkE7Oxf0htBKS6zLkkjBH+hM1uk7Ng705ReR8m/uelduy1DBo0PyBXPKVnBA6YBlU94MBGXrSBCw==",
|
|
||||||
"license": "MIT",
|
|
||||||
"dependencies": {
|
|
||||||
"@floating-ui/react-dom": "^2.0.0",
|
|
||||||
"@radix-ui/react-arrow": "1.1.7",
|
|
||||||
"@radix-ui/react-compose-refs": "1.1.2",
|
|
||||||
"@radix-ui/react-context": "1.1.2",
|
|
||||||
"@radix-ui/react-primitive": "2.1.3",
|
|
||||||
"@radix-ui/react-use-callback-ref": "1.1.1",
|
|
||||||
"@radix-ui/react-use-layout-effect": "1.1.1",
|
|
||||||
"@radix-ui/react-use-rect": "1.1.1",
|
|
||||||
"@radix-ui/react-use-size": "1.1.1",
|
|
||||||
"@radix-ui/rect": "1.1.1"
|
|
||||||
},
|
|
||||||
"peerDependencies": {
|
|
||||||
"@types/react": "*",
|
|
||||||
"@types/react-dom": "*",
|
|
||||||
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
|
|
||||||
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
|
|
||||||
},
|
|
||||||
"peerDependenciesMeta": {
|
|
||||||
"@types/react": {
|
|
||||||
"optional": true
|
|
||||||
},
|
|
||||||
"@types/react-dom": {
|
|
||||||
"optional": true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-portal": {
|
"node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-portal": {
|
||||||
"version": "1.1.9",
|
"version": "1.1.9",
|
||||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.9.tgz",
|
"resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.9.tgz",
|
||||||
|
@ -3447,6 +3578,13 @@
|
||||||
"tailwindcss": "4.1.6"
|
"tailwindcss": "4.1.6"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/@tailwindcss/node/node_modules/tailwindcss": {
|
||||||
|
"version": "4.1.6",
|
||||||
|
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.6.tgz",
|
||||||
|
"integrity": "sha512-j0cGLTreM6u4OWzBeLBpycK0WIh8w7kSwcUsQZoGLHZ7xDTdM69lN64AgoIEEwFi0tnhs4wSykUa5YWxAzgFYg==",
|
||||||
|
"dev": true,
|
||||||
|
"license": "MIT"
|
||||||
|
},
|
||||||
"node_modules/@tailwindcss/oxide": {
|
"node_modules/@tailwindcss/oxide": {
|
||||||
"version": "4.1.6",
|
"version": "4.1.6",
|
||||||
"resolved": "https://registry.npmjs.org/@tailwindcss/oxide/-/oxide-4.1.6.tgz",
|
"resolved": "https://registry.npmjs.org/@tailwindcss/oxide/-/oxide-4.1.6.tgz",
|
||||||
|
@ -3707,6 +3845,13 @@
|
||||||
"tailwindcss": "4.1.6"
|
"tailwindcss": "4.1.6"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/@tailwindcss/postcss/node_modules/tailwindcss": {
|
||||||
|
"version": "4.1.6",
|
||||||
|
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.6.tgz",
|
||||||
|
"integrity": "sha512-j0cGLTreM6u4OWzBeLBpycK0WIh8w7kSwcUsQZoGLHZ7xDTdM69lN64AgoIEEwFi0tnhs4wSykUa5YWxAzgFYg==",
|
||||||
|
"dev": true,
|
||||||
|
"license": "MIT"
|
||||||
|
},
|
||||||
"node_modules/@testing-library/dom": {
|
"node_modules/@testing-library/dom": {
|
||||||
"version": "10.4.1",
|
"version": "10.4.1",
|
||||||
"resolved": "https://registry.npmjs.org/@testing-library/dom/-/dom-10.4.1.tgz",
|
"resolved": "https://registry.npmjs.org/@testing-library/dom/-/dom-10.4.1.tgz",
|
||||||
|
@ -4079,9 +4224,9 @@
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/@types/react-dom": {
|
"node_modules/@types/react-dom": {
|
||||||
"version": "19.1.5",
|
"version": "19.1.9",
|
||||||
"resolved": "https://registry.npmjs.org/@types/react-dom/-/react-dom-19.1.5.tgz",
|
"resolved": "https://registry.npmjs.org/@types/react-dom/-/react-dom-19.1.9.tgz",
|
||||||
"integrity": "sha512-CMCjrWucUBZvohgZxkjd6S9h0nZxXjzus6yDfUb+xLxYM7VvjKNH1tQrE9GWLql1XoOP4/Ds3bwFqShHUYraGg==",
|
"integrity": "sha512-qXRuZaOsAdXKFyOhRBg6Lqqc0yay13vN7KrIg4L7N4aaHN68ma9OK3NE1BoDFgFOTfM7zg+3/8+2n8rLUH3OKQ==",
|
||||||
"devOptional": true,
|
"devOptional": true,
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"peerDependencies": {
|
"peerDependencies": {
|
||||||
|
@ -10147,9 +10292,9 @@
|
||||||
"license": "MIT"
|
"license": "MIT"
|
||||||
},
|
},
|
||||||
"node_modules/llama-stack-client": {
|
"node_modules/llama-stack-client": {
|
||||||
"version": "0.2.20",
|
"version": "0.2.21",
|
||||||
"resolved": "https://registry.npmjs.org/llama-stack-client/-/llama-stack-client-0.2.20.tgz",
|
"resolved": "https://registry.npmjs.org/llama-stack-client/-/llama-stack-client-0.2.21.tgz",
|
||||||
"integrity": "sha512-1vD5nizTX5JEW8TADxKgy/P1W8YZoPSpdnmfxbdYbWgpQ3BWtbvLS6jmDk7VwVA5fRC4895VfHsRDfS1liHarw==",
|
"integrity": "sha512-rjU2Vx5xStxDYavU8K1An/SYXiQQjroLcK98B+p0Paz/a7OgRao2S0YwvThJjPUyChY4fO03UIXP9LpmHqlXWQ==",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@types/node": "^18.11.18",
|
"@types/node": "^18.11.18",
|
||||||
|
@ -10240,9 +10385,9 @@
|
||||||
"license": "ISC"
|
"license": "ISC"
|
||||||
},
|
},
|
||||||
"node_modules/lucide-react": {
|
"node_modules/lucide-react": {
|
||||||
"version": "0.510.0",
|
"version": "0.542.0",
|
||||||
"resolved": "https://registry.npmjs.org/lucide-react/-/lucide-react-0.510.0.tgz",
|
"resolved": "https://registry.npmjs.org/lucide-react/-/lucide-react-0.542.0.tgz",
|
||||||
"integrity": "sha512-p8SQRAMVh7NhsAIETokSqDrc5CHnDLbV29mMnzaXx+Vc/hnqQzwI2r0FMWCcoTXnbw2KEjy48xwpGdEL+ck06Q==",
|
"integrity": "sha512-w3hD8/SQB7+lzU2r4VdFyzzOzKnUjTZIF/MQJGSSvni7Llewni4vuViRppfRAa2guOsY5k4jZyxw/i9DQHv+dw==",
|
||||||
"license": "ISC",
|
"license": "ISC",
|
||||||
"peerDependencies": {
|
"peerDependencies": {
|
||||||
"react": "^16.5.1 || ^17.0.0 || ^18.0.0 || ^19.0.0"
|
"react": "^16.5.1 || ^17.0.0 || ^18.0.0 || ^19.0.0"
|
||||||
|
@ -12448,24 +12593,24 @@
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/react": {
|
"node_modules/react": {
|
||||||
"version": "19.1.0",
|
"version": "19.1.1",
|
||||||
"resolved": "https://registry.npmjs.org/react/-/react-19.1.0.tgz",
|
"resolved": "https://registry.npmjs.org/react/-/react-19.1.1.tgz",
|
||||||
"integrity": "sha512-FS+XFBNvn3GTAWq26joslQgWNoFu08F4kl0J4CgdNKADkdSGXQyTCnKteIAJy96Br6YbpEU1LSzV5dYtjMkMDg==",
|
"integrity": "sha512-w8nqGImo45dmMIfljjMwOGtbmC/mk4CMYhWIicdSflH91J9TyCyczcPFXJzrZ/ZXcgGRFeP6BU0BEJTw6tZdfQ==",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"engines": {
|
"engines": {
|
||||||
"node": ">=0.10.0"
|
"node": ">=0.10.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/react-dom": {
|
"node_modules/react-dom": {
|
||||||
"version": "19.1.0",
|
"version": "19.1.1",
|
||||||
"resolved": "https://registry.npmjs.org/react-dom/-/react-dom-19.1.0.tgz",
|
"resolved": "https://registry.npmjs.org/react-dom/-/react-dom-19.1.1.tgz",
|
||||||
"integrity": "sha512-Xs1hdnE+DyKgeHJeJznQmYMIBG3TKIHJJT95Q58nHLSrElKlGQqDTR2HQ9fx5CN/Gk6Vh/kupBTDLU11/nDk/g==",
|
"integrity": "sha512-Dlq/5LAZgF0Gaz6yiqZCf6VCcZs1ghAJyrsu84Q/GT0gV+mCxbfmKNoGRKBYMJ8IEdGPqu49YWXD02GCknEDkw==",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"scheduler": "^0.26.0"
|
"scheduler": "^0.26.0"
|
||||||
},
|
},
|
||||||
"peerDependencies": {
|
"peerDependencies": {
|
||||||
"react": "^19.1.0"
|
"react": "^19.1.1"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/react-is": {
|
"node_modules/react-is": {
|
||||||
|
@ -13285,9 +13430,9 @@
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/sonner": {
|
"node_modules/sonner": {
|
||||||
"version": "2.0.6",
|
"version": "2.0.7",
|
||||||
"resolved": "https://registry.npmjs.org/sonner/-/sonner-2.0.6.tgz",
|
"resolved": "https://registry.npmjs.org/sonner/-/sonner-2.0.7.tgz",
|
||||||
"integrity": "sha512-yHFhk8T/DK3YxjFQXIrcHT1rGEeTLliVzWbO0xN8GberVun2RiBnxAjXAYpZrqwEVHBG9asI/Li8TAAhN9m59Q==",
|
"integrity": "sha512-W6ZN4p58k8aDKA4XPcx2hpIQXBRAgyiWVkYhT7CvK6D3iAu7xjvVyhQHg2/iaKJZ1XVJ4r7XuwGL+WGEK37i9w==",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"peerDependencies": {
|
"peerDependencies": {
|
||||||
"react": "^18.0.0 || ^19.0.0 || ^19.0.0-rc",
|
"react": "^18.0.0 || ^19.0.0 || ^19.0.0-rc",
|
||||||
|
@ -13712,9 +13857,9 @@
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/tailwindcss": {
|
"node_modules/tailwindcss": {
|
||||||
"version": "4.1.6",
|
"version": "4.1.13",
|
||||||
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.6.tgz",
|
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.13.tgz",
|
||||||
"integrity": "sha512-j0cGLTreM6u4OWzBeLBpycK0WIh8w7kSwcUsQZoGLHZ7xDTdM69lN64AgoIEEwFi0tnhs4wSykUa5YWxAzgFYg==",
|
"integrity": "sha512-i+zidfmTqtwquj4hMEwdjshYYgMbOrPzb9a0M3ZgNa0JMoZeFC6bxZvO8yr8ozS6ix2SDz0+mvryPeBs2TFE+w==",
|
||||||
"dev": true,
|
"dev": true,
|
||||||
"license": "MIT"
|
"license": "MIT"
|
||||||
},
|
},
|
||||||
|
|
|
@@ -15,7 +15,7 @@
   "dependencies": {
     "@radix-ui/react-collapsible": "^1.1.12",
     "@radix-ui/react-dialog": "^1.1.13",
-    "@radix-ui/react-dropdown-menu": "^2.1.14",
+    "@radix-ui/react-dropdown-menu": "^2.1.16",
     "@radix-ui/react-select": "^2.2.5",
     "@radix-ui/react-separator": "^1.1.7",
     "@radix-ui/react-slot": "^1.2.3",
@@ -23,18 +23,18 @@
     "class-variance-authority": "^0.7.1",
     "clsx": "^2.1.1",
     "framer-motion": "^12.23.12",
-    "llama-stack-client": "^0.2.20",
+    "llama-stack-client": "^0.2.21",
-    "lucide-react": "^0.510.0",
+    "lucide-react": "^0.542.0",
     "next": "15.3.3",
     "next-auth": "^4.24.11",
     "next-themes": "^0.4.6",
     "react": "^19.0.0",
-    "react-dom": "^19.0.0",
+    "react-dom": "^19.1.1",
     "react-markdown": "^10.1.0",
     "remark-gfm": "^4.0.1",
     "remeda": "^2.30.0",
     "shiki": "^1.29.2",
-    "sonner": "^2.0.6",
+    "sonner": "^2.0.7",
     "tailwind-merge": "^3.3.1"
   },
   "devDependencies": {
@@ -7,7 +7,7 @@ required-version = ">=0.7.0"

 [project]
 name = "llama_stack"
-version = "0.2.20"
+version = "0.2.21"
 authors = [{ name = "Meta Llama", email = "llama-oss@meta.com" }]
 description = "Llama Stack"
 readme = "README.md"
@@ -31,9 +31,8 @@ dependencies = [
     "huggingface-hub>=0.34.0,<1.0",
     "jinja2>=3.1.6",
     "jsonschema",
-    "llama-stack-client>=0.2.20",
+    "llama-stack-client>=0.2.21",
-    "llama-api-client>=0.1.2",
-    "openai>=1.99.6",
+    "openai>=1.100.0", # for expires_after support
     "prompt-toolkit",
     "python-dotenv",
     "python-jose[cryptography]",
@@ -56,7 +55,7 @@
 ui = [
     "streamlit",
     "pandas",
-    "llama-stack-client>=0.2.20",
+    "llama-stack-client>=0.2.21",
     "streamlit-option-menu",
 ]

@@ -81,7 +80,6 @@ dev = [
 unit = [
     "sqlite-vec",
     "ollama",
-    "openai",
     "aiosqlite",
     "aiohttp",
     "psycopg2-binary>=2.9.0",
@@ -93,7 +91,7 @@ unit = [
     "sqlalchemy[asyncio]>=2.0.41",
     "blobfile",
     "faiss-cpu",
-    "pymilvus>=2.5.12",
+    "pymilvus>=2.6.1",
     "milvus-lite>=2.5.0",
     "litellm",
     "together",
@@ -106,7 +104,6 @@ unit = [
 # separately. If you are using "uv" to execute your tests, you can use the "--group" flag to specify extra
 # dependencies.
 test = [
-    "openai>=1.100.0", # for expires_after support
     "aiosqlite",
     "aiohttp",
     "torch>=2.6.0",
@@ -115,13 +112,13 @@ test = [
     "psycopg2-binary>=2.9.0",
     "pypdf",
     "mcp",
-    "datasets",
+    "datasets>=4.0.0",
     "autoevals",
     "transformers",
     "sqlalchemy",
     "sqlalchemy[asyncio]>=2.0.41",
     "requests",
-    "pymilvus>=2.5.12",
+    "pymilvus>=2.6.1",
     "milvus-lite>=2.5.0",
     "weaviate-client>=4.16.4",
 ]
@@ -146,7 +143,7 @@ docs = [
 ]
 codegen = ["rich", "pydantic", "jinja2>=3.1.6"]
 benchmark = [
-    "locust>=2.37.14",
+    "locust>=2.39.1",
 ]

 [project.urls]
71  scripts/get_setup_env.py  Executable file
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""
+Small helper script to extract environment variables from a test setup.
+Used by integration-tests.sh to set environment variables before starting the server.
+"""
+
+import argparse
+import sys
+
+from tests.integration.suites import SETUP_DEFINITIONS, SUITE_DEFINITIONS
+
+
+def get_setup_env_vars(setup_name, suite_name=None):
+    """
+    Get environment variables for a setup, with optional suite default fallback.
+
+    Args:
+        setup_name: Name of the setup (e.g., 'ollama', 'gpt')
+        suite_name: Optional suite name to get default setup if setup_name is None
+
+    Returns:
+        Dictionary of environment variables
+    """
+    # If no setup specified, try to get default from suite
+    if not setup_name and suite_name:
+        suite = SUITE_DEFINITIONS.get(suite_name)
+        if suite and suite.default_setup:
+            setup_name = suite.default_setup
+
+    if not setup_name:
+        return {}
+
+    setup = SETUP_DEFINITIONS.get(setup_name)
+    if not setup:
+        print(
+            f"Error: Unknown setup '{setup_name}'. Available: {', '.join(sorted(SETUP_DEFINITIONS.keys()))}",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    return setup.env
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Extract environment variables from a test setup")
+    parser.add_argument("--setup", help="Setup name (e.g., ollama, gpt)")
+    parser.add_argument("--suite", help="Suite name to get default setup from if --setup not provided")
+    parser.add_argument("--format", choices=["bash", "json"], default="bash", help="Output format (default: bash)")
+
+    args = parser.parse_args()
+
+    env_vars = get_setup_env_vars(args.setup, args.suite)
+
+    if args.format == "bash":
+        # Output as bash export statements
+        for key, value in env_vars.items():
+            print(f"export {key}='{value}'")
+    elif args.format == "json":
+        import json
+
+        print(json.dumps(env_vars))
+
+
+if __name__ == "__main__":
+    main()
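For reference, integration-tests.sh calls this helper with `--suite`, `--setup`, and `--format bash`; a minimal sketch of running it by hand, assuming you invoke it from the repository root so that `tests.integration.suites` is importable (the example output line is illustrative, not the exact variables any particular setup defines):

```bash
# Print the environment a setup would export, without running any tests.
PYTHONPATH=. python scripts/get_setup_env.py --suite vision --format bash
# illustrative output: export OLLAMA_URL='http://0.0.0.0:11434'

# JSON output, e.g. for consumption by other tooling
PYTHONPATH=. python scripts/get_setup_env.py --setup gpt --format json
```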
@@ -14,8 +14,8 @@ set -euo pipefail
 # Default values
 BRANCH=""
 TEST_SUBDIRS=""
-TEST_PROVIDER="ollama"
+TEST_SETUP="ollama"
-RUN_VISION_TESTS=false
+TEST_SUITE="base"
 TEST_PATTERN=""

 # Help function
@@ -27,24 +27,24 @@ Trigger the integration test recording workflow remotely. This way you do not ne

 OPTIONS:
     -b, --branch BRANCH            Branch to run the workflow on (defaults to current branch)
-    -s, --test-subdirs DIRS        Comma-separated list of test subdirectories to run (REQUIRED)
+    -t, --suite SUITE              Test suite to use: base, responses, vision, etc. (default: base)
-    -p, --test-provider PROVIDER   Test provider to use: vllm or ollama (default: ollama)
+    -p, --setup SETUP              Test setup to use: vllm, ollama, gpt, etc. (default: ollama)
-    -v, --run-vision-tests         Include vision tests in the recording
+    -s, --subdirs DIRS             Comma-separated list of test subdirectories to run (overrides suite)
-    -k, --test-pattern PATTERN     Regex pattern to pass to pytest -k
+    -k, --pattern PATTERN          Regex pattern to pass to pytest -k
     -h, --help                     Show this help message

 EXAMPLES:
     # Record tests for current branch with agents subdirectory
-    $0 --test-subdirs "agents"
+    $0 --subdirs "agents"

     # Record tests for specific branch with vision tests
-    $0 -b my-feature-branch --test-subdirs "inference" --run-vision-tests
+    $0 -b my-feature-branch --suite vision

-    # Record multiple test subdirectories with specific provider
+    # Record multiple test subdirectories with specific setup
-    $0 --test-subdirs "agents,inference" --test-provider vllm
+    $0 --subdirs "agents,inference" --setup vllm

     # Record tests matching a specific pattern
-    $0 --test-subdirs "inference" --test-pattern "test_streaming"
+    $0 --subdirs "inference" --pattern "test_streaming"

 EOF
 }
@@ -63,19 +63,19 @@ while [[ $# -gt 0 ]]; do
             BRANCH="$2"
             shift 2
             ;;
-        -s|--test-subdirs)
+        -s|--subdirs)
             TEST_SUBDIRS="$2"
             shift 2
             ;;
-        -p|--test-provider)
+        -p|--setup)
-            TEST_PROVIDER="$2"
+            TEST_SETUP="$2"
             shift 2
             ;;
-        -v|--run-vision-tests)
+        -t|--suite)
-            RUN_VISION_TESTS=true
+            TEST_SUITE="$2"
-            shift
+            shift 2
             ;;
-        -k|--test-pattern)
+        -k|--pattern)
             TEST_PATTERN="$2"
             shift 2
             ;;
@@ -92,22 +92,17 @@ while [[ $# -gt 0 ]]; do
 done

 # Validate required parameters
-if [[ -z "$TEST_SUBDIRS" ]]; then
+if [[ -z "$TEST_SUBDIRS" && -z "$TEST_SUITE" ]]; then
-    echo "Error: --test-subdirs is required"
+    echo "Error: --subdirs or --suite is required"
-    echo "Please specify which test subdirectories to run, e.g.:"
+    echo "Please specify which test subdirectories to run or test suite to use, e.g.:"
-    echo "  $0 --test-subdirs \"agents,inference\""
+    echo "  $0 --subdirs \"agents,inference\""
-    echo "  $0 --test-subdirs \"inference\" --run-vision-tests"
+    echo "  $0 --suite vision"
     echo ""
     exit 1
 fi

-# Validate test provider
+# Validate test setup (optional - setups are validated by the workflow itself)
-if [[ "$TEST_PROVIDER" != "vllm" && "$TEST_PROVIDER" != "ollama" ]]; then
+# Common setups: ollama, vllm, gpt, etc.
-    echo "❌ Error: Invalid test provider '$TEST_PROVIDER'"
-    echo "   Supported providers: vllm, ollama"
-    echo "   Example: $0 --test-subdirs \"agents\" --test-provider vllm"
-    exit 1
-fi

 # Check if required tools are installed
 if ! command -v gh &> /dev/null; then
@@ -237,22 +232,25 @@ fi
 # Build the workflow dispatch command
 echo "Triggering integration test recording workflow..."
 echo "Branch: $BRANCH"
-echo "Test provider: $TEST_PROVIDER"
+echo "Test setup: $TEST_SETUP"
 echo "Test subdirs: $TEST_SUBDIRS"
-echo "Run vision tests: $RUN_VISION_TESTS"
+echo "Test suite: $TEST_SUITE"
 echo "Test pattern: ${TEST_PATTERN:-"(none)"}"
 echo ""

 # Prepare inputs for gh workflow run
-INPUTS="-f test-subdirs='$TEST_SUBDIRS'"
+INPUTS=
-if [[ -n "$TEST_PROVIDER" ]]; then
+if [[ -n "$TEST_SUBDIRS" ]]; then
-    INPUTS="$INPUTS -f test-provider='$TEST_PROVIDER'"
+    INPUTS="$INPUTS -f subdirs='$TEST_SUBDIRS'"
 fi
-if [[ "$RUN_VISION_TESTS" == "true" ]]; then
+if [[ -n "$TEST_SETUP" ]]; then
-    INPUTS="$INPUTS -f run-vision-tests=true"
+    INPUTS="$INPUTS -f test-setup='$TEST_SETUP'"
+fi
+if [[ -n "$TEST_SUITE" ]]; then
+    INPUTS="$INPUTS -f suite='$TEST_SUITE'"
 fi
 if [[ -n "$TEST_PATTERN" ]]; then
-    INPUTS="$INPUTS -f test-pattern='$TEST_PATTERN'"
+    INPUTS="$INPUTS -f pattern='$TEST_PATTERN'"
 fi

 # Run the workflow
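For illustration, this is roughly what the updated input-building logic assembles for one of the help-text examples; the final `gh workflow run` invocation itself is outside this hunk, so only the inputs string is sketched here:

```bash
# ./scripts/github/schedule-record-workflow.sh --subdirs "agents,inference" --setup vllm
# would build (illustrative; TEST_SUITE keeps its "base" default):
INPUTS=" -f subdirs='agents,inference' -f test-setup='vllm' -f suite='base'"
```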
@@ -13,10 +13,10 @@ set -euo pipefail

 # Default values
 STACK_CONFIG=""
-PROVIDER=""
+TEST_SUITE="base"
+TEST_SETUP=""
 TEST_SUBDIRS=""
 TEST_PATTERN=""
-RUN_VISION_TESTS="false"
 INFERENCE_MODE="replay"
 EXTRA_PARAMS=""

@@ -27,25 +27,30 @@ Usage: $0 [OPTIONS]

 Options:
     --stack-config STRING    Stack configuration to use (required)
-    --provider STRING        Provider to use (ollama, vllm, etc.) (required)
+    --suite STRING           Test suite to run (default: 'base')
-    --test-subdirs STRING    Comma-separated list of test subdirectories to run (default: 'inference')
+    --setup STRING           Test setup (models, env) to use (e.g., 'ollama', 'ollama-vision', 'gpt', 'vllm')
-    --run-vision-tests       Run vision tests instead of regular tests
     --inference-mode STRING  Inference mode: record or replay (default: replay)
-    --test-pattern STRING    Regex pattern to pass to pytest -k
+    --subdirs STRING         Comma-separated list of test subdirectories to run (overrides suite)
+    --pattern STRING         Regex pattern to pass to pytest -k
     --help                   Show this help message

+Suites are defined in tests/integration/suites.py and define which tests to run.
+Setups are defined in tests/integration/setups.py and provide global configuration (models, env).
+
+You can also specify subdirectories (of tests/integration) to select tests from, which will override the suite.
+
 Examples:
     # Basic inference tests with ollama
-    $0 --stack-config server:ci-tests --provider ollama
+    $0 --stack-config server:ci-tests --suite base --setup ollama

     # Multiple test directories with vllm
-    $0 --stack-config server:ci-tests --provider vllm --test-subdirs 'inference,agents'
+    $0 --stack-config server:ci-tests --subdirs 'inference,agents' --setup vllm

     # Vision tests with ollama
-    $0 --stack-config server:ci-tests --provider ollama --run-vision-tests
+    $0 --stack-config server:ci-tests --suite vision # default setup for this suite is ollama-vision

     # Record mode for updating test recordings
-    $0 --stack-config server:ci-tests --provider ollama --inference-mode record
+    $0 --stack-config server:ci-tests --suite base --inference-mode record
 EOF
 }

@@ -56,23 +61,23 @@ while [[ $# -gt 0 ]]; do
             STACK_CONFIG="$2"
             shift 2
             ;;
-        --provider)
+        --setup)
-            PROVIDER="$2"
+            TEST_SETUP="$2"
             shift 2
             ;;
-        --test-subdirs)
+        --subdirs)
             TEST_SUBDIRS="$2"
             shift 2
             ;;
-        --run-vision-tests)
+        --suite)
-            RUN_VISION_TESTS="true"
+            TEST_SUITE="$2"
-            shift
+            shift 2
             ;;
         --inference-mode)
             INFERENCE_MODE="$2"
             shift 2
             ;;
-        --test-pattern)
+        --pattern)
             TEST_PATTERN="$2"
             shift 2
             ;;
@@ -96,18 +101,23 @@ if [[ -z "$STACK_CONFIG" ]]; then
     exit 1
 fi

-if [[ -z "$PROVIDER" ]]; then
+if [[ -z "$TEST_SETUP" && -n "$TEST_SUBDIRS" ]]; then
-    echo "Error: --provider is required"
+    echo "Error: --test-setup is required when --test-subdirs is provided"
     usage
     exit 1
 fi

+if [[ -z "$TEST_SUITE" && -z "$TEST_SUBDIRS" ]]; then
+    echo "Error: --test-suite or --test-subdirs is required"
+    exit 1
+fi
+
 echo "=== Llama Stack Integration Test Runner ==="
 echo "Stack Config: $STACK_CONFIG"
-echo "Provider: $PROVIDER"
+echo "Setup: $TEST_SETUP"
-echo "Test Subdirs: $TEST_SUBDIRS"
-echo "Vision Tests: $RUN_VISION_TESTS"
 echo "Inference Mode: $INFERENCE_MODE"
+echo "Test Suite: $TEST_SUITE"
+echo "Test Subdirs: $TEST_SUBDIRS"
 echo "Test Pattern: $TEST_PATTERN"
 echo ""

@@ -122,31 +132,28 @@ echo ""

 # Set environment variables
 export LLAMA_STACK_CLIENT_TIMEOUT=300
-export LLAMA_STACK_TEST_INFERENCE_MODE="$INFERENCE_MODE"
-
-# Configure provider-specific settings
-if [[ "$PROVIDER" == "ollama" ]]; then
-    export OLLAMA_URL="http://0.0.0.0:11434"
-    export TEXT_MODEL="ollama/llama3.2:3b-instruct-fp16"
-    export SAFETY_MODEL="ollama/llama-guard3:1b"
-    EXTRA_PARAMS="--safety-shield=llama-guard"
-else
-    export VLLM_URL="http://localhost:8000/v1"
-    export TEXT_MODEL="vllm/meta-llama/Llama-3.2-1B-Instruct"
-    EXTRA_PARAMS=""
-fi

 THIS_DIR=$(dirname "$0")

+if [[ -n "$TEST_SETUP" ]]; then
+    EXTRA_PARAMS="--setup=$TEST_SETUP"
+fi
+
+# Apply setup-specific environment variables (needed for server startup and tests)
+echo "=== Applying Setup Environment Variables ==="
+
+# the server needs this
+export LLAMA_STACK_TEST_INFERENCE_MODE="$INFERENCE_MODE"
+
+SETUP_ENV=$(PYTHONPATH=$THIS_DIR/.. python "$THIS_DIR/get_setup_env.py" --suite "$TEST_SUITE" --setup "$TEST_SETUP" --format bash)
+echo "Setting up environment variables:"
+echo "$SETUP_ENV"
+eval "$SETUP_ENV"
+echo ""
+
 ROOT_DIR="$THIS_DIR/.."
 cd $ROOT_DIR

-# Set recording directory
-if [[ "$RUN_VISION_TESTS" == "true" ]]; then
-    export LLAMA_STACK_TEST_RECORDING_DIR="tests/integration/recordings/vision"
-else
-    export LLAMA_STACK_TEST_RECORDING_DIR="tests/integration/recordings"
-fi
-
 # check if "llama" and "pytest" are available. this script does not use `uv run` given
 # it can be used in a pre-release environment where we have not been able to tell
 # uv about pre-release dependencies properly (yet).
@@ -162,6 +169,18 @@ fi

 # Start Llama Stack Server if needed
 if [[ "$STACK_CONFIG" == *"server:"* ]]; then
+    stop_server() {
+        echo "Stopping Llama Stack Server..."
+        pids=$(lsof -i :8321 | awk 'NR>1 {print $2}')
+        if [[ -n "$pids" ]]; then
+            echo "Killing Llama Stack Server processes: $pids"
+            kill -9 $pids
+        else
+            echo "No Llama Stack Server processes found ?!"
+        fi
+        echo "Llama Stack Server stopped"
+    }
+
     # check if server is already running
     if curl -s http://localhost:8321/v1/health 2>/dev/null | grep -q "OK"; then
         echo "Llama Stack Server is already running, skipping start"
@@ -185,14 +204,16 @@ if [[ "$STACK_CONFIG" == *"server:"* ]]; then
         done
         echo ""
     fi

+    trap stop_server EXIT ERR INT TERM
 fi

 # Run tests
 echo "=== Running Integration Tests ==="
 EXCLUDE_TESTS="builtin_tool or safety_with_image or code_interpreter or test_rag"

-# Additional exclusions for vllm provider
+# Additional exclusions for vllm setup
-if [[ "$PROVIDER" == "vllm" ]]; then
+if [[ "$TEST_SETUP" == "vllm" ]]; then
     EXCLUDE_TESTS="${EXCLUDE_TESTS} or test_inference_store_tool_calls"
 fi

@@ -201,56 +222,12 @@ if [[ -n "$TEST_PATTERN" ]]; then
     PYTEST_PATTERN="${PYTEST_PATTERN} and $TEST_PATTERN"
 fi

-# Run vision tests if specified
-if [[ "$RUN_VISION_TESTS" == "true" ]]; then
-    echo "Running vision tests..."
-    set +e
-    pytest -s -v tests/integration/inference/test_vision_inference.py \
-        --stack-config="$STACK_CONFIG" \
-        -k "$PYTEST_PATTERN" \
-        --vision-model=ollama/llama3.2-vision:11b \
-        --embedding-model=sentence-transformers/all-MiniLM-L6-v2 \
-        --color=yes $EXTRA_PARAMS \
-        --capture=tee-sys
-    exit_code=$?
-    set -e
-
-    if [ $exit_code -eq 0 ]; then
-        echo "✅ Vision tests completed successfully"
-    elif [ $exit_code -eq 5 ]; then
-        echo "⚠️ No vision tests collected (pattern matched no tests)"
-    else
-        echo "❌ Vision tests failed"
-        exit 1
-    fi
-    exit 0
-fi
-
-# Run regular tests
-if [[ -z "$TEST_SUBDIRS" ]]; then
-    TEST_SUBDIRS=$(find tests/integration -maxdepth 1 -mindepth 1 -type d |
-        sed 's|tests/integration/||' |
-        grep -Ev "^(__pycache__|fixtures|test_cases|recordings|non_ci|post_training)$" |
-        sort)
-fi
 echo "Test subdirs to run: $TEST_SUBDIRS"

+if [[ -n "$TEST_SUBDIRS" ]]; then
     # Collect all test files for the specified test types
     TEST_FILES=""
     for test_subdir in $(echo "$TEST_SUBDIRS" | tr ',' '\n'); do
-        # Skip certain test types for vllm provider
-        if [[ "$PROVIDER" == "vllm" ]]; then
-            if [[ "$test_subdir" == "safety" ]] || [[ "$test_subdir" == "post_training" ]] || [[ "$test_subdir" == "tool_runtime" ]]; then
-                echo "Skipping $test_subdir for vllm provider"
-                continue
-            fi
-        fi
-
-        if [[ "$STACK_CONFIG" != *"server:"* ]] && [[ "$test_subdir" == "batches" ]]; then
-            echo "Skipping $test_subdir for library client until types are supported"
-            continue
-        fi
-
         if [[ -d "tests/integration/$test_subdir" ]]; then
             # Find all Python test files in this directory
             test_files=$(find tests/integration/$test_subdir -name "test_*.py" -o -name "*_test.py")
@@ -272,15 +249,23 @@ echo ""
 echo "=== Running all collected tests in a single pytest command ==="
 echo "Total test files: $(echo $TEST_FILES | wc -w)"

+    PYTEST_TARGET="$TEST_FILES"
+else
+    PYTEST_TARGET="tests/integration/"
+    EXTRA_PARAMS="$EXTRA_PARAMS --suite=$TEST_SUITE"
+fi
+
 set +e
-pytest -s -v $TEST_FILES \
+set -x
+pytest -s -v $PYTEST_TARGET \
     --stack-config="$STACK_CONFIG" \
+    --inference-mode="$INFERENCE_MODE" \
     -k "$PYTEST_PATTERN" \
-    --text-model="$TEXT_MODEL" \
+    $EXTRA_PARAMS \
-    --embedding-model=sentence-transformers/all-MiniLM-L6-v2 \
+    --color=yes \
-    --color=yes $EXTRA_PARAMS \
     --capture=tee-sys
 exit_code=$?
+set +x
 set -e

 if [ $exit_code -eq 0 ]; then
@@ -38,26 +38,15 @@ For running integration tests, you must provide a few things:
 - a distribution name (e.g., `starter`) or a path to a `run.yaml` file
 - a comma-separated list of api=provider pairs, e.g. `inference=fireworks,safety=llama-guard,agents=meta-reference`. This is most useful for testing a single API surface.

-- Whether you are using replay or live mode for inference. This is specified with the LLAMA_STACK_TEST_INFERENCE_MODE environment variable. The default mode currently is "live" -- that is certainly surprising, but we will fix this soon.
-
 - Any API keys you need to use should be set in the environment, or can be passed in with the --env option.

 You can run the integration tests in replay mode with:
 ```bash
 # Run all tests with existing recordings
-LLAMA_STACK_TEST_INFERENCE_MODE=replay \
-  LLAMA_STACK_TEST_RECORDING_DIR=tests/integration/recordings \
 uv run --group test \
   pytest -sv tests/integration/ --stack-config=starter
 ```

-If you don't specify LLAMA_STACK_TEST_INFERENCE_MODE, by default it will be in "live" mode -- that is, it will make real API calls.
-
-```bash
-# Test against live APIs
-FIREWORKS_API_KEY=your_key pytest -sv tests/integration/inference --stack-config=starter
-```
-
 ### Re-recording tests

 #### Local Re-recording (Manual Setup Required)
@@ -66,7 +55,6 @@ If you want to re-record tests locally, you can do so with:

 ```bash
 LLAMA_STACK_TEST_INFERENCE_MODE=record \
-  LLAMA_STACK_TEST_RECORDING_DIR=tests/integration/recordings \
 uv run --group test \
   pytest -sv tests/integration/ --stack-config=starter -k "<appropriate test name>"
 ```
@@ -89,7 +77,7 @@ You must be careful when re-recording. CI workflows assume a specific setup for
 ./scripts/github/schedule-record-workflow.sh --test-subdirs "agents,inference"

 # Record with vision tests enabled
-./scripts/github/schedule-record-workflow.sh --test-subdirs "inference" --run-vision-tests
+./scripts/github/schedule-record-workflow.sh --test-suite vision

 # Record with specific provider
 ./scripts/github/schedule-record-workflow.sh --test-subdirs "agents" --test-provider vllm
@ -6,8 +6,6 @@ Integration tests verify complete workflows across different providers using Lla
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Run all integration tests with existing recordings
|
# Run all integration tests with existing recordings
|
||||||
LLAMA_STACK_TEST_INFERENCE_MODE=replay \
|
|
||||||
LLAMA_STACK_TEST_RECORDING_DIR=tests/integration/recordings \
|
|
||||||
uv run --group test \
|
uv run --group test \
|
||||||
pytest -sv tests/integration/ --stack-config=starter
|
pytest -sv tests/integration/ --stack-config=starter
|
||||||
```
|
```
|
||||||
|
@ -42,6 +40,37 @@ Model parameters can be influenced by the following options:
|
||||||
Each of these are comma-separated lists and can be used to generate multiple parameter combinations. Note that tests will be skipped
|
Each of these are comma-separated lists and can be used to generate multiple parameter combinations. Note that tests will be skipped
|
||||||
if no model is specified.
|
if no model is specified.
|
||||||
|
|
||||||
|
### Suites and Setups
|
||||||
|
|
||||||
|
- `--suite`: single named suite that narrows which tests are collected.
|
||||||
|
- Available suites:
|
||||||
|
- `base`: collects most tests (excludes responses and post_training)
|
||||||
|
- `responses`: collects tests under `tests/integration/responses` (needs strong tool-calling models)
|
||||||
|
- `vision`: collects only `tests/integration/inference/test_vision_inference.py`
|
||||||
|
- `--setup`: global configuration that can be used with any suite. Setups prefill model/env defaults; explicit CLI flags always win.
|
||||||
|
- Available setups:
|
||||||
|
- `ollama`: Local Ollama provider with lightweight models (sets OLLAMA_URL, uses llama3.2:3b-instruct-fp16)
|
||||||
|
- `vllm`: VLLM provider for efficient local inference (sets VLLM_URL, uses Llama-3.2-1B-Instruct)
|
||||||
|
- `gpt`: OpenAI GPT models for high-quality responses (uses gpt-4o)
|
||||||
|
- `claude`: Anthropic Claude models for high-quality responses (uses claude-3-5-sonnet)
|
||||||
|
|
||||||
|
Examples
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Fast responses run with a strong tool-calling model
|
||||||
|
pytest -s -v tests/integration --stack-config=server:starter --suite=responses --setup=gpt
|
||||||
|
|
||||||
|
# Fast single-file vision run with Ollama defaults
|
||||||
|
pytest -s -v tests/integration --stack-config=server:starter --suite=vision --setup=ollama
|
||||||
|
|
||||||
|
# Base suite with VLLM for performance
|
||||||
|
pytest -s -v tests/integration --stack-config=server:starter --suite=base --setup=vllm
|
||||||
|
|
||||||
|
# Override a default from setup
|
||||||
|
pytest -s -v tests/integration --stack-config=server:starter \
|
||||||
|
--suite=responses --setup=gpt --embedding-model=text-embedding-3-small
|
||||||
|
```
|
||||||
|
|
||||||
## Examples
|
## Examples
|
||||||
|
|
||||||
### Testing against a Server
|
### Testing against a Server
|
||||||
|
@ -98,29 +127,24 @@ pytest -s -v tests/integration/vector_io/ \
|
||||||
|
|
||||||
The testing system supports three modes controlled by environment variables:
|
The testing system supports three modes controlled by environment variables:
|
||||||
|
|
||||||
### LIVE Mode (Default)
|
### REPLAY Mode (Default)
|
||||||
Tests make real API calls:
|
Uses cached responses instead of making API calls:
|
||||||
```bash
|
```bash
|
||||||
LLAMA_STACK_TEST_INFERENCE_MODE=live pytest tests/integration/
|
pytest tests/integration/
|
||||||
```
|
```
|
||||||
|
|
||||||
### RECORD Mode
|
### RECORD Mode
|
||||||
Captures API interactions for later replay:
|
Captures API interactions for later replay:
|
||||||
```bash
|
```bash
|
||||||
LLAMA_STACK_TEST_INFERENCE_MODE=record \
|
pytest tests/integration/inference/test_new_feature.py --inference-mode=record
|
||||||
LLAMA_STACK_TEST_RECORDING_DIR=tests/integration/recordings \
|
|
||||||
pytest tests/integration/inference/test_new_feature.py
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### REPLAY Mode
|
### LIVE Mode
|
||||||
Uses cached responses instead of making API calls:
|
Tests make real API calls (but not recorded):
|
||||||
```bash
|
```bash
|
||||||
LLAMA_STACK_TEST_INFERENCE_MODE=replay \
|
pytest tests/integration/ --inference-mode=live
|
||||||
LLAMA_STACK_TEST_RECORDING_DIR=tests/integration/recordings \
|
|
||||||
pytest tests/integration/
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Note that right now you must specify the recording directory. This is because different tests use different recording directories and we don't (yet) have a fool-proof way to map a test to a recording directory. We are working on this.
|
By default, the recording directory is `tests/integration/recordings`. You can override this by setting the `LLAMA_STACK_TEST_RECORDING_DIR` environment variable.
|
||||||
|
|
||||||
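For example, to replay against recordings kept somewhere other than the default location (the directory below is illustrative):

```bash
# Point the replay machinery at a custom recordings directory
LLAMA_STACK_TEST_RECORDING_DIR=/tmp/my-recordings \
  pytest tests/integration/
```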
## Managing Recordings
|
## Managing Recordings
|
||||||
|
|
||||||
|
@ -138,16 +162,14 @@ cat recordings/responses/abc123.json | jq '.'
|
||||||
#### Remote Re-recording (Recommended)
|
#### Remote Re-recording (Recommended)
|
||||||
Use the automated workflow script for easier re-recording:
|
Use the automated workflow script for easier re-recording:
|
||||||
```bash
|
```bash
|
||||||
./scripts/github/schedule-record-workflow.sh --test-subdirs "inference,agents"
|
./scripts/github/schedule-record-workflow.sh --subdirs "inference,agents"
|
||||||
```
|
```
|
||||||
See the [main testing guide](../README.md#remote-re-recording-recommended) for full details.
|
See the [main testing guide](../README.md#remote-re-recording-recommended) for full details.
|
||||||
|
|
||||||
#### Local Re-recording
|
#### Local Re-recording
|
||||||
```bash
|
```bash
|
||||||
# Re-record specific tests
|
# Re-record specific tests
|
||||||
LLAMA_STACK_TEST_INFERENCE_MODE=record \
|
pytest -s -v --stack-config=server:starter tests/integration/inference/test_modified.py --inference-mode=record
|
||||||
LLAMA_STACK_TEST_RECORDING_DIR=tests/integration/recordings \
|
|
||||||
pytest -s -v --stack-config=server:starter tests/integration/inference/test_modified.py
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Note that when re-recording tests, you must use a Stack pointing to a server (i.e., `server:starter`). This subtlety exists because the set of tests run against a server is a superset of the tests run with the library client.
|
Note that when re-recording tests, you must use a Stack pointing to a server (i.e., `server:starter`). This subtlety exists because the set of tests run against a server is a superset of the tests run with the library client.
|
||||||
|
|
|
@ -268,3 +268,58 @@ class TestBatchesIntegration:
|
||||||
|
|
||||||
deleted_error_file = openai_client.files.delete(final_batch.error_file_id)
|
deleted_error_file = openai_client.files.delete(final_batch.error_file_id)
|
||||||
assert deleted_error_file.deleted, f"Error file {final_batch.error_file_id} was not deleted successfully"
|
assert deleted_error_file.deleted, f"Error file {final_batch.error_file_id} was not deleted successfully"
|
||||||
|
|
||||||
|
def test_batch_e2e_completions(self, openai_client, batch_helper, text_model_id):
|
||||||
|
"""Run an end-to-end batch with a single successful text completion request."""
|
||||||
|
request_body = {"model": text_model_id, "prompt": "Say completions", "max_tokens": 20}
|
||||||
|
|
||||||
|
batch_requests = [
|
||||||
|
{
|
||||||
|
"custom_id": "success-1",
|
||||||
|
"method": "POST",
|
||||||
|
"url": "/v1/completions",
|
||||||
|
"body": request_body,
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
with batch_helper.create_file(batch_requests) as uploaded_file:
|
||||||
|
batch = openai_client.batches.create(
|
||||||
|
input_file_id=uploaded_file.id,
|
||||||
|
endpoint="/v1/completions",
|
||||||
|
completion_window="24h",
|
||||||
|
metadata={"test": "e2e_completions_success"},
|
||||||
|
)
|
||||||
|
|
||||||
|
final_batch = batch_helper.wait_for(
|
||||||
|
batch.id,
|
||||||
|
max_wait_time=3 * 60,
|
||||||
|
expected_statuses={"completed"},
|
||||||
|
timeout_action="skip",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert final_batch.status == "completed"
|
||||||
|
assert final_batch.request_counts is not None
|
||||||
|
assert final_batch.request_counts.total == 1
|
||||||
|
assert final_batch.request_counts.completed == 1
|
||||||
|
assert final_batch.output_file_id is not None
|
||||||
|
|
||||||
|
output_content = openai_client.files.content(final_batch.output_file_id)
|
||||||
|
if isinstance(output_content, str):
|
||||||
|
output_text = output_content
|
||||||
|
else:
|
||||||
|
output_text = output_content.content.decode("utf-8")
|
||||||
|
|
||||||
|
output_lines = output_text.strip().split("\n")
|
||||||
|
assert len(output_lines) == 1
|
||||||
|
|
||||||
|
result = json.loads(output_lines[0])
|
||||||
|
assert result["custom_id"] == "success-1"
|
||||||
|
assert "response" in result
|
||||||
|
assert result["response"]["status_code"] == 200
|
||||||
|
|
||||||
|
deleted_output_file = openai_client.files.delete(final_batch.output_file_id)
|
||||||
|
assert deleted_output_file.deleted
|
||||||
|
|
||||||
|
if final_batch.error_file_id is not None:
|
||||||
|
deleted_error_file = openai_client.files.delete(final_batch.error_file_id)
|
||||||
|
assert deleted_error_file.deleted
|
||||||
|
|
|
@ -6,15 +6,17 @@
|
||||||
import inspect
|
import inspect
|
||||||
import itertools
|
import itertools
|
||||||
import os
|
import os
|
||||||
import platform
|
|
||||||
import textwrap
|
import textwrap
|
||||||
import time
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
from llama_stack.log import get_logger
|
from llama_stack.log import get_logger
|
||||||
|
|
||||||
|
from .suites import SETUP_DEFINITIONS, SUITE_DEFINITIONS
|
||||||
|
|
||||||
logger = get_logger(__name__, category="tests")
|
logger = get_logger(__name__, category="tests")
|
||||||
|
|
||||||
|
|
||||||
|
@ -30,6 +32,8 @@ def pytest_runtest_makereport(item, call):
|
||||||
def pytest_sessionstart(session):
|
def pytest_sessionstart(session):
|
||||||
# stop macOS from complaining about duplicate OpenMP libraries
|
# stop macOS from complaining about duplicate OpenMP libraries
|
||||||
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
|
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
|
||||||
|
if "LLAMA_STACK_TEST_INFERENCE_MODE" not in os.environ:
|
||||||
|
os.environ["LLAMA_STACK_TEST_INFERENCE_MODE"] = "replay"
|
||||||
|
|
||||||
|
|
||||||
def pytest_runtest_teardown(item):
|
def pytest_runtest_teardown(item):
|
||||||
|
@ -59,9 +63,36 @@ def pytest_configure(config):
|
||||||
key, value = env_var.split("=", 1)
|
key, value = env_var.split("=", 1)
|
||||||
os.environ[key] = value
|
os.environ[key] = value
|
||||||
|
|
||||||
if platform.system() == "Darwin": # Darwin is the system name for macOS
|
inference_mode = config.getoption("--inference-mode")
|
||||||
os.environ["DISABLE_CODE_SANDBOX"] = "1"
|
os.environ["LLAMA_STACK_TEST_INFERENCE_MODE"] = inference_mode
|
||||||
logger.info("Setting DISABLE_CODE_SANDBOX=1 for macOS")
|
|
||||||
|
suite = config.getoption("--suite")
|
||||||
|
if suite:
|
||||||
|
if suite not in SUITE_DEFINITIONS:
|
||||||
|
raise pytest.UsageError(f"Unknown suite: {suite}. Available: {', '.join(sorted(SUITE_DEFINITIONS.keys()))}")
|
||||||
|
|
||||||
|
# Apply setups (global parameterizations): env + defaults
|
||||||
|
setup = config.getoption("--setup")
|
||||||
|
if suite and not setup:
|
||||||
|
setup = SUITE_DEFINITIONS[suite].default_setup
|
||||||
|
|
||||||
|
if setup:
|
||||||
|
if setup not in SETUP_DEFINITIONS:
|
||||||
|
raise pytest.UsageError(
|
||||||
|
f"Unknown setup '{setup}'. Available: {', '.join(sorted(SETUP_DEFINITIONS.keys()))}"
|
||||||
|
)
|
||||||
|
|
||||||
|
setup_obj = SETUP_DEFINITIONS[setup]
|
||||||
|
logger.info(f"Applying setup '{setup}'{' for suite ' + suite if suite else ''}")
|
||||||
|
# Apply env first
|
||||||
|
for k, v in setup_obj.env.items():
|
||||||
|
if k not in os.environ:
|
||||||
|
os.environ[k] = str(v)
|
||||||
|
# Apply defaults if not provided explicitly
|
||||||
|
for dest, value in setup_obj.defaults.items():
|
||||||
|
current = getattr(config.option, dest, None)
|
||||||
|
if not current:
|
||||||
|
setattr(config.option, dest, value)
|
||||||
|
|
||||||
|
|
||||||
def pytest_addoption(parser):
|
def pytest_addoption(parser):
|
||||||
|
@ -103,16 +134,32 @@ def pytest_addoption(parser):
|
||||||
default=384,
|
default=384,
|
||||||
help="Output dimensionality of the embedding model to use for testing. Default: 384",
|
help="Output dimensionality of the embedding model to use for testing. Default: 384",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.addoption(
|
parser.addoption(
|
||||||
"--record-responses",
|
"--inference-mode",
|
||||||
action="store_true",
|
help="Inference mode: { record, replay, live } (default: replay)",
|
||||||
help="Record new API responses instead of using cached ones.",
|
choices=["record", "replay", "live"],
|
||||||
|
default="replay",
|
||||||
)
|
)
|
||||||
parser.addoption(
|
parser.addoption(
|
||||||
"--report",
|
"--report",
|
||||||
help="Path where the test report should be written, e.g. --report=/path/to/report.md",
|
help="Path where the test report should be written, e.g. --report=/path/to/report.md",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
available_suites = ", ".join(sorted(SUITE_DEFINITIONS.keys()))
|
||||||
|
suite_help = (
|
||||||
|
f"Single test suite to run (narrows collection). Available: {available_suites}. Example: --suite=responses"
|
||||||
|
)
|
||||||
|
parser.addoption("--suite", help=suite_help)
|
||||||
|
|
||||||
|
# Global setups for any suite
|
||||||
|
available_setups = ", ".join(sorted(SETUP_DEFINITIONS.keys()))
|
||||||
|
setup_help = (
|
||||||
|
f"Global test setup configuration. Available: {available_setups}. "
|
||||||
|
"Can be used with any suite. Example: --setup=ollama"
|
||||||
|
)
|
||||||
|
parser.addoption("--setup", help=setup_help)
|
||||||
|
|
||||||
|
|
||||||
MODEL_SHORT_IDS = {
|
MODEL_SHORT_IDS = {
|
||||||
"meta-llama/Llama-3.2-3B-Instruct": "3B",
|
"meta-llama/Llama-3.2-3B-Instruct": "3B",
|
||||||
|
@ -195,3 +242,36 @@ def pytest_generate_tests(metafunc):
|
||||||
|
|
||||||
|
|
||||||
pytest_plugins = ["tests.integration.fixtures.common"]
|
pytest_plugins = ["tests.integration.fixtures.common"]
|
||||||
|
|
||||||
|
|
||||||
|
def pytest_ignore_collect(path: str, config: pytest.Config) -> bool:
|
||||||
|
"""Skip collecting paths outside the selected suite roots for speed."""
|
||||||
|
suite = config.getoption("--suite")
|
||||||
|
if not suite:
|
||||||
|
return False
|
||||||
|
|
||||||
|
sobj = SUITE_DEFINITIONS.get(suite)
|
||||||
|
roots: list[str] = sobj.get("roots", []) if isinstance(sobj, dict) else getattr(sobj, "roots", [])
|
||||||
|
if not roots:
|
||||||
|
return False
|
||||||
|
|
||||||
|
p = Path(str(path)).resolve()
|
||||||
|
|
||||||
|
# Only constrain within tests/integration to avoid ignoring unrelated tests
|
||||||
|
integration_root = (Path(str(config.rootpath)) / "tests" / "integration").resolve()
|
||||||
|
if not p.is_relative_to(integration_root):
|
||||||
|
return False
|
||||||
|
|
||||||
|
for r in roots:
|
||||||
|
rp = (Path(str(config.rootpath)) / r).resolve()
|
||||||
|
if rp.is_file():
|
||||||
|
# Allow the exact file and any ancestor directories so pytest can walk into it.
|
||||||
|
if p == rp:
|
||||||
|
return False
|
||||||
|
if p.is_dir() and rp.is_relative_to(p):
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
# Allow anything inside an allowed directory
|
||||||
|
if p.is_relative_to(rp):
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
|
@ -5,6 +5,8 @@
|
||||||
# the root directory of this source tree.
|
# the root directory of this source tree.
|
||||||
|
|
||||||
|
|
||||||
|
import time
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from ..test_cases.test_case import TestCase
|
from ..test_cases.test_case import TestCase
|
||||||
|
@ -35,6 +37,11 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id)
|
||||||
"remote::sambanova",
|
"remote::sambanova",
|
||||||
"remote::tgi",
|
"remote::tgi",
|
||||||
"remote::vertexai",
|
"remote::vertexai",
|
||||||
|
# {"error":{"message":"Unknown request URL: GET /openai/v1/completions. Please check the URL for typos,
|
||||||
|
# or see the docs at https://console.groq.com/docs/","type":"invalid_request_error","code":"unknown_url"}}
|
||||||
|
"remote::groq",
|
||||||
|
"remote::gemini", # https://generativelanguage.googleapis.com/v1beta/openai/completions -> 404
|
||||||
|
"remote::anthropic", # at least claude-3-{5,7}-{haiku,sonnet}-* / claude-{sonnet,opus}-4-* are not supported
|
||||||
):
|
):
|
||||||
pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.")
|
pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.")
|
||||||
|
|
||||||
|
@ -56,6 +63,26 @@ def skip_if_model_doesnt_support_suffix(client_with_models, model_id):
|
||||||
pytest.skip(f"Provider {provider.provider_type} doesn't support suffix.")
|
pytest.skip(f"Provider {provider.provider_type} doesn't support suffix.")
|
||||||
|
|
||||||
|
|
||||||
|
def skip_if_doesnt_support_n(client_with_models, model_id):
|
||||||
|
provider = provider_from_model(client_with_models, model_id)
|
||||||
|
if provider.provider_type in (
|
||||||
|
"remote::sambanova",
|
||||||
|
"remote::ollama",
|
||||||
|
# https://console.groq.com/docs/openai#currently-unsupported-openai-features
|
||||||
|
# -> Error code: 400 - {'error': {'message': "'n' : number must be at most 1", 'type': 'invalid_request_error'}}
|
||||||
|
"remote::groq",
|
||||||
|
# Error code: 400 - [{'error': {'code': 400, 'message': 'Only one candidate can be specified in the
|
||||||
|
# current model', 'status': 'INVALID_ARGUMENT'}}]
|
||||||
|
"remote::gemini",
|
||||||
|
# https://docs.anthropic.com/en/api/openai-sdk#simple-fields
|
||||||
|
"remote::anthropic",
|
||||||
|
"remote::vertexai",
|
||||||
|
# Error code: 400 - [{'error': {'code': 400, 'message': 'Unable to submit request because candidateCount must be 1 but
|
||||||
|
# the entered value was 2. Update the candidateCount value and try again.', 'status': 'INVALID_ARGUMENT'}
|
||||||
|
):
|
||||||
|
pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support n param.")
|
||||||
|
|
||||||
|
|
||||||
def skip_if_model_doesnt_support_openai_chat_completion(client_with_models, model_id):
|
def skip_if_model_doesnt_support_openai_chat_completion(client_with_models, model_id):
|
||||||
provider = provider_from_model(client_with_models, model_id)
|
provider = provider_from_model(client_with_models, model_id)
|
||||||
if provider.provider_type in (
|
if provider.provider_type in (
|
||||||
|
@ -260,10 +287,7 @@ def test_openai_chat_completion_streaming(compat_client, client_with_models, tex
|
||||||
)
|
)
|
||||||
def test_openai_chat_completion_streaming_with_n(compat_client, client_with_models, text_model_id, test_case):
|
def test_openai_chat_completion_streaming_with_n(compat_client, client_with_models, text_model_id, test_case):
|
||||||
skip_if_model_doesnt_support_openai_chat_completion(client_with_models, text_model_id)
|
skip_if_model_doesnt_support_openai_chat_completion(client_with_models, text_model_id)
|
||||||
|
skip_if_doesnt_support_n(client_with_models, text_model_id)
|
||||||
provider = provider_from_model(client_with_models, text_model_id)
|
|
||||||
if provider.provider_type == "remote::ollama":
|
|
||||||
pytest.skip(f"Model {text_model_id} hosted by {provider.provider_type} doesn't support n > 1.")
|
|
||||||
|
|
||||||
tc = TestCase(test_case)
|
tc = TestCase(test_case)
|
||||||
question = tc["question"]
|
question = tc["question"]
|
||||||
|
@ -323,8 +347,15 @@ def test_inference_store(compat_client, client_with_models, text_model_id, strea
|
||||||
response_id = response.id
|
response_id = response.id
|
||||||
content = response.choices[0].message.content
|
content = response.choices[0].message.content
|
||||||
|
|
||||||
|
tries = 0
|
||||||
|
while tries < 10:
|
||||||
responses = client.chat.completions.list(limit=1000)
|
responses = client.chat.completions.list(limit=1000)
|
||||||
assert response_id in [r.id for r in responses.data]
|
if response_id in [r.id for r in responses.data]:
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
tries += 1
|
||||||
|
time.sleep(0.1)
|
||||||
|
assert tries < 10, f"Response {response_id} not found after 1 second"
|
||||||
|
|
||||||
retrieved_response = client.chat.completions.retrieve(response_id)
|
retrieved_response = client.chat.completions.retrieve(response_id)
|
||||||
assert retrieved_response.id == response_id
|
assert retrieved_response.id == response_id
|
||||||
|
@ -388,6 +419,18 @@ def test_inference_store_tool_calls(compat_client, client_with_models, text_mode
|
||||||
response_id = response.id
|
response_id = response.id
|
||||||
content = response.choices[0].message.content
|
content = response.choices[0].message.content
|
||||||
|
|
||||||
|
# wait for the response to be stored
|
||||||
|
tries = 0
|
||||||
|
while tries < 10:
|
||||||
|
responses = client.chat.completions.list(limit=1000)
|
||||||
|
if response_id in [r.id for r in responses.data]:
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
tries += 1
|
||||||
|
time.sleep(0.1)
|
||||||
|
|
||||||
|
assert tries < 10, f"Response {response_id} not found after 1 second"
|
||||||
|
|
||||||
responses = client.chat.completions.list(limit=1000)
|
responses = client.chat.completions.list(limit=1000)
|
||||||
assert response_id in [r.id for r in responses.data]
|
assert response_id in [r.id for r in responses.data]
|
||||||
|
|
||||||
|
|
|
@ -20,15 +20,15 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama-guard3:1b",
|
"model": "llama-guard3:1b",
|
||||||
"created_at": "2025-08-01T23:12:53.860911Z",
|
"created_at": "2025-09-03T17:37:35.23084Z",
|
||||||
"done": true,
|
"done": true,
|
||||||
"done_reason": "stop",
|
"done_reason": "stop",
|
||||||
"total_duration": 249137667,
|
"total_duration": 195981375,
|
||||||
"load_duration": 152509542,
|
"load_duration": 110522917,
|
||||||
"prompt_eval_count": 216,
|
"prompt_eval_count": 216,
|
||||||
"prompt_eval_duration": 71000000,
|
"prompt_eval_duration": 72393958,
|
||||||
"eval_count": 2,
|
"eval_count": 2,
|
||||||
"eval_duration": 24000000,
|
"eval_duration": 11843000,
|
||||||
"response": "safe",
|
"response": "safe",
|
||||||
"thinking": null,
|
"thinking": null,
|
||||||
"context": null
|
"context": null
|
||||||
|
|
|
@ -21,7 +21,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-07-31T17:59:18.033900164Z",
|
"created_at": "2025-09-03T17:41:43.950283Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -39,7 +39,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-07-31T17:59:18.213371151Z",
|
"created_at": "2025-09-03T17:41:43.991122Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -57,7 +57,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-07-31T17:59:18.387513976Z",
|
"created_at": "2025-09-03T17:41:44.031378Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -75,7 +75,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-07-31T17:59:18.564344287Z",
|
"created_at": "2025-09-03T17:41:44.073098Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -93,7 +93,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-07-31T17:59:18.746579415Z",
|
"created_at": "2025-09-03T17:41:44.115961Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -111,7 +111,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-07-31T17:59:18.923276047Z",
|
"created_at": "2025-09-03T17:41:44.156517Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -129,7 +129,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-07-31T17:59:19.099961963Z",
|
"created_at": "2025-09-03T17:41:44.197079Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -147,7 +147,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-07-31T17:59:19.275621884Z",
|
"created_at": "2025-09-03T17:41:44.237565Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -165,7 +165,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-07-31T17:59:19.452204196Z",
|
"created_at": "2025-09-03T17:41:44.277755Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -183,7 +183,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-07-31T17:59:19.626937514Z",
|
"created_at": "2025-09-03T17:41:44.318476Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -201,7 +201,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-07-31T17:59:19.805566767Z",
|
"created_at": "2025-09-03T17:41:44.358628Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -219,7 +219,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-07-31T17:59:19.985987477Z",
|
"created_at": "2025-09-03T17:41:44.398984Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -237,7 +237,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-07-31T17:59:20.166458601Z",
|
"created_at": "2025-09-03T17:41:44.439232Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -255,7 +255,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-07-31T17:59:20.343346795Z",
|
"created_at": "2025-09-03T17:41:44.479478Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -273,7 +273,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-07-31T17:59:20.525008091Z",
|
"created_at": "2025-09-03T17:41:44.520202Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -291,7 +291,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-07-31T17:59:20.709087695Z",
|
"created_at": "2025-09-03T17:41:44.560517Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -309,7 +309,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-07-31T17:59:20.887074305Z",
|
"created_at": "2025-09-03T17:41:44.601592Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -327,15 +327,15 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-07-31T17:59:21.065244925Z",
|
"created_at": "2025-09-03T17:41:44.642064Z",
|
||||||
"done": true,
|
"done": true,
|
||||||
"done_reason": "stop",
|
"done_reason": "stop",
|
||||||
"total_duration": 4373531496,
|
"total_duration": 887142667,
|
||||||
"load_duration": 44438132,
|
"load_duration": 119331417,
|
||||||
"prompt_eval_count": 56,
|
"prompt_eval_count": 56,
|
||||||
"prompt_eval_duration": 1296273199,
|
"prompt_eval_duration": 74294709,
|
||||||
"eval_count": 18,
|
"eval_count": 18,
|
||||||
"eval_duration": 3032321735,
|
"eval_duration": 692842791,
|
||||||
"response": "",
|
"response": "",
|
||||||
"thinking": null,
|
"thinking": null,
|
||||||
"context": null
|
"context": null
|
||||||
|
|
|
@ -20,15 +20,15 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama-guard3:1b",
|
"model": "llama-guard3:1b",
|
||||||
"created_at": "2025-08-01T23:13:57.556416Z",
|
"created_at": "2025-09-03T17:37:47.461886Z",
|
||||||
"done": true,
|
"done": true,
|
||||||
"done_reason": "stop",
|
"done_reason": "stop",
|
||||||
"total_duration": 432363250,
|
"total_duration": 338927833,
|
||||||
"load_duration": 159296417,
|
"load_duration": 100895125,
|
||||||
"prompt_eval_count": 223,
|
"prompt_eval_count": 223,
|
||||||
"prompt_eval_duration": 257000000,
|
"prompt_eval_duration": 221583042,
|
||||||
"eval_count": 2,
|
"eval_count": 2,
|
||||||
"eval_duration": 14000000,
|
"eval_duration": 12341416,
|
||||||
"response": "safe",
|
"response": "safe",
|
||||||
"thinking": null,
|
"thinking": null,
|
||||||
"context": null
|
"context": null
|
||||||
|
|
|
@ -24,7 +24,7 @@
|
||||||
{
|
{
|
||||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"id": "chatcmpl-29",
|
"id": "chatcmpl-414",
|
||||||
"choices": [
|
"choices": [
|
||||||
{
|
{
|
||||||
"delta": {
|
"delta": {
|
||||||
|
@ -39,7 +39,7 @@
|
||||||
"logprobs": null
|
"logprobs": null
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"created": 1754090031,
|
"created": 1756921333,
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"object": "chat.completion.chunk",
|
"object": "chat.completion.chunk",
|
||||||
"service_tier": null,
|
"service_tier": null,
|
||||||
|
@ -50,7 +50,7 @@
|
||||||
{
|
{
|
||||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"id": "chatcmpl-29",
|
"id": "chatcmpl-414",
|
||||||
"choices": [
|
"choices": [
|
||||||
{
|
{
|
||||||
"delta": {
|
"delta": {
|
||||||
|
@ -65,7 +65,7 @@
|
||||||
"logprobs": null
|
"logprobs": null
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"created": 1754090031,
|
"created": 1756921333,
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"object": "chat.completion.chunk",
|
"object": "chat.completion.chunk",
|
||||||
"service_tier": null,
|
"service_tier": null,
|
||||||
|
@ -76,7 +76,7 @@
|
||||||
{
|
{
|
||||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"id": "chatcmpl-29",
|
"id": "chatcmpl-414",
|
||||||
"choices": [
|
"choices": [
|
||||||
{
|
{
|
||||||
"delta": {
|
"delta": {
|
||||||
|
@ -91,7 +91,7 @@
|
||||||
"logprobs": null
|
"logprobs": null
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"created": 1754090031,
|
"created": 1756921333,
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"object": "chat.completion.chunk",
|
"object": "chat.completion.chunk",
|
||||||
"service_tier": null,
|
"service_tier": null,
|
||||||
|
@ -102,7 +102,7 @@
|
||||||
{
|
{
|
||||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"id": "chatcmpl-29",
|
"id": "chatcmpl-414",
|
||||||
"choices": [
|
"choices": [
|
||||||
{
|
{
|
||||||
"delta": {
|
"delta": {
|
||||||
|
@ -117,7 +117,7 @@
|
||||||
"logprobs": null
|
"logprobs": null
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"created": 1754090031,
|
"created": 1756921333,
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"object": "chat.completion.chunk",
|
"object": "chat.completion.chunk",
|
||||||
"service_tier": null,
|
"service_tier": null,
|
||||||
|
@ -128,7 +128,7 @@
|
||||||
{
|
{
|
||||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"id": "chatcmpl-29",
|
"id": "chatcmpl-414",
|
||||||
"choices": [
|
"choices": [
|
||||||
{
|
{
|
||||||
"delta": {
|
"delta": {
|
||||||
|
@ -143,7 +143,7 @@
|
||||||
"logprobs": null
|
"logprobs": null
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"created": 1754090031,
|
"created": 1756921334,
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"object": "chat.completion.chunk",
|
"object": "chat.completion.chunk",
|
||||||
"service_tier": null,
|
"service_tier": null,
|
||||||
|
@ -154,7 +154,7 @@
|
||||||
{
|
{
|
||||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"id": "chatcmpl-29",
|
"id": "chatcmpl-414",
|
||||||
"choices": [
|
"choices": [
|
||||||
{
|
{
|
||||||
"delta": {
|
"delta": {
|
||||||
|
@ -169,7 +169,7 @@
|
||||||
"logprobs": null
|
"logprobs": null
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"created": 1754090031,
|
"created": 1756921334,
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"object": "chat.completion.chunk",
|
"object": "chat.completion.chunk",
|
||||||
"service_tier": null,
|
"service_tier": null,
|
||||||
|
@ -180,7 +180,7 @@
|
||||||
{
|
{
|
||||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"id": "chatcmpl-29",
|
"id": "chatcmpl-414",
|
||||||
"choices": [
|
"choices": [
|
||||||
{
|
{
|
||||||
"delta": {
|
"delta": {
|
||||||
|
@ -195,7 +195,7 @@
|
||||||
"logprobs": null
|
"logprobs": null
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"created": 1754090031,
|
"created": 1756921334,
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"object": "chat.completion.chunk",
|
"object": "chat.completion.chunk",
|
||||||
"service_tier": null,
|
"service_tier": null,
|
||||||
|
@ -206,7 +206,7 @@
|
||||||
{
|
{
|
||||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"id": "chatcmpl-29",
|
"id": "chatcmpl-414",
|
||||||
"choices": [
|
"choices": [
|
||||||
{
|
{
|
||||||
"delta": {
|
"delta": {
|
||||||
|
@ -221,7 +221,7 @@
|
||||||
"logprobs": null
|
"logprobs": null
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"created": 1754090031,
|
"created": 1756921334,
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"object": "chat.completion.chunk",
|
"object": "chat.completion.chunk",
|
||||||
"service_tier": null,
|
"service_tier": null,
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
{
|
{
|
||||||
"request": {
|
"request": {
|
||||||
"method": "POST",
|
"method": "POST",
|
||||||
"url": "http://localhost:11434/v1/v1/chat/completions",
|
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
|
||||||
"headers": {},
|
"headers": {},
|
||||||
"body": {
|
"body": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
|
@ -20,14 +20,14 @@
|
||||||
"body": {
|
"body": {
|
||||||
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
|
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"id": "chatcmpl-368",
|
"id": "chatcmpl-161",
|
||||||
"choices": [
|
"choices": [
|
||||||
{
|
{
|
||||||
"finish_reason": "stop",
|
"finish_reason": "stop",
|
||||||
"index": 0,
|
"index": 0,
|
||||||
"logprobs": null,
|
"logprobs": null,
|
||||||
"message": {
|
"message": {
|
||||||
"content": "Saturn is known for its extensive ring system.",
|
"content": "The answer is Saturn.",
|
||||||
"refusal": null,
|
"refusal": null,
|
||||||
"role": "assistant",
|
"role": "assistant",
|
||||||
"annotations": null,
|
"annotations": null,
|
||||||
|
@ -37,15 +37,15 @@
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"created": 1754081853,
|
"created": 1756921364,
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"object": "chat.completion",
|
"object": "chat.completion",
|
||||||
"service_tier": null,
|
"service_tier": null,
|
||||||
"system_fingerprint": "fp_ollama",
|
"system_fingerprint": "fp_ollama",
|
||||||
"usage": {
|
"usage": {
|
||||||
"completion_tokens": 11,
|
"completion_tokens": 6,
|
||||||
"prompt_tokens": 39,
|
"prompt_tokens": 39,
|
||||||
"total_tokens": 50,
|
"total_tokens": 45,
|
||||||
"completion_tokens_details": null,
|
"completion_tokens_details": null,
|
||||||
"prompt_tokens_details": null
|
"prompt_tokens_details": null
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,15 +20,15 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama-guard3:1b",
|
"model": "llama-guard3:1b",
|
||||||
"created_at": "2025-08-01T23:12:51.682357Z",
|
"created_at": "2025-09-03T17:37:33.473237Z",
|
||||||
"done": true,
|
"done": true,
|
||||||
"done_reason": "stop",
|
"done_reason": "stop",
|
||||||
"total_duration": 238161000,
|
"total_duration": 279025042,
|
||||||
"load_duration": 72494750,
|
"load_duration": 162673250,
|
||||||
"prompt_eval_count": 212,
|
"prompt_eval_count": 212,
|
||||||
"prompt_eval_duration": 87000000,
|
"prompt_eval_duration": 73595834,
|
||||||
"eval_count": 5,
|
"eval_count": 5,
|
||||||
"eval_duration": 74000000,
|
"eval_duration": 41950291,
|
||||||
"response": "unsafe\nS8",
|
"response": "unsafe\nS8",
|
||||||
"thinking": null,
|
"thinking": null,
|
||||||
"context": null
|
"context": null
|
||||||
|
|