Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-04 20:14:13 +00:00)

Merge branch 'main' into content-extension

This commit is contained in commit 3e11e1472c.

334 changed files with 22841 additions and 8940 deletions
.github/TRIAGERS.md (vendored, 2 changes)

@@ -1,2 +1,2 @@
 # This file documents Triage members in the Llama Stack community
-@bbrowning @franciscojavierarceo @leseb
+@franciscojavierarceo
.github/actions/run-and-record-tests/action.yml (vendored, 22 changes)

@@ -2,9 +2,13 @@ name: 'Run and Record Tests'
 description: 'Run integration tests and handle recording/artifact upload'

 inputs:
-  test-types:
-    description: 'JSON array of test types to run'
+  test-subdirs:
+    description: 'Comma-separated list of test subdirectories to run'
     required: true
+  test-pattern:
+    description: 'Regex pattern to pass to pytest -k'
+    required: false
+    default: ''
   stack-config:
     description: 'Stack configuration to use'
     required: true
@@ -32,12 +36,14 @@ runs:
     - name: Run Integration Tests
       shell: bash
       run: |
-        ./scripts/integration-tests.sh \
+        uv run --no-sync ./scripts/integration-tests.sh \
           --stack-config '${{ inputs.stack-config }}' \
           --provider '${{ inputs.provider }}' \
-          --test-types '${{ inputs.test-types }}' \
+          --test-subdirs '${{ inputs.test-subdirs }}' \
+          --test-pattern '${{ inputs.test-pattern }}' \
           --inference-mode '${{ inputs.inference-mode }}' \
-          ${{ inputs.run-vision-tests == 'true' && '--run-vision-tests' || '' }}
+          ${{ inputs.run-vision-tests == 'true' && '--run-vision-tests' || '' }} \
+          | tee pytest-${{ inputs.inference-mode }}.log


     - name: Commit and push recordings
@@ -57,10 +63,10 @@ runs:
           git commit -m "Recordings update from CI"
         fi

-        git fetch origin ${{ github.event.pull_request.head.ref }}
-        git rebase origin/${{ github.event.pull_request.head.ref }}
+        git fetch origin ${{ github.ref_name }}
+        git rebase origin/${{ github.ref_name }}
         echo "Rebased successfully"
-        git push origin HEAD:${{ github.event.pull_request.head.ref }}
+        git push origin HEAD:${{ github.ref_name }}
         echo "Pushed successfully"
       else
         echo "No recording changes"
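For reference, a local invocation of the updated script might look like the sketch below. This is only an illustration built from the flags visible in the hunk above; the --test-subdirs value and the log file name are example assumptions, not taken from this diff.

  # Illustrative local run mirroring the CI step above (flag values are examples only).
  uv run --no-sync ./scripts/integration-tests.sh \
    --stack-config 'server:ci-tests' \
    --provider 'ollama' \
    --test-subdirs 'inference' \
    --test-pattern '' \
    --inference-mode 'replay' \
    | tee pytest-replay.log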
.github/actions/setup-runner/action.yml (vendored, 9 changes)

@@ -16,14 +16,16 @@ runs:
       uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
       with:
         python-version: ${{ inputs.python-version }}
-        activate-environment: true
         version: 0.7.6

     - name: Install dependencies
       shell: bash
       run: |
+        echo "Updating project dependencies via uv sync"
         uv sync --all-groups
-        uv pip install ollama faiss-cpu
+        echo "Installing ad-hoc dependencies"
+        uv pip install faiss-cpu

         # Install llama-stack-client-python based on the client-version input
         if [ "${{ inputs.client-version }}" = "latest" ]; then
@@ -37,4 +39,5 @@ runs:
           exit 1
         fi

-        uv pip install -e .
+        echo "Installed llama packages"
+        uv pip list | grep llama
@@ -53,7 +53,22 @@ runs:
     - name: Build Llama Stack
       shell: bash
       run: |
-        uv run llama stack build --template ci-tests --image-type venv
+        # Install llama-stack-client-python based on the client-version input
+        if [ "${{ inputs.client-version }}" = "latest" ]; then
+          echo "Installing latest llama-stack-client-python from main branch"
+          export LLAMA_STACK_CLIENT_DIR=git+https://github.com/llamastack/llama-stack-client-python.git@main
+        elif [ "${{ inputs.client-version }}" = "published" ]; then
+          echo "Installing published llama-stack-client-python from PyPI"
+          unset LLAMA_STACK_CLIENT_DIR
+        else
+          echo "Invalid client-version: ${{ inputs.client-version }}"
+          exit 1
+        fi
+
+        echo "Building Llama Stack"
+
+        LLAMA_STACK_DIR=. \
+        uv run --no-sync llama stack build --template ci-tests --image-type venv

     - name: Configure git for commits
       shell: bash
.github/dependabot.yml (vendored, 12 changes)

@@ -9,6 +9,7 @@ updates:
       day: "saturday"
     commit-message:
       prefix: chore(github-deps)
+
   - package-ecosystem: "uv"
     directory: "/"
     schedule:
@@ -19,3 +20,14 @@ updates:
       - python
     commit-message:
       prefix: chore(python-deps)
+
+  - package-ecosystem: npm
+    directory: "/llama_stack/ui"
+    schedule:
+      interval: "weekly"
+      day: "saturday"
+    labels:
+      - type/dependencies
+      - javascript
+    commit-message:
+      prefix: chore(ui-deps)
.github/workflows/README.md (vendored, 1 change)

@@ -18,5 +18,6 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a tabl
 | Close stale issues and PRs | [stale_bot.yml](stale_bot.yml) | Run the Stale Bot action |
 | Test External Providers Installed via Module | [test-external-provider-module.yml](test-external-provider-module.yml) | Test External Provider installation via Python module |
 | Test External API and Providers | [test-external.yml](test-external.yml) | Test the External API and Provider mechanisms |
+| UI Tests | [ui-unit-tests.yml](ui-unit-tests.yml) | Run the UI test suite |
 | Unit Tests | [unit-tests.yml](unit-tests.yml) | Run the unit test suite |
 | Update ReadTheDocs | [update-readthedocs.yml](update-readthedocs.yml) | Update the Llama Stack ReadTheDocs site |
.github/workflows/changelog.yml (vendored, 2 changes)

@@ -17,7 +17,7 @@ jobs:
       pull-requests: write # for peter-evans/create-pull-request to create a PR
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           ref: main
           fetch-depth: 0
.github/workflows/install-script-ci.yml (vendored, 7 changes)

@@ -16,21 +16,22 @@ jobs:
   lint:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
+      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # 5.0.0
      - name: Run ShellCheck on install.sh
        run: shellcheck scripts/install.sh
   smoke-test-on-dev:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Install dependencies
         uses: ./.github/actions/setup-runner

       - name: Build a single provider
         run: |
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --template starter --image-type container --image-name test
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync \
+            llama stack build --template starter --image-type container --image-name test

       - name: Run installer end-to-end
         run: |
.github/workflows/integration-auth-tests.yml (vendored, 5 changes)

@@ -10,6 +10,7 @@ on:
     paths:
       - 'distributions/**'
       - 'llama_stack/**'
+      - '!llama_stack/ui/**'
       - 'tests/integration/**'
       - 'uv.lock'
       - 'pyproject.toml'
@@ -17,7 +18,7 @@ on:
       - '.github/workflows/integration-auth-tests.yml' # This workflow

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
   cancel-in-progress: true

 jobs:
@@ -30,7 +31,7 @@ jobs:

     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
@@ -16,7 +16,7 @@ on:
       - '.github/workflows/integration-sql-store-tests.yml' # This workflow

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
   cancel-in-progress: true

 jobs:
@@ -44,7 +44,7 @@ jobs:

     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
.github/workflows/integration-tests.yml (vendored, 38 changes)

@@ -5,11 +5,12 @@ run-name: Run the integration test suite from tests/integration in replay mode
 on:
   push:
     branches: [ main ]
-  pull_request_target:
+  pull_request:
     branches: [ main ]
     types: [opened, synchronize, reopened]
     paths:
       - 'llama_stack/**'
+      - '!llama_stack/ui/**'
       - 'tests/**'
       - 'uv.lock'
       - 'pyproject.toml'
@@ -31,35 +32,23 @@ on:
       description: 'Test against a specific provider'
       type: string
       default: 'ollama'
+    test-subdirs:
+      description: 'Comma-separated list of test subdirectories to run'
+      type: string
+      default: ''
+    test-pattern:
+      description: 'Regex pattern to pass to pytest -k'
+      type: string
+      default: ''

 concurrency:
   # Skip concurrency for pushes to main - each commit should be tested independently
-  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.event.pull_request.number }}
+  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
   cancel-in-progress: true

 jobs:
-  discover-tests:
-    runs-on: ubuntu-latest
-    outputs:
-      test-types: ${{ steps.generate-test-types.outputs.test-types }}
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-
-      - name: Generate test types
-        id: generate-test-types
-        run: |
-          # Get test directories dynamically, excluding non-test directories
-          # NOTE: we are excluding post_training since the tests take too long
-          TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d |
-            sed 's|tests/integration/||' |
-            grep -Ev "^(__pycache__|fixtures|test_cases|recordings|non_ci|post_training)$" |
-            sort | jq -R -s -c 'split("\n")[:-1]')
-          echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT
-
   run-replay-mode-tests:
-    needs: discover-tests
     runs-on: ubuntu-latest
     name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, vision={4})', matrix.client-type, matrix.provider, matrix.python-version, matrix.client-version, matrix.run-vision-tests) }}
@@ -76,7 +65,7 @@ jobs:

     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Setup test environment
         uses: ./.github/actions/setup-test-environment
@@ -90,7 +79,8 @@ jobs:
       - name: Run tests
         uses: ./.github/actions/run-and-record-tests
         with:
-          test-types: ${{ needs.discover-tests.outputs.test-types }}
+          test-subdirs: ${{ inputs.test-subdirs }}
+          test-pattern: ${{ inputs.test-pattern }}
           stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }}
           provider: ${{ matrix.provider }}
           inference-mode: 'replay'
@@ -9,14 +9,17 @@ on:
     branches: [ main ]
     paths:
       - 'llama_stack/**'
+      - '!llama_stack/ui/**'
       - 'tests/integration/vector_io/**'
       - 'uv.lock'
       - 'pyproject.toml'
       - 'requirements.txt'
       - '.github/workflows/integration-vector-io-tests.yml' # This workflow
+  schedule:
+    - cron: '0 0 * * *' # (test on python 3.13) Daily at 12 AM UTC

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
   cancel-in-progress: true

 jobs:
@@ -25,12 +28,12 @@ jobs:
     strategy:
       matrix:
         vector-io-provider: ["inline::faiss", "inline::sqlite-vec", "inline::milvus", "remote::chromadb", "remote::pgvector", "remote::weaviate", "remote::qdrant"]
-        python-version: ["3.12", "3.13"]
+        python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
       fail-fast: false # we want to run all tests regardless of failure

     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
@@ -141,7 +144,7 @@ jobs:

       - name: Build Llama Stack
         run: |
-          uv run llama stack build --template ci-tests --image-type venv
+          uv run --no-sync llama stack build --template ci-tests --image-type venv

       - name: Check Storage and Memory Available Before Tests
         if: ${{ always() }}
@@ -164,7 +167,8 @@ jobs:
           ENABLE_WEAVIATE: ${{ matrix.vector-io-provider == 'remote::weaviate' && 'true' || '' }}
           WEAVIATE_CLUSTER_URL: ${{ matrix.vector-io-provider == 'remote::weaviate' && 'localhost:8080' || '' }}
         run: |
-          uv run pytest -sv --stack-config="files=inline::localfs,inference=inline::sentence-transformers,vector_io=${{ matrix.vector-io-provider }}" \
+          uv run --no-sync \
+            pytest -sv --stack-config="files=inline::localfs,inference=inline::sentence-transformers,vector_io=${{ matrix.vector-io-provider }}" \
             tests/integration/vector_io \
             --embedding-model inline::sentence-transformers/all-MiniLM-L6-v2
.github/workflows/pre-commit.yml (vendored, 15 changes)

@@ -8,7 +8,7 @@ on:
     branches: [main]

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
   cancel-in-progress: true

 jobs:
@@ -20,7 +20,7 @@ jobs:

     steps:
       - name: Checkout code
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           # For dependabot PRs, we need to checkout with a token that can push changes
           token: ${{ github.actor == 'dependabot[bot]' && secrets.GITHUB_TOKEN || github.token }}
@@ -36,6 +36,17 @@ jobs:
             **/requirements*.txt
             .pre-commit-config.yaml

+      - name: Set up Node.js
+        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
+        with:
+          node-version: '20'
+          cache: 'npm'
+          cache-dependency-path: 'llama_stack/ui/'
+
+      - name: Install npm dependencies
+        run: npm ci
+        working-directory: llama_stack/ui
+
       - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
         continue-on-error: true
         env:
.github/workflows/providers-build.yml (vendored, 20 changes)

@@ -26,7 +26,7 @@ on:
       - 'pyproject.toml'

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
   cancel-in-progress: true

 jobs:
@@ -36,7 +36,7 @@ jobs:
       distros: ${{ steps.set-matrix.outputs.distros }}
     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Generate Distribution List
         id: set-matrix
@@ -55,7 +55,7 @@ jobs:

     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
@@ -79,7 +79,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
@@ -92,7 +92,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
@@ -106,6 +106,10 @@ jobs:
       - name: Inspect the container image entrypoint
         run: |
           IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
+          if [ -z "$IMAGE_ID" ]; then
+            echo "No image found"
+            exit 1
+          fi
           entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
           echo "Entrypoint: $entrypoint"
           if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then
@@ -117,7 +121,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
@@ -140,6 +144,10 @@ jobs:
       - name: Inspect UBI9 image
         run: |
           IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
+          if [ -z "$IMAGE_ID" ]; then
+            echo "No image found"
+            exit 1
+          fi
           entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
           echo "Entrypoint: $entrypoint"
           if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then
.github/workflows/python-build-test.yml (vendored, 6 changes)

@@ -9,6 +9,8 @@ on:
   pull_request:
     branches:
       - main
+    paths-ignore:
+      - 'llama_stack/ui/**'

 jobs:
   build:
@@ -19,10 +21,10 @@ jobs:

     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Install uv
-        uses: astral-sh/setup-uv@e92bafb6253dcd438e0484186d7669ea7a8ca1cc # v6.4.3
+        uses: astral-sh/setup-uv@4959332f0f014c5280e7eac8b70c90cb574c9f9b # v6.6.0
         with:
           python-version: ${{ matrix.python-version }}
           activate-environment: true
.github/workflows/record-integration-tests.yml (vendored, 103 changes)

@@ -1,93 +1,53 @@
+# This workflow should be run manually when needing to re-record tests. This happens when you have
+# - added a new test
+# - or changed an existing test such that a new inference call is made
+# You should make a PR and then run this workflow on that PR branch. The workflow will re-record the
+# tests and commit the recordings to the PR branch.
 name: Integration Tests (Record)

 run-name: Run the integration test suite from tests/integration

 on:
-  pull_request:
-    branches: [ main ]
-    types: [opened, synchronize, labeled]
-    paths:
-      - 'llama_stack/**'
-      - 'tests/**'
-      - 'uv.lock'
-      - 'pyproject.toml'
-      - '.github/workflows/record-integration-tests.yml' # This workflow
-      - '.github/actions/setup-ollama/action.yml'
-      - '.github/actions/setup-test-environment/action.yml'
-      - '.github/actions/run-and-record-tests/action.yml'
   workflow_dispatch:
     inputs:
+      test-subdirs:
+        description: 'Comma-separated list of test subdirectories to run'
+        type: string
+        default: ''
       test-provider:
         description: 'Test against a specific provider'
         type: string
         default: 'ollama'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
+      run-vision-tests:
+        description: 'Whether to run vision tests'
+        type: boolean
+        default: false
+      test-pattern:
+        description: 'Regex pattern to pass to pytest -k'
+        type: string
+        default: ''

 jobs:
-  discover-tests:
-    if: contains(github.event.pull_request.labels.*.name, 're-record-tests') ||
-      contains(github.event.pull_request.labels.*.name, 're-record-vision-tests')
-    runs-on: ubuntu-latest
-    outputs:
-      test-types: ${{ steps.generate-test-types.outputs.test-types }}
-      matrix-modes: ${{ steps.generate-test-types.outputs.matrix-modes }}
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-
-      - name: Generate test types
-        id: generate-test-types
-        run: |
-          # Get test directories dynamically, excluding non-test directories
-          TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" |
-            grep -Ev "^(__pycache__|fixtures|test_cases|recordings|post_training)$" |
-            sort | jq -R -s -c 'split("\n")[:-1]')
-          echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT
-
-          labels=$(gh pr view ${{ github.event.pull_request.number }} --json labels --jq '.labels[].name')
-          echo "labels=$labels"
-
-          modes_array=()
-          if [[ $labels == *"re-record-vision-tests"* ]]; then
-            modes_array+=("vision")
-          fi
-          if [[ $labels == *"re-record-tests"* ]]; then
-            modes_array+=("non-vision")
-          fi
-
-          # Convert to JSON array
-          if [ ${#modes_array[@]} -eq 0 ]; then
-            matrix_modes="[]"
-          else
-            matrix_modes=$(printf '%s\n' "${modes_array[@]}" | jq -R -s -c 'split("\n")[:-1]')
-          fi
-          echo "matrix_modes=$matrix_modes"
-          echo "matrix-modes=$matrix_modes" >> $GITHUB_OUTPUT
-
-        env:
-          GH_TOKEN: ${{ github.token }}
-
   record-tests:
-    needs: discover-tests
     runs-on: ubuntu-latest

     permissions:
       contents: write

-    strategy:
-      fail-fast: false
-      matrix:
-        mode: ${{ fromJSON(needs.discover-tests.outputs.matrix-modes) }}
-
     steps:
+      - name: Echo workflow inputs
+        run: |
+          echo "::group::Workflow Inputs"
+          echo "test-subdirs: ${{ inputs.test-subdirs }}"
+          echo "test-provider: ${{ inputs.test-provider }}"
+          echo "run-vision-tests: ${{ inputs.run-vision-tests }}"
+          echo "test-pattern: ${{ inputs.test-pattern }}"
+          echo "branch: ${{ github.ref_name }}"
+          echo "::endgroup::"
+
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
-          ref: ${{ github.event.pull_request.head.ref }}
           fetch-depth: 0

       - name: Setup test environment
@@ -96,14 +56,15 @@ jobs:
           python-version: "3.12" # Use single Python version for recording
           client-version: "latest"
           provider: ${{ inputs.test-provider || 'ollama' }}
-          run-vision-tests: ${{ matrix.mode == 'vision' && 'true' || 'false' }}
+          run-vision-tests: ${{ inputs.run-vision-tests }}
           inference-mode: 'record'

       - name: Run and record tests
         uses: ./.github/actions/run-and-record-tests
         with:
-          test-types: ${{ needs.discover-tests.outputs.test-types }}
+          test-pattern: ${{ inputs.test-pattern }}
+          test-subdirs: ${{ inputs.test-subdirs }}
           stack-config: 'server:ci-tests' # recording must be done with server since more tests are run
           provider: ${{ inputs.test-provider || 'ollama' }}
           inference-mode: 'record'
-          run-vision-tests: ${{ matrix.mode == 'vision' && 'true' || 'false' }}
+          run-vision-tests: ${{ inputs.run-vision-tests }}
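Since the recording workflow is now driven purely by workflow_dispatch inputs, it can be triggered from a PR branch roughly as sketched below. The gh CLI invocation and the branch and field values are illustrative assumptions, not part of this diff.

  # Hypothetical manual trigger of the recording workflow from a PR branch
  # (the branch name and input values are examples only).
  gh workflow run record-integration-tests.yml \
    --ref my-feature-branch \
    -f test-subdirs='inference' \
    -f test-provider='ollama' \
    -f run-vision-tests=false \
    -f test-pattern=''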
.github/workflows/semantic-pr.yml (vendored, 2 changes)

@@ -22,6 +22,6 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Check PR Title's semantic conformance
-        uses: amannn/action-semantic-pull-request@0723387faaf9b38adef4775cd42cfd5155ed6017 # v5.5.3
+        uses: amannn/action-semantic-pull-request@48f256284bd46cdaab1048c3721360e808335d50 # v6.1.1
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -27,7 +27,7 @@ jobs:
     # container and point 'uv pip install' to the correct path...
     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
.github/workflows/test-external.yml (vendored, 7 changes)

@@ -9,6 +9,7 @@ on:
     branches: [ main ]
     paths:
       - 'llama_stack/**'
+      - '!llama_stack/ui/**'
       - 'tests/integration/**'
       - 'uv.lock'
       - 'pyproject.toml'
@@ -26,7 +27,7 @@ jobs:
     # container and point 'uv pip install' to the correct path...
     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
@@ -43,11 +44,11 @@ jobs:

       - name: Print distro dependencies
         run: |
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external/build.yaml --print-deps-only
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml --print-deps-only

       - name: Build distro from config file
         run: |
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external/build.yaml
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml

       - name: Start Llama Stack server in background
         if: ${{ matrix.image-type }} == 'venv'
.github/workflows/ui-unit-tests.yml (vendored, new file, 55 lines)

@@ -0,0 +1,55 @@
+name: UI Tests
+
+run-name: Run the UI test suite
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+    paths:
+      - 'llama_stack/ui/**'
+      - '.github/workflows/ui-unit-tests.yml' # This workflow
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  ui-tests:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        node-version: [22]
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+
+      - name: Setup Node.js
+        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
+        with:
+          node-version: ${{ matrix.node-version }}
+          cache: 'npm'
+          cache-dependency-path: 'llama_stack/ui/package-lock.json'
+
+      - name: Install dependencies
+        working-directory: llama_stack/ui
+        run: npm ci
+
+      - name: Run linting
+        working-directory: llama_stack/ui
+        run: npm run lint
+
+      - name: Run format check
+        working-directory: llama_stack/ui
+        run: npm run format:check
+
+      - name: Run unit tests
+        working-directory: llama_stack/ui
+        env:
+          CI: true
+
+        run: npm test -- --coverage --watchAll=false --passWithNoTests
.github/workflows/unit-tests.yml (vendored, 5 changes)

@@ -9,6 +9,7 @@ on:
     branches: [ main ]
     paths:
       - 'llama_stack/**'
+      - '!llama_stack/ui/**'
       - 'tests/unit/**'
       - 'uv.lock'
       - 'pyproject.toml'
@@ -17,7 +18,7 @@ on:
   workflow_dispatch:

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
   cancel-in-progress: true

 jobs:
@@ -31,7 +32,7 @@ jobs:
         - "3.13"
     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
.github/workflows/update-readthedocs.yml (vendored, 4 changes)

@@ -27,7 +27,7 @@ on:
       - '.github/workflows/update-readthedocs.yml'

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
   cancel-in-progress: true

 jobs:
@@ -37,7 +37,7 @@ jobs:
       TOKEN: ${{ secrets.READTHEDOCS_TOKEN }}
     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
@@ -2,6 +2,7 @@ exclude: 'build/'

 default_language_version:
   python: python3.12
+  node: "22"

 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
@@ -145,6 +146,32 @@ repos:
         pass_filenames: false
         require_serial: true
         files: ^.github/workflows/.*$
+      - id: ui-linter
+        name: Format & Lint UI
+        entry: bash ./scripts/run-ui-linter.sh
+        language: system
+        files: ^llama_stack/ui/.*\.(ts|tsx)$
+        pass_filenames: false
+        require_serial: true
+
+      - id: check-log-usage
+        name: Ensure 'llama_stack.log' usage for logging
+        entry: bash
+        language: system
+        types: [python]
+        pass_filenames: true
+        args:
+          - -c
+          - |
+            matches=$(grep -EnH '^[^#]*\b(import\s+logging|from\s+logging\b)' "$@" | grep -v -e '#\s*allow-direct-logging' || true)
+            if [ -n "$matches" ]; then
+              # GitHub Actions annotation format
+              while IFS=: read -r file line_num rest; do
+                echo "::error file=$file,line=$line_num::Do not use 'import logging' or 'from logging import' in $file. Use the custom log instead: from llama_stack.log import get_logger; logger = get_logger(). If direct logging is truly needed, add: # allow-direct-logging"
+              done <<< "$matches"
+              exit 1
+            fi
+            exit 0

 ci:
   autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks
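As a rough illustration, the grep used by the new check-log-usage hook can be run by hand against a single file; the sketch below reuses the same pattern shown in the hook above, and the file path is a placeholder, not something from this diff.

  # Manual spot-check mirroring the hook above (path/to/module.py is a placeholder).
  matches=$(grep -EnH '^[^#]*\b(import\s+logging|from\s+logging\b)' path/to/module.py \
    | grep -v -e '#\s*allow-direct-logging' || true)
  [ -n "$matches" ] && echo "$matches"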
@@ -1,8 +1,5 @@
 # Llama Stack

-<a href="https://trendshift.io/repositories/11824" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11824" alt="meta-llama%2Fllama-stack | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
-
------
 [](https://pypi.org/project/llama_stack/)
 [](https://pypi.org/project/llama-stack/)
 [](https://github.com/meta-llama/llama-stack/blob/main/LICENSE)
docs/_static/llama-stack-spec.html (vendored, 281 changes)

@@ -4605,6 +4605,49 @@
           }
         }
       },
+    "/v1/inference/rerank": {
+      "post": {
+        "responses": {
+          "200": {
+            "description": "RerankResponse with indices sorted by relevance score (descending).",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/RerankResponse"
+                }
+              }
+            }
+          },
+          "400": {
+            "$ref": "#/components/responses/BadRequest400"
+          },
+          "429": {
+            "$ref": "#/components/responses/TooManyRequests429"
+          },
+          "500": {
+            "$ref": "#/components/responses/InternalServerError500"
+          },
+          "default": {
+            "$ref": "#/components/responses/DefaultError"
+          }
+        },
+        "tags": [
+          "Inference"
+        ],
+        "description": "Rerank a list of documents based on their relevance to a query.",
+        "parameters": [],
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/RerankRequest"
+              }
+            }
+          },
+          "required": true
+        }
+      }
+    },
     "/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume": {
       "post": {
         "responses": {
@@ -8821,6 +8864,61 @@
       "title": "OpenAIResponseOutputMessageMCPListTools",
       "description": "MCP list tools output message containing available tools from an MCP server."
     },
+    "OpenAIResponseContentPart": {
+      "oneOf": [
+        {
+          "$ref": "#/components/schemas/OpenAIResponseContentPartOutputText"
+        },
+        {
+          "$ref": "#/components/schemas/OpenAIResponseContentPartRefusal"
+        }
+      ],
+      "discriminator": {
+        "propertyName": "type",
+        "mapping": {
+          "output_text": "#/components/schemas/OpenAIResponseContentPartOutputText",
+          "refusal": "#/components/schemas/OpenAIResponseContentPartRefusal"
+        }
+      }
+    },
+    "OpenAIResponseContentPartOutputText": {
+      "type": "object",
+      "properties": {
+        "type": {
+          "type": "string",
+          "const": "output_text",
+          "default": "output_text"
+        },
+        "text": {
+          "type": "string"
+        }
+      },
+      "additionalProperties": false,
+      "required": [
+        "type",
+        "text"
+      ],
+      "title": "OpenAIResponseContentPartOutputText"
+    },
+    "OpenAIResponseContentPartRefusal": {
+      "type": "object",
+      "properties": {
+        "type": {
+          "type": "string",
+          "const": "refusal",
+          "default": "refusal"
+        },
+        "refusal": {
+          "type": "string"
+        }
+      },
+      "additionalProperties": false,
+      "required": [
+        "type",
+        "refusal"
+      ],
+      "title": "OpenAIResponseContentPartRefusal"
+    },
     "OpenAIResponseObjectStream": {
       "oneOf": [
         {
@@ -8877,6 +8975,12 @@
         {
           "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted"
         },
+        {
+          "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseContentPartAdded"
+        },
+        {
+          "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseContentPartDone"
+        },
         {
           "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted"
         }
@@ -8902,6 +9006,8 @@
           "response.mcp_call.in_progress": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress",
           "response.mcp_call.failed": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed",
           "response.mcp_call.completed": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted",
+          "response.content_part.added": "#/components/schemas/OpenAIResponseObjectStreamResponseContentPartAdded",
+          "response.content_part.done": "#/components/schemas/OpenAIResponseObjectStreamResponseContentPartDone",
           "response.completed": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted"
         }
       }
@@ -8928,6 +9034,80 @@
       "title": "OpenAIResponseObjectStreamResponseCompleted",
       "description": "Streaming event indicating a response has been completed."
     },
+    "OpenAIResponseObjectStreamResponseContentPartAdded": {
+      "type": "object",
+      "properties": {
+        "response_id": {
+          "type": "string",
+          "description": "Unique identifier of the response containing this content"
+        },
+        "item_id": {
+          "type": "string",
+          "description": "Unique identifier of the output item containing this content part"
+        },
+        "part": {
+          "$ref": "#/components/schemas/OpenAIResponseContentPart",
+          "description": "The content part that was added"
+        },
+        "sequence_number": {
+          "type": "integer",
+          "description": "Sequential number for ordering streaming events"
+        },
+        "type": {
+          "type": "string",
+          "const": "response.content_part.added",
+          "default": "response.content_part.added",
+          "description": "Event type identifier, always \"response.content_part.added\""
+        }
+      },
+      "additionalProperties": false,
+      "required": [
+        "response_id",
+        "item_id",
+        "part",
+        "sequence_number",
+        "type"
+      ],
+      "title": "OpenAIResponseObjectStreamResponseContentPartAdded",
+      "description": "Streaming event for when a new content part is added to a response item."
+    },
+    "OpenAIResponseObjectStreamResponseContentPartDone": {
+      "type": "object",
+      "properties": {
+        "response_id": {
+          "type": "string",
+          "description": "Unique identifier of the response containing this content"
+        },
+        "item_id": {
+          "type": "string",
+          "description": "Unique identifier of the output item containing this content part"
+        },
+        "part": {
+          "$ref": "#/components/schemas/OpenAIResponseContentPart",
+          "description": "The completed content part"
+        },
+        "sequence_number": {
+          "type": "integer",
+          "description": "Sequential number for ordering streaming events"
+        },
+        "type": {
+          "type": "string",
+          "const": "response.content_part.done",
+          "default": "response.content_part.done",
+          "description": "Event type identifier, always \"response.content_part.done\""
+        }
+      },
+      "additionalProperties": false,
+      "required": [
+        "response_id",
+        "item_id",
+        "part",
+        "sequence_number",
+        "type"
+      ],
+      "title": "OpenAIResponseObjectStreamResponseContentPartDone",
+      "description": "Streaming event for when a content part is completed."
+    },
     "OpenAIResponseObjectStreamResponseCreated": {
       "type": "object",
       "properties": {
@@ -14630,7 +14810,8 @@
     "OpenAIFilePurpose": {
       "type": "string",
       "enum": [
-        "assistants"
+        "assistants",
+        "batch"
       ],
       "title": "OpenAIFilePurpose",
       "description": "Valid purpose values for OpenAI Files API."
@@ -14707,7 +14888,8 @@
         "purpose": {
           "type": "string",
           "enum": [
-            "assistants"
+            "assistants",
+            "batch"
           ],
           "description": "The intended purpose of the file"
         }
@@ -15926,12 +16108,16 @@
         "value": {
           "type": "number",
           "description": "The numeric value of the metric at this timestamp"
+        },
+        "unit": {
+          "type": "string"
         }
       },
       "additionalProperties": false,
       "required": [
         "timestamp",
-        "value"
+        "value",
+        "unit"
       ],
       "title": "MetricDataPoint",
       "description": "A single data point in a metric time series."
@@ -16489,6 +16675,95 @@
       ],
       "title": "RegisterVectorDbRequest"
     },
+    "RerankRequest": {
+      "type": "object",
+      "properties": {
+        "model": {
+          "type": "string",
+          "description": "The identifier of the reranking model to use."
+        },
+        "query": {
+          "oneOf": [
+            {
+              "type": "string"
+            },
+            {
+              "$ref": "#/components/schemas/OpenAIChatCompletionContentPartTextParam"
+            },
+            {
+              "$ref": "#/components/schemas/OpenAIChatCompletionContentPartImageParam"
+            }
+          ],
+          "description": "The search query to rank items against. Can be a string, text content part, or image content part. The input must not exceed the model's max input token length."
+        },
+        "items": {
+          "type": "array",
+          "items": {
+            "oneOf": [
+              {
+                "type": "string"
+              },
+              {
+                "$ref": "#/components/schemas/OpenAIChatCompletionContentPartTextParam"
+              },
+              {
+                "$ref": "#/components/schemas/OpenAIChatCompletionContentPartImageParam"
+              }
+            ]
+          },
+          "description": "List of items to rerank. Each item can be a string, text content part, or image content part. Each input must not exceed the model's max input token length."
+        },
+        "max_num_results": {
+          "type": "integer",
+          "description": "(Optional) Maximum number of results to return. Default: returns all."
+        }
+      },
+      "additionalProperties": false,
+      "required": [
+        "model",
+        "query",
+        "items"
+      ],
+      "title": "RerankRequest"
+    },
+    "RerankData": {
+      "type": "object",
+      "properties": {
+        "index": {
+          "type": "integer",
+          "description": "The original index of the document in the input list"
+        },
+        "relevance_score": {
+          "type": "number",
+          "description": "The relevance score from the model output. Values are inverted when applicable so that higher scores indicate greater relevance."
+        }
+      },
+      "additionalProperties": false,
+      "required": [
+        "index",
+        "relevance_score"
+      ],
+      "title": "RerankData",
+      "description": "A single rerank result from a reranking response."
+    },
+    "RerankResponse": {
+      "type": "object",
+      "properties": {
+        "data": {
+          "type": "array",
+          "items": {
+            "$ref": "#/components/schemas/RerankData"
+          },
+          "description": "List of rerank result objects, sorted by relevance score (descending)"
+        }
+      },
+      "additionalProperties": false,
+      "required": [
+        "data"
+      ],
+      "title": "RerankResponse",
+      "description": "Response from a reranking request."
+    },
     "ResumeAgentTurnRequest": {
       "type": "object",
       "properties": {
217
docs/_static/llama-stack-spec.yaml
vendored
217
docs/_static/llama-stack-spec.yaml
vendored
|
@ -3264,6 +3264,37 @@ paths:
|
||||||
schema:
|
schema:
|
||||||
$ref: '#/components/schemas/QueryTracesRequest'
|
$ref: '#/components/schemas/QueryTracesRequest'
|
||||||
required: true
|
required: true
|
||||||
|
/v1/inference/rerank:
|
||||||
|
post:
|
||||||
|
responses:
|
||||||
|
'200':
|
||||||
|
description: >-
|
||||||
|
RerankResponse with indices sorted by relevance score (descending).
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/RerankResponse'
|
||||||
|
'400':
|
||||||
|
$ref: '#/components/responses/BadRequest400'
|
||||||
|
'429':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/TooManyRequests429
|
||||||
|
'500':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/InternalServerError500
|
||||||
|
default:
|
||||||
|
$ref: '#/components/responses/DefaultError'
|
||||||
|
tags:
|
||||||
|
- Inference
|
||||||
|
description: >-
|
||||||
|
Rerank a list of documents based on their relevance to a query.
|
||||||
|
parameters: []
|
||||||
|
requestBody:
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/RerankRequest'
|
||||||
|
required: true
|
||||||
/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume:
|
/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume:
|
||||||
post:
|
post:
|
||||||
responses:
|
responses:
|
||||||
|
@ -6441,6 +6472,43 @@ components:
|
||||||
title: OpenAIResponseOutputMessageMCPListTools
|
title: OpenAIResponseOutputMessageMCPListTools
|
||||||
description: >-
|
description: >-
|
||||||
MCP list tools output message containing available tools from an MCP server.
|
MCP list tools output message containing available tools from an MCP server.
|
||||||
|
OpenAIResponseContentPart:
|
||||||
|
oneOf:
|
||||||
|
- $ref: '#/components/schemas/OpenAIResponseContentPartOutputText'
|
||||||
|
- $ref: '#/components/schemas/OpenAIResponseContentPartRefusal'
|
||||||
|
discriminator:
|
||||||
|
propertyName: type
|
||||||
|
mapping:
|
||||||
|
output_text: '#/components/schemas/OpenAIResponseContentPartOutputText'
|
||||||
|
refusal: '#/components/schemas/OpenAIResponseContentPartRefusal'
|
||||||
|
OpenAIResponseContentPartOutputText:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
type:
|
||||||
|
type: string
|
||||||
|
const: output_text
|
||||||
|
default: output_text
|
||||||
|
text:
|
||||||
|
type: string
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- type
|
||||||
|
- text
|
||||||
|
title: OpenAIResponseContentPartOutputText
|
||||||
|
OpenAIResponseContentPartRefusal:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
type:
|
||||||
|
type: string
|
||||||
|
const: refusal
|
||||||
|
default: refusal
|
||||||
|
refusal:
|
||||||
|
type: string
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- type
|
||||||
|
- refusal
|
||||||
|
title: OpenAIResponseContentPartRefusal
|
||||||
OpenAIResponseObjectStream:
|
OpenAIResponseObjectStream:
|
||||||
oneOf:
|
oneOf:
|
||||||
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated'
|
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated'
|
||||||
|
@ -6461,6 +6529,8 @@ components:
|
||||||
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress'
|
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress'
|
||||||
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed'
|
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed'
|
||||||
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted'
|
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted'
|
||||||
|
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseContentPartAdded'
|
||||||
|
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseContentPartDone'
|
||||||
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted'
|
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted'
|
||||||
discriminator:
|
discriminator:
|
||||||
propertyName: type
|
propertyName: type
|
||||||
|
@ -6483,6 +6553,8 @@ components:
|
||||||
response.mcp_call.in_progress: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress'
|
response.mcp_call.in_progress: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress'
|
||||||
response.mcp_call.failed: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed'
|
response.mcp_call.failed: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed'
|
||||||
response.mcp_call.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted'
|
response.mcp_call.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted'
|
||||||
|
response.content_part.added: '#/components/schemas/OpenAIResponseObjectStreamResponseContentPartAdded'
|
||||||
|
response.content_part.done: '#/components/schemas/OpenAIResponseObjectStreamResponseContentPartDone'
|
||||||
response.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted'
|
response.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted'
|
||||||
"OpenAIResponseObjectStreamResponseCompleted":
|
"OpenAIResponseObjectStreamResponseCompleted":
|
||||||
type: object
|
type: object
|
||||||
|
@ -6504,6 +6576,76 @@ components:
|
||||||
OpenAIResponseObjectStreamResponseCompleted
|
OpenAIResponseObjectStreamResponseCompleted
|
||||||
description: >-
|
description: >-
|
||||||
Streaming event indicating a response has been completed.
|
Streaming event indicating a response has been completed.
|
||||||
|
"OpenAIResponseObjectStreamResponseContentPartAdded":
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
response_id:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
Unique identifier of the response containing this content
|
||||||
|
item_id:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
Unique identifier of the output item containing this content part
|
||||||
|
part:
|
||||||
|
$ref: '#/components/schemas/OpenAIResponseContentPart'
|
||||||
|
description: The content part that was added
|
||||||
|
sequence_number:
|
||||||
|
type: integer
|
||||||
|
description: >-
|
||||||
|
Sequential number for ordering streaming events
|
||||||
|
type:
|
||||||
|
type: string
|
||||||
|
const: response.content_part.added
|
||||||
|
default: response.content_part.added
|
||||||
|
description: >-
|
||||||
|
Event type identifier, always "response.content_part.added"
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- response_id
|
||||||
|
- item_id
|
||||||
|
- part
|
||||||
|
- sequence_number
|
||||||
|
- type
|
||||||
|
title: >-
|
||||||
|
OpenAIResponseObjectStreamResponseContentPartAdded
|
||||||
|
description: >-
|
||||||
|
Streaming event for when a new content part is added to a response item.
|
||||||
|
"OpenAIResponseObjectStreamResponseContentPartDone":
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
response_id:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
Unique identifier of the response containing this content
|
||||||
|
item_id:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
Unique identifier of the output item containing this content part
|
||||||
|
part:
|
||||||
|
$ref: '#/components/schemas/OpenAIResponseContentPart'
|
||||||
|
description: The completed content part
|
||||||
|
sequence_number:
|
||||||
|
type: integer
|
||||||
|
description: >-
|
||||||
|
Sequential number for ordering streaming events
|
||||||
|
type:
|
||||||
|
type: string
|
||||||
|
const: response.content_part.done
|
||||||
|
default: response.content_part.done
|
||||||
|
description: >-
|
||||||
|
Event type identifier, always "response.content_part.done"
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- response_id
|
||||||
|
- item_id
|
||||||
|
- part
|
||||||
|
- sequence_number
|
||||||
|
- type
|
||||||
|
title: >-
|
||||||
|
OpenAIResponseObjectStreamResponseContentPartDone
|
||||||
|
description: >-
|
||||||
|
Streaming event for when a content part is completed.
|
||||||
"OpenAIResponseObjectStreamResponseCreated":
|
"OpenAIResponseObjectStreamResponseCreated":
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
|
@ -10840,6 +10982,7 @@ components:
|
||||||
type: string
|
type: string
|
||||||
enum:
|
enum:
|
||||||
- assistants
|
- assistants
|
||||||
|
- batch
|
||||||
title: OpenAIFilePurpose
|
title: OpenAIFilePurpose
|
||||||
description: >-
|
description: >-
|
||||||
Valid purpose values for OpenAI Files API.
|
Valid purpose values for OpenAI Files API.
|
||||||
|
@ -10908,6 +11051,7 @@ components:
|
||||||
type: string
|
type: string
|
||||||
enum:
|
enum:
|
||||||
- assistants
|
- assistants
|
||||||
|
- batch
|
||||||
description: The intended purpose of the file
|
description: The intended purpose of the file
|
||||||
additionalProperties: false
|
additionalProperties: false
|
||||||
required:
|
required:
|
||||||
|
@ -11838,10 +11982,13 @@ components:
|
||||||
type: number
|
type: number
|
||||||
description: >-
|
description: >-
|
||||||
The numeric value of the metric at this timestamp
|
The numeric value of the metric at this timestamp
|
||||||
|
unit:
|
||||||
|
type: string
|
||||||
additionalProperties: false
|
additionalProperties: false
|
||||||
required:
|
required:
|
||||||
- timestamp
|
- timestamp
|
||||||
- value
|
- value
|
||||||
|
- unit
|
||||||
title: MetricDataPoint
|
title: MetricDataPoint
|
||||||
description: >-
|
description: >-
|
||||||
A single data point in a metric time series.
|
A single data point in a metric time series.
|
||||||
|
@ -12252,6 +12399,76 @@ components:
|
||||||
- vector_db_id
|
- vector_db_id
|
||||||
- embedding_model
|
- embedding_model
|
||||||
title: RegisterVectorDbRequest
|
title: RegisterVectorDbRequest
|
||||||
|
RerankRequest:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
model:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
The identifier of the reranking model to use.
|
||||||
|
query:
|
||||||
|
oneOf:
|
||||||
|
- type: string
|
||||||
|
- $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
|
||||||
|
- $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
|
||||||
|
description: >-
|
||||||
|
The search query to rank items against. Can be a string, text content
|
||||||
|
part, or image content part. The input must not exceed the model's max
|
||||||
|
input token length.
|
||||||
|
items:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
oneOf:
|
||||||
|
- type: string
|
||||||
|
- $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
|
||||||
|
- $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
|
||||||
|
description: >-
|
||||||
|
List of items to rerank. Each item can be a string, text content part,
|
||||||
|
or image content part. Each input must not exceed the model's max input
|
||||||
|
token length.
|
||||||
|
max_num_results:
|
||||||
|
type: integer
|
||||||
|
description: >-
|
||||||
|
(Optional) Maximum number of results to return. Default: returns all.
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- model
|
||||||
|
- query
|
||||||
|
- items
|
||||||
|
title: RerankRequest
|
||||||
|
RerankData:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
index:
|
||||||
|
type: integer
|
||||||
|
description: >-
|
||||||
|
The original index of the document in the input list
|
||||||
|
relevance_score:
|
||||||
|
type: number
|
||||||
|
description: >-
|
||||||
|
The relevance score from the model output. Values are inverted when applicable
|
||||||
|
so that higher scores indicate greater relevance.
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- index
|
||||||
|
- relevance_score
|
||||||
|
title: RerankData
|
||||||
|
description: >-
|
||||||
|
A single rerank result from a reranking response.
|
||||||
|
RerankResponse:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
data:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/RerankData'
|
||||||
|
description: >-
|
||||||
|
List of rerank result objects, sorted by relevance score (descending)
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- data
|
||||||
|
title: RerankResponse
|
||||||
|
description: Response from a reranking request.
|
||||||
ResumeAgentTurnRequest:
|
ResumeAgentTurnRequest:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
|
|
|
@ -18,3 +18,4 @@ We are working on adding a few more APIs to complete the application lifecycle.
- **Batch Inference**: run inference on a dataset of inputs
- **Batch Agents**: run agents on a dataset of inputs
- **Synthetic Data Generation**: generate synthetic data for model development
- **Batches**: OpenAI-compatible batch management for inference

@ -4,11 +4,11 @@

## Adding a New Provider

See the [Adding a New API Provider Page](new_api_provider.md) which describes how to add new API providers to the Stack.
See:
- [Adding a New API Provider Page](new_api_provider.md) which describes how to add new API providers to the Stack.
- [Vector Database Page](new_vector_database.md) which describes how to add a new vector databases with Llama Stack.
- [External Provider Page](../providers/external/index.md) which describes how to add external providers to the Stack.

See the [Vector Database Page](new_vector_database.md) which describes how to add a new vector databases with Llama Stack.

See the [External Provider Page](../providers/external/index.md) which describes how to add external providers to the Stack.

```{toctree}
:maxdepth: 1
:hidden:

@ -19,11 +19,21 @@ new_vector_database

## Testing

See the [Test Page](testing.md) which describes how to test your changes.
```{include} ../../../tests/README.md
```

## Advanced Topics

For developers who need deeper understanding of the testing system internals:

```{toctree}
:maxdepth: 1
:hidden:
:caption: Testing

testing
testing/record-replay
```

### Benchmarking

```{include} ../../../docs/source/distributions/k8s-benchmark/README.md
```

@ -1,8 +0,0 @@
```{include} ../../../tests/README.md
```

```{include} ../../../tests/unit/README.md
```

```{include} ../../../tests/integration/README.md
```

234 docs/source/contributing/testing/record-replay.md Normal file

@ -0,0 +1,234 @@
# Record-Replay System

Understanding how Llama Stack captures and replays API interactions for testing.

## Overview

The record-replay system solves a fundamental challenge in AI testing: how do you test against expensive, non-deterministic APIs without breaking the bank or dealing with flaky tests?

The solution: intercept API calls, store real responses, and replay them later. This gives you real API behavior without the cost or variability.

## How It Works

### Request Hashing

Every API request gets converted to a deterministic hash for lookup:

```python
def normalize_request(method: str, url: str, headers: dict, body: dict) -> str:
    normalized = {
        "method": method.upper(),
        "endpoint": urlparse(url).path,  # Just the path, not full URL
        "body": body,  # Request parameters
    }
    return hashlib.sha256(json.dumps(normalized, sort_keys=True).encode()).hexdigest()
```

**Key insight:** The hashing is intentionally precise. Different whitespace, float precision, or parameter order produces different hashes. This prevents subtle bugs from false cache hits.

```python
# These produce DIFFERENT hashes:
{"content": "Hello world"}
{"content": "Hello world\n"}
{"temperature": 0.7}
{"temperature": 0.7000001}
```

### Client Interception

The system patches OpenAI and Ollama client methods to intercept calls before they leave your application. This happens transparently - your test code doesn't change.
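Conceptually, the interception is ordinary method wrapping. The sketch below is illustrative only; the helper name and handler signature are assumptions, not the actual implementation inside Llama Stack:

```python
# Illustrative sketch of method interception; not the real patching code.
import functools

def patch_method(client_cls, method_name, handler):
    """Replace `client_cls.method_name` with a wrapper that routes through `handler`."""
    original = getattr(client_cls, method_name)

    @functools.wraps(original)
    async def wrapper(self, *args, **kwargs):
        # `handler` decides whether to record, replay, or pass the call through.
        return await handler(original, self, *args, **kwargs)

    setattr(client_cls, method_name, wrapper)
```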
### Storage Architecture

Recordings use a two-tier storage system optimized for both speed and debuggability:

```
recordings/
├── index.sqlite              # Fast lookup by request hash
└── responses/
    ├── abc123def456.json     # Individual response files
    └── def789ghi012.json
```

**SQLite index** enables O(log n) hash lookups and metadata queries without loading response bodies.

**JSON files** store complete request/response pairs in human-readable format for debugging.
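To make the two tiers concrete, a replay lookup boils down to one indexed query plus one file read. This is a minimal sketch; the table and column names are assumptions rather than the actual schema:

```python
# Minimal replay-lookup sketch; schema details below are assumed for illustration.
import json
import sqlite3
from pathlib import Path

def load_recording(storage_dir: str, request_hash: str) -> dict | None:
    index = sqlite3.connect(Path(storage_dir) / "index.sqlite")
    try:
        row = index.execute(
            "SELECT response_file FROM recordings WHERE request_hash = ?",
            (request_hash,),
        ).fetchone()
    finally:
        index.close()
    if row is None:
        return None  # REPLAY mode treats this as a missing recording
    return json.loads((Path(storage_dir) / "responses" / row[0]).read_text())
```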
## Recording Modes

### LIVE Mode

Direct API calls with no recording or replay:

```python
with inference_recording(mode=InferenceMode.LIVE):
    response = await client.chat.completions.create(...)
```

Use for initial development and debugging against real APIs.

### RECORD Mode

Captures API interactions while passing through real responses:

```python
with inference_recording(mode=InferenceMode.RECORD, storage_dir="./recordings"):
    response = await client.chat.completions.create(...)
    # Real API call made, response captured AND returned
```

The recording process:
1. Request intercepted and hashed
2. Real API call executed
3. Response captured and serialized
4. Recording stored to disk
5. Original response returned to caller

### REPLAY Mode

Returns stored responses instead of making API calls:

```python
with inference_recording(mode=InferenceMode.REPLAY, storage_dir="./recordings"):
    response = await client.chat.completions.create(...)
    # No API call made, cached response returned instantly
```

The replay process:
1. Request intercepted and hashed
2. Hash looked up in SQLite index
3. Response loaded from JSON file
4. Response deserialized and returned
5. Error if no recording found

## Streaming Support

Streaming APIs present a unique challenge: how do you capture an async generator?

### The Problem

```python
# How do you record this?
async for chunk in client.chat.completions.create(stream=True):
    process(chunk)
```

### The Solution

The system captures all chunks immediately before yielding any:

```python
async def handle_streaming_record(response):
    # Capture complete stream first
    chunks = []
    async for chunk in response:
        chunks.append(chunk)

    # Store complete recording
    storage.store_recording(
        request_hash, request_data, {"body": chunks, "is_streaming": True}
    )

    # Return generator that replays captured chunks
    async def replay_stream():
        for chunk in chunks:
            yield chunk

    return replay_stream()
```

This ensures:
- **Complete capture** - The entire stream is saved atomically
- **Interface preservation** - The returned object behaves like the original API
- **Deterministic replay** - Same chunks in the same order every time

## Serialization

API responses contain complex Pydantic objects that need careful serialization:

```python
def _serialize_response(response):
    if hasattr(response, "model_dump"):
        # Preserve type information for proper deserialization
        return {
            "__type__": f"{response.__class__.__module__}.{response.__class__.__qualname__}",
            "__data__": response.model_dump(mode="json"),
        }
    return response
```

This preserves type safety - when replayed, you get the same Pydantic objects with all their validation and methods.
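The replay side reverses the envelope. A minimal sketch, assuming the `__type__`/`__data__` format shown above and a top-level (non-nested) Pydantic class:

```python
# Sketch of the inverse of _serialize_response; assumes the envelope above and
# that the recorded class is importable at the same module path.
import importlib

def _deserialize_response(data):
    if isinstance(data, dict) and "__type__" in data and "__data__" in data:
        module_path, _, class_name = data["__type__"].rpartition(".")
        cls = getattr(importlib.import_module(module_path), class_name)
        return cls.model_validate(data["__data__"])  # Pydantic v2
    return data
```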
## Environment Integration

### Environment Variables

Control recording behavior globally:

```bash
export LLAMA_STACK_TEST_INFERENCE_MODE=replay
export LLAMA_STACK_TEST_RECORDING_DIR=/path/to/recordings
pytest tests/integration/
```

### Pytest Integration

The system integrates automatically based on environment variables, requiring no changes to test code.

## Debugging Recordings

### Inspecting Storage

```bash
# See what's recorded
sqlite3 recordings/index.sqlite "SELECT endpoint, model, timestamp FROM recordings LIMIT 10;"

# View specific response
cat recordings/responses/abc123def456.json | jq '.response.body'

# Find recordings by endpoint
sqlite3 recordings/index.sqlite "SELECT * FROM recordings WHERE endpoint='/v1/chat/completions';"
```

### Common Issues

**Hash mismatches:** Request parameters changed slightly between record and replay
```bash
# Compare request details
cat recordings/responses/abc123.json | jq '.request'
```

**Serialization errors:** Response types changed between versions
```bash
# Re-record with updated types
rm recordings/responses/failing_hash.json
LLAMA_STACK_TEST_INFERENCE_MODE=record pytest test_failing.py
```

**Missing recordings:** New test or changed parameters
```bash
# Record the missing interaction
LLAMA_STACK_TEST_INFERENCE_MODE=record pytest test_new.py
```

## Design Decisions

### Why Not Mocks?

Traditional mocking breaks down with AI APIs because:
- Response structures are complex and evolve frequently
- Streaming behavior is hard to mock correctly
- Edge cases in real APIs get missed
- Mocks become brittle maintenance burdens

### Why Precise Hashing?

Loose hashing (normalizing whitespace, rounding floats) seems convenient but hides bugs. If a test changes slightly, you want to know about it rather than accidentally getting the wrong cached response.
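To see the precision in action, you can re-run the `normalize_request` helper from the hashing section on two nearly identical payloads and compare the digests (the function is repeated here only so the snippet runs on its own):

```python
# Self-contained demo that tiny input differences yield different request hashes.
import hashlib
import json
from urllib.parse import urlparse

def normalize_request(method: str, url: str, headers: dict, body: dict) -> str:
    normalized = {"method": method.upper(), "endpoint": urlparse(url).path, "body": body}
    return hashlib.sha256(json.dumps(normalized, sort_keys=True).encode()).hexdigest()

a = normalize_request("POST", "http://localhost/v1/chat/completions", {}, {"temperature": 0.7})
b = normalize_request("POST", "http://localhost/v1/chat/completions", {}, {"temperature": 0.7000001})
print(a == b)  # False - the hashes differ
```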
### Why JSON + SQLite?

- **JSON** - Human readable, diff-friendly, easy to inspect and modify
- **SQLite** - Fast indexed lookups without loading response bodies
- **Hybrid** - Best of both worlds for different use cases

This system provides reliable, fast testing against real AI APIs while maintaining the ability to debug issues when they arise.
@ -225,8 +225,32 @@ server:
  port: 8321                            # Port to listen on (default: 8321)
  tls_certfile: "/path/to/cert.pem"     # Optional: Path to TLS certificate for HTTPS
  tls_keyfile: "/path/to/key.pem"       # Optional: Path to TLS key for HTTPS
  cors: true                            # Optional: Enable CORS (dev mode) or full config object
```

### CORS Configuration

CORS (Cross-Origin Resource Sharing) can be configured in two ways:

**Local development** (allows localhost origins only):
```yaml
server:
  cors: true
```

**Explicit configuration** (custom origins and settings):
```yaml
server:
  cors:
    allow_origins: ["https://myapp.com", "https://app.example.com"]
    allow_methods: ["GET", "POST", "PUT", "DELETE"]
    allow_headers: ["Content-Type", "Authorization"]
    allow_credentials: true
    max_age: 3600
```

When `cors: true`, the server enables secure localhost-only access for local development. For production, specify exact origins to maintain security.

### Authentication Configuration

> **Breaking Change (v0.2.14)**: The authentication configuration structure has changed. The previous format with `provider_type` and `config` fields has been replaced with a unified `provider_config` field that includes the `type` field. Update your configuration files accordingly.

@ -618,6 +642,54 @@ Content-Type: application/json
}
```

### CORS Configuration

Configure CORS to allow web browsers to make requests from different domains. Disabled by default.

#### Quick Setup

For development, use the simple boolean flag:

```yaml
server:
  cors: true  # Auto-enables localhost with any port
```

This automatically allows `http://localhost:*` and `https://localhost:*` with secure defaults.

#### Custom Configuration

For specific origins and full control:

```yaml
server:
  cors:
    allow_origins: ["https://myapp.com", "https://staging.myapp.com"]
    allow_credentials: true
    allow_methods: ["GET", "POST", "PUT", "DELETE"]
    allow_headers: ["Content-Type", "Authorization"]
    allow_origin_regex: "https://.*\\.example\\.com"  # Optional regex pattern
    expose_headers: ["X-Total-Count"]
    max_age: 86400
```

#### Configuration Options

| Field | Description | Default |
| -------------------- | ---------------------------------------------- | ------- |
| `allow_origins` | List of allowed origins. Use `["*"]` for any. | `["*"]` |
| `allow_origin_regex` | Regex pattern for allowed origins (optional). | `None` |
| `allow_methods` | Allowed HTTP methods. | `["*"]` |
| `allow_headers` | Allowed headers. | `["*"]` |
| `allow_credentials` | Allow credentials (cookies, auth headers). | `false` |
| `expose_headers` | Headers exposed to browser. | `[]` |
| `max_age` | Preflight cache time (seconds). | `600` |

**Security Notes**:
- `allow_credentials: true` requires explicit origins (no wildcards)
- `cors: true` enables localhost access only (secure for development)
- For public APIs, always specify exact allowed origins
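A quick way to sanity-check the settings above is to send a preflight request and inspect the response headers. The sketch below uses only the Python standard library; the server URL, port, origin, and endpoint path are placeholders for your own deployment, not part of the documented configuration:

```python
# Hypothetical CORS preflight check against a locally running server; adjust
# the URL, origin, and path to match your deployment.
import urllib.request

req = urllib.request.Request(
    "http://localhost:8321/v1/models",
    method="OPTIONS",
    headers={
        "Origin": "https://myapp.com",
        "Access-Control-Request-Method": "GET",
    },
)
with urllib.request.urlopen(req) as resp:
    print(resp.status)
    print(resp.headers.get("Access-Control-Allow-Origin"))
    print(resp.headers.get("Access-Control-Allow-Methods"))
```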
## Extending to handle Safety

Configuring Safety can be a little involved so it is instructive to go through an example.

@ -17,7 +17,6 @@ client = LlamaStackAsLibraryClient(
    # provider_data is optional, but if you need to pass in any provider specific data, you can do so here.
    provider_data={"tavily_search_api_key": os.environ["TAVILY_SEARCH_API_KEY"]},
)
client.initialize()
```

This will parse your config and set up any inline implementations and remote clients needed for your implementation.

@ -32,5 +31,4 @@ If you've created a [custom distribution](https://llama-stack.readthedocs.io/en/
```python
client = LlamaStackAsLibraryClient(config_path)
client.initialize()
```

156 docs/source/distributions/k8s-benchmark/README.md Normal file

@ -0,0 +1,156 @@
# Llama Stack Benchmark Suite on Kubernetes

## Motivation

Performance benchmarking is critical for understanding the overhead and characteristics of the Llama Stack abstraction layer compared to direct inference engines like vLLM.

### Why This Benchmark Suite Exists

**Performance Validation**: The Llama Stack provides a unified API layer across multiple inference providers, but this abstraction introduces potential overhead. This benchmark suite quantifies the performance impact by comparing:
- Llama Stack inference (with vLLM backend)
- Direct vLLM inference calls
- Both under identical Kubernetes deployment conditions

**Production Readiness Assessment**: Real-world deployments require understanding performance characteristics under load. This suite simulates concurrent user scenarios with configurable parameters (duration, concurrency, request patterns) to validate production readiness.

**Regression Detection (TODO)**: As the Llama Stack evolves, this benchmark provides automated regression detection for performance changes. CI/CD pipelines can leverage these benchmarks to catch performance degradations before production deployments.

**Resource Planning**: By measuring throughput, latency percentiles, and resource utilization patterns, teams can make informed decisions about:
- Kubernetes resource allocation (CPU, memory, GPU)
- Auto-scaling configurations
- Cost optimization strategies

### Key Metrics Captured

The benchmark suite measures critical performance indicators:
- **Throughput**: Requests per second under sustained load
- **Latency Distribution**: P50, P95, P99 response times
- **Time to First Token (TTFT)**: Critical for streaming applications
- **Error Rates**: Request failures and timeout analysis

This data enables data-driven architectural decisions and performance optimization efforts.
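The latency percentiles listed above are plain order statistics over the per-request timings collected during a run. The following standalone sketch shows one way to derive them; the values are made-up sample data, not benchmark output:

```python
# Toy illustration of P50/P95/P99 computation; timings below are fabricated.
import statistics

response_times = [0.42, 0.51, 0.48, 0.95, 0.47, 0.60, 1.20, 0.44, 0.58, 0.49]

cuts = statistics.quantiles(response_times, n=100)  # 99 cut points
p50, p95, p99 = cuts[49], cuts[94], cuts[98]
print(f"P50={p50:.3f}s  P95={p95:.3f}s  P99={p99:.3f}s")
print(f"mean={statistics.mean(response_times):.3f}s  max={max(response_times):.3f}s")
```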
## Setup

**1. Deploy base k8s infrastructure:**
```bash
cd ../k8s
./apply.sh
```

**2. Deploy benchmark components:**
```bash
cd ../k8s-benchmark
./apply.sh
```

**3. Verify deployment:**
```bash
kubectl get pods
# Should see: llama-stack-benchmark-server, vllm-server, etc.
```

## Quick Start

### Basic Benchmarks

**Benchmark Llama Stack (default):**
```bash
cd docs/source/distributions/k8s-benchmark/
./run-benchmark.sh
```

**Benchmark vLLM direct:**
```bash
./run-benchmark.sh --target vllm
```

### Custom Configuration

**Extended benchmark with high concurrency:**
```bash
./run-benchmark.sh --target vllm --duration 120 --concurrent 20
```

**Short test run:**
```bash
./run-benchmark.sh --target stack --duration 30 --concurrent 5
```

## Command Reference

### run-benchmark.sh Options

```bash
./run-benchmark.sh [options]

Options:
  -t, --target <stack|vllm>     Target to benchmark (default: stack)
  -d, --duration <seconds>      Duration in seconds (default: 60)
  -c, --concurrent <users>      Number of concurrent users (default: 10)
  -h, --help                    Show help message

Examples:
  ./run-benchmark.sh --target vllm              # Benchmark vLLM direct
  ./run-benchmark.sh --target stack             # Benchmark Llama Stack
  ./run-benchmark.sh -t vllm -d 120 -c 20       # vLLM with 120s, 20 users
```

## Local Testing

### Running Benchmark Locally

For local development without Kubernetes:

**1. Start OpenAI mock server:**
```bash
uv run python openai-mock-server.py --port 8080
```

**2. Run benchmark against mock server:**
```bash
uv run python benchmark.py \
  --base-url http://localhost:8080/v1 \
  --model mock-inference \
  --duration 30 \
  --concurrent 5
```

**3. Test against local vLLM server:**
```bash
# If you have vLLM running locally on port 8000
uv run python benchmark.py \
  --base-url http://localhost:8000/v1 \
  --model meta-llama/Llama-3.2-3B-Instruct \
  --duration 30 \
  --concurrent 5
```

**4. Profile the running server:**
```bash
./profile_running_server.sh
```

### OpenAI Mock Server

The `openai-mock-server.py` provides:
- **OpenAI-compatible API** for testing without real models
- **Configurable streaming delay** via `STREAM_DELAY_SECONDS` env var
- **Consistent responses** for reproducible benchmarks
- **Lightweight testing** without GPU requirements

**Mock server usage:**
```bash
uv run python openai-mock-server.py --port 8080
```

The mock server is also deployed in k8s as `openai-mock-service:8080` and can be used by changing the Llama Stack configuration to use the `mock-vllm-inference` provider.

## Files in this Directory

- `benchmark.py` - Core benchmark script with async streaming support
- `run-benchmark.sh` - Main script with target selection and configuration
- `openai-mock-server.py` - Mock OpenAI API server for local testing
- `README.md` - This documentation file

@ -8,7 +8,6 @@

# Deploys the benchmark-specific components on top of the base k8s deployment (../k8s/apply.sh).

export MOCK_INFERENCE_PORT=8080
export STREAM_DELAY_SECONDS=0.005

export POSTGRES_USER=llamastack

@ -20,14 +19,7 @@ export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B

export MOCK_INFERENCE_MODEL=mock-inference

# Use llama-stack-benchmark-service as the benchmark server
export MOCK_INFERENCE_URL=openai-mock-service:8080
export LOCUST_HOST=http://llama-stack-benchmark-service:8323
export LOCUST_BASE_PATH=/v1/openai/v1

# Use vllm-service as the benchmark server
# export LOCUST_HOST=http://vllm-server:8000
# export LOCUST_BASE_PATH=/v1

export BENCHMARK_INFERENCE_MODEL=$INFERENCE_MODEL

@ -35,13 +27,6 @@ set -euo pipefail
set -x

# Deploy benchmark-specific components
# Deploy OpenAI mock server
kubectl create configmap openai-mock --from-file=openai-mock-server.py \
  --dry-run=client -o yaml | kubectl apply --validate=false -f -

envsubst < openai-mock-deployment.yaml | kubectl apply --validate=false -f -

# Create configmap with our custom stack config
kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
  --dry-run=client -o yaml > stack-configmap.yaml

@ -49,9 +34,3 @@ kubectl apply --validate=false -f stack-configmap.yaml

# Deploy our custom llama stack server (overriding the base one)
envsubst < stack-k8s.yaml.template | kubectl apply --validate=false -f -

# Deploy Locust load testing
kubectl create configmap locust-script --from-file=locustfile.py \
  --dry-run=client -o yaml | kubectl apply --validate=false -f -

envsubst < locust-k8s.yaml | kubectl apply --validate=false -f -
267 docs/source/distributions/k8s-benchmark/benchmark.py Normal file

@ -0,0 +1,267 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

"""
Simple benchmark script for Llama Stack with OpenAI API compatibility.
"""

import argparse
import asyncio
import os
import random
import statistics
import time
from typing import Tuple
import aiohttp


class BenchmarkStats:
    def __init__(self):
        self.response_times = []
        self.ttft_times = []
        self.chunks_received = []
        self.errors = []
        self.success_count = 0
        self.total_requests = 0
        self.concurrent_users = 0
        self.start_time = None
        self.end_time = None
        self._lock = asyncio.Lock()

    async def add_result(self, response_time: float, chunks: int, ttft: float = None, error: str = None):
        async with self._lock:
            self.total_requests += 1
            if error:
                self.errors.append(error)
            else:
                self.success_count += 1
                self.response_times.append(response_time)
                self.chunks_received.append(chunks)
                if ttft is not None:
                    self.ttft_times.append(ttft)

    def print_summary(self):
        if not self.response_times:
            print("No successful requests to report")
            if self.errors:
                print(f"Total errors: {len(self.errors)}")
                print("First 5 errors:")
                for error in self.errors[:5]:
                    print(f"  {error}")
            return

        total_time = self.end_time - self.start_time
        success_rate = (self.success_count / self.total_requests) * 100

        print(f"\n{'='*60}")
        print(f"BENCHMARK RESULTS")
        print(f"{'='*60}")
        print(f"Total time: {total_time:.2f}s")
        print(f"Concurrent users: {self.concurrent_users}")
        print(f"Total requests: {self.total_requests}")
        print(f"Successful requests: {self.success_count}")
        print(f"Failed requests: {len(self.errors)}")
        print(f"Success rate: {success_rate:.1f}%")
        print(f"Requests per second: {self.success_count / total_time:.2f}")

        print(f"\nResponse Time Statistics:")
        print(f"  Mean: {statistics.mean(self.response_times):.3f}s")
        print(f"  Median: {statistics.median(self.response_times):.3f}s")
        print(f"  Min: {min(self.response_times):.3f}s")
        print(f"  Max: {max(self.response_times):.3f}s")

        if len(self.response_times) > 1:
            print(f"  Std Dev: {statistics.stdev(self.response_times):.3f}s")

        percentiles = [50, 90, 95, 99]
        sorted_times = sorted(self.response_times)
        print(f"\nPercentiles:")
        for p in percentiles:
            idx = int(len(sorted_times) * p / 100) - 1
            idx = max(0, min(idx, len(sorted_times) - 1))
            print(f"  P{p}: {sorted_times[idx]:.3f}s")

        if self.ttft_times:
            print(f"\nTime to First Token (TTFT) Statistics:")
            print(f"  Mean: {statistics.mean(self.ttft_times):.3f}s")
            print(f"  Median: {statistics.median(self.ttft_times):.3f}s")
            print(f"  Min: {min(self.ttft_times):.3f}s")
            print(f"  Max: {max(self.ttft_times):.3f}s")

            if len(self.ttft_times) > 1:
                print(f"  Std Dev: {statistics.stdev(self.ttft_times):.3f}s")

            sorted_ttft = sorted(self.ttft_times)
            print(f"\nTTFT Percentiles:")
            for p in percentiles:
                idx = int(len(sorted_ttft) * p / 100) - 1
                idx = max(0, min(idx, len(sorted_ttft) - 1))
                print(f"  P{p}: {sorted_ttft[idx]:.3f}s")

        if self.chunks_received:
            print(f"\nStreaming Statistics:")
            print(f"  Mean chunks per response: {statistics.mean(self.chunks_received):.1f}")
            print(f"  Total chunks received: {sum(self.chunks_received)}")

        if self.errors:
            print(f"\nErrors (showing first 5):")
            for error in self.errors[:5]:
                print(f"  {error}")


class LlamaStackBenchmark:
    def __init__(self, base_url: str, model_id: str):
        self.base_url = base_url.rstrip('/')
        self.model_id = model_id
        self.headers = {"Content-Type": "application/json"}
        self.test_messages = [
            [{"role": "user", "content": "Hi"}],
            [{"role": "user", "content": "What is the capital of France?"}],
            [{"role": "user", "content": "Explain quantum physics in simple terms."}],
            [{"role": "user", "content": "Write a short story about a robot learning to paint."}],
            [
                {"role": "user", "content": "What is machine learning?"},
                {"role": "assistant", "content": "Machine learning is a subset of AI..."},
                {"role": "user", "content": "Can you give me a practical example?"}
            ]
        ]

    async def make_async_streaming_request(self) -> Tuple[float, int, float | None, str | None]:
        """Make a single async streaming chat completion request."""
        messages = random.choice(self.test_messages)
        payload = {
            "model": self.model_id,
            "messages": messages,
            "stream": True,
            "max_tokens": 100
        }

        start_time = time.time()
        chunks_received = 0
        ttft = None
        error = None

        session = aiohttp.ClientSession()

        try:
            async with session.post(
                f"{self.base_url}/chat/completions",
                headers=self.headers,
                json=payload,
                timeout=aiohttp.ClientTimeout(total=30)
            ) as response:
                if response.status == 200:
                    async for line in response.content:
                        if line:
                            line_str = line.decode('utf-8').strip()
                            if line_str.startswith('data: '):
                                chunks_received += 1
                                if ttft is None:
                                    ttft = time.time() - start_time
                                if line_str == 'data: [DONE]':
                                    break

                    if chunks_received == 0:
                        error = "No streaming chunks received"
                else:
                    text = await response.text()
                    error = f"HTTP {response.status}: {text[:100]}"

        except Exception as e:
            error = f"Request error: {str(e)}"
        finally:
            await session.close()

        response_time = time.time() - start_time
        return response_time, chunks_received, ttft, error


    async def run_benchmark(self, duration: int, concurrent_users: int) -> BenchmarkStats:
        """Run benchmark using async requests for specified duration."""
        stats = BenchmarkStats()
        stats.concurrent_users = concurrent_users
        stats.start_time = time.time()

        print(f"Starting benchmark: {duration}s duration, {concurrent_users} concurrent users")
        print(f"Target URL: {self.base_url}/chat/completions")
        print(f"Model: {self.model_id}")

        connector = aiohttp.TCPConnector(limit=concurrent_users)
        async with aiohttp.ClientSession(connector=connector) as session:

            async def worker(worker_id: int):
                """Worker that sends requests sequentially until canceled."""
                request_count = 0
                while True:
                    try:
                        response_time, chunks, ttft, error = await self.make_async_streaming_request()
                        await stats.add_result(response_time, chunks, ttft, error)
                        request_count += 1

                    except asyncio.CancelledError:
                        break
                    except Exception as e:
                        await stats.add_result(0, 0, None, f"Worker {worker_id} error: {str(e)}")

            # Progress reporting task
            async def progress_reporter():
                last_report_time = time.time()
                while True:
                    try:
                        await asyncio.sleep(1)  # Report every second
                        if time.time() >= last_report_time + 10:  # Report every 10 seconds
                            elapsed = time.time() - stats.start_time
                            print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s")
                            last_report_time = time.time()
                    except asyncio.CancelledError:
                        break

            # Spawn concurrent workers
            tasks = [asyncio.create_task(worker(i)) for i in range(concurrent_users)]
            progress_task = asyncio.create_task(progress_reporter())
            tasks.append(progress_task)

            # Wait for duration then cancel all tasks
            await asyncio.sleep(duration)

            for task in tasks:
                task.cancel()

            # Wait for all tasks to complete
            await asyncio.gather(*tasks, return_exceptions=True)

        stats.end_time = time.time()
        return stats


def main():
    parser = argparse.ArgumentParser(description="Llama Stack Benchmark Tool")
    parser.add_argument("--base-url", default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
                        help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)")
    parser.add_argument("--model", default=os.getenv("INFERENCE_MODEL", "test-model"),
                        help="Model ID to use for requests")
    parser.add_argument("--duration", type=int, default=60,
                        help="Duration in seconds to run benchmark (default: 60)")
    parser.add_argument("--concurrent", type=int, default=10,
                        help="Number of concurrent users (default: 10)")

    args = parser.parse_args()

    benchmark = LlamaStackBenchmark(args.base_url, args.model)

    try:
        stats = asyncio.run(benchmark.run_benchmark(args.duration, args.concurrent))
        stats.print_summary()

    except KeyboardInterrupt:
        print("\nBenchmark interrupted by user")
    except Exception as e:
        print(f"Benchmark failed: {e}")


if __name__ == "__main__":
    main()
@ -1,131 +0,0 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: locust-master
  labels:
    app: locust
    role: master
spec:
  replicas: 1
  selector:
    matchLabels:
      app: locust
      role: master
  template:
    metadata:
      labels:
        app: locust
        role: master
    spec:
      containers:
      - name: locust-master
        image: locustio/locust:2.31.8
        ports:
        - containerPort: 8089  # Web UI
        - containerPort: 5557  # Master communication
        env:
        - name: LOCUST_HOST
          value: "${LOCUST_HOST}"
        - name: LOCUST_LOCUSTFILE
          value: "/locust/locustfile.py"
        - name: LOCUST_WEB_HOST
          value: "0.0.0.0"
        - name: LOCUST_MASTER
          value: "true"
        - name: LOCUST_BASE_PATH
          value: "${LOCUST_BASE_PATH}"
        - name: INFERENCE_MODEL
          value: "${BENCHMARK_INFERENCE_MODEL}"
        volumeMounts:
        - name: locust-script
          mountPath: /locust
        command: ["locust"]
        args:
        - "--master"
        - "--web-host=0.0.0.0"
        - "--web-port=8089"
        - "--host=${LOCUST_HOST}"
        - "--locustfile=/locust/locustfile.py"
      volumes:
      - name: locust-script
        configMap:
          name: locust-script
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: locust-worker
  labels:
    app: locust
    role: worker
spec:
  replicas: 2  # Start with 2 workers, can be scaled up
  selector:
    matchLabels:
      app: locust
      role: worker
  template:
    metadata:
      labels:
        app: locust
        role: worker
    spec:
      containers:
      - name: locust-worker
        image: locustio/locust:2.31.8
        env:
        - name: LOCUST_HOST
          value: "${LOCUST_HOST}"
        - name: LOCUST_LOCUSTFILE
          value: "/locust/locustfile.py"
        - name: LOCUST_MASTER_HOST
          value: "locust-master-service"
        - name: LOCUST_MASTER_PORT
          value: "5557"
        - name: INFERENCE_MODEL
          value: "${BENCHMARK_INFERENCE_MODEL}"
        - name: LOCUST_BASE_PATH
          value: "${LOCUST_BASE_PATH}"
        volumeMounts:
        - name: locust-script
          mountPath: /locust
        command: ["locust"]
        args:
        - "--worker"
        - "--master-host=locust-master-service"
        - "--master-port=5557"
        - "--locustfile=/locust/locustfile.py"
      volumes:
      - name: locust-script
        configMap:
          name: locust-script
---
apiVersion: v1
kind: Service
metadata:
  name: locust-master-service
spec:
  selector:
    app: locust
    role: master
  ports:
  - name: web-ui
    port: 8089
    targetPort: 8089
  - name: master-comm
    port: 5557
    targetPort: 5557
  type: ClusterIP
---
apiVersion: v1
kind: Service
metadata:
  name: locust-web-ui
spec:
  selector:
    app: locust
    role: master
  ports:
  - port: 8089
    targetPort: 8089
  type: ClusterIP  # Keep internal, use port-forward to access
@ -1,78 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

"""
Locust load testing script for Llama Stack with Prism mock OpenAI provider.
"""

import random
from locust import HttpUser, task, between
import os

base_path = os.getenv("LOCUST_BASE_PATH", "/v1/openai/v1")

MODEL_ID = os.getenv("INFERENCE_MODEL")

class LlamaStackUser(HttpUser):
    wait_time = between(0.0, 0.0001)

    def on_start(self):
        """Setup authentication and test data."""
        # No auth required for benchmark server
        self.headers = {
            "Content-Type": "application/json"
        }

        # Test messages of varying lengths
        self.test_messages = [
            [{"role": "user", "content": "Hi"}],
            [{"role": "user", "content": "What is the capital of France?"}],
            [{"role": "user", "content": "Explain quantum physics in simple terms."}],
            [{"role": "user", "content": "Write a short story about a robot learning to paint."}],
            [
                {"role": "user", "content": "What is machine learning?"},
                {"role": "assistant", "content": "Machine learning is a subset of AI..."},
                {"role": "user", "content": "Can you give me a practical example?"}
            ]
        ]

    @task(weight=100)
    def chat_completion_streaming(self):
        """Test streaming chat completion (20% of requests)."""
        messages = random.choice(self.test_messages)
        payload = {
            "model": MODEL_ID,
            "messages": messages,
            "stream": True,
            "max_tokens": 100
        }

        with self.client.post(
            f"{base_path}/chat/completions",
            headers=self.headers,
            json=payload,
            stream=True,
            catch_response=True
        ) as response:
            if response.status_code == 200:
                chunks_received = 0
                try:
                    for line in response.iter_lines():
                        if line:
                            line_str = line.decode('utf-8')
                            if line_str.startswith('data: '):
                                chunks_received += 1
                                if line_str.strip() == 'data: [DONE]':
                                    break

                    if chunks_received > 0:
                        response.success()
                    else:
                        response.failure("No streaming chunks received")
                except Exception as e:
                    response.failure(f"Streaming error: {e}")
            else:
                response.failure(f"HTTP {response.status_code}: {response.text}")
@@ -1,52 +0,0 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: openai-mock
  labels:
    app: openai-mock
spec:
  replicas: 1
  selector:
    matchLabels:
      app: openai-mock
  template:
    metadata:
      labels:
        app: openai-mock
    spec:
      containers:
      - name: openai-mock
        image: python:3.12-slim
        ports:
        - containerPort: ${MOCK_INFERENCE_PORT}
        env:
        - name: PORT
          value: "${MOCK_INFERENCE_PORT}"
        - name: MOCK_MODELS
          value: "${MOCK_INFERENCE_MODEL}"
        - name: STREAM_DELAY_SECONDS
          value: "${STREAM_DELAY_SECONDS}"
        command: ["sh", "-c"]
        args:
        - |
          pip install flask &&
          python /app/openai-mock-server.py --port ${MOCK_INFERENCE_PORT}
        volumeMounts:
        - name: openai-mock-script
          mountPath: /app
      volumes:
      - name: openai-mock-script
        configMap:
          name: openai-mock
---
apiVersion: v1
kind: Service
metadata:
  name: openai-mock-service
spec:
  selector:
    app: openai-mock
  ports:
    - port: 8080
      targetPort: 8080
  type: ClusterIP
6  docs/source/distributions/k8s-benchmark/openai-mock-server.py  (Normal file → Executable file)

@@ -23,7 +23,7 @@ app = Flask(__name__)

 # Models from environment variables
 def get_models():
-    models_str = os.getenv("MOCK_MODELS", "mock-inference")
+    models_str = os.getenv("MOCK_MODELS", "meta-llama/Llama-3.2-3B-Instruct")
     model_ids = [m.strip() for m in models_str.split(",") if m.strip()]

     return {

@@ -49,13 +49,13 @@ def generate_random_text(length=50):
     ]
     return " ".join(random.choices(words, k=length))

-@app.route('/models', methods=['GET'])
+@app.route('/v1/models', methods=['GET'])
 def list_models():
     models = get_models()
     print(f"[MOCK] Returning models: {[m['id'] for m in models['data']]}")
     return jsonify(models)

-@app.route('/chat/completions', methods=['POST'])
+@app.route('/v1/chat/completions', methods=['POST'])
 def chat_completions():
     """Return OpenAI-formatted chat completion responses."""
     data = request.get_json()
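The mock now mirrors the OpenAI path layout (`/v1/models`, `/v1/chat/completions`). A quick sketch of exercising it, assuming it is reachable on localhost:8080 (the port the removed mock Deployment exposed); the non-streaming call is an assumption about the mock's behavior:

```python
import requests

BASE = "http://localhost:8080/v1"  # assumed local address for the mock server

models = requests.get(f"{BASE}/models").json()
print([m["id"] for m in models["data"]])

resp = requests.post(
    f"{BASE}/chat/completions",
    json={
        "model": models["data"][0]["id"],
        "messages": [{"role": "user", "content": "Hi"}],
        "stream": False,  # assumed; the mock may also stream SSE chunks
    },
)
print(resp.json()["choices"][0]["message"]["content"])
```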
52  docs/source/distributions/k8s-benchmark/profile_running_server.sh  (Executable file)

@@ -0,0 +1,52 @@
#!/bin/bash

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

# Script to profile an already running Llama Stack server
# Usage: ./profile_running_server.sh [duration_seconds] [output_file]

DURATION=${1:-60}  # Default 60 seconds
OUTPUT_FILE=${2:-"llama_stack_profile"}  # Default output file

echo "Looking for running Llama Stack server..."

# Find the server PID
SERVER_PID=$(ps aux | grep "llama_stack.core.server.server" | grep -v grep | awk '{print $2}' | head -1)

if [ -z "$SERVER_PID" ]; then
    echo "Error: No running Llama Stack server found"
    echo "Please start your server first with:"
    echo "LLAMA_STACK_LOGGING=\"all=ERROR\" MOCK_INFERENCE_URL=http://localhost:8080 SAFETY_MODEL=llama-guard3:1b uv run --with llama-stack python -m llama_stack.core.server.server docs/source/distributions/k8s-benchmark/stack_run_config.yaml"
    exit 1
fi

echo "Found Llama Stack server with PID: $SERVER_PID"

# Start py-spy profiling
echo "Starting py-spy profiling for ${DURATION} seconds..."
echo "Output will be saved to: ${OUTPUT_FILE}.svg"
echo ""
echo "You can now run your load test..."
echo ""

# Get the full path to py-spy
PYSPY_PATH=$(which py-spy)

# Check if running as root, if not, use sudo
if [ "$EUID" -ne 0 ]; then
    echo "py-spy requires root permissions on macOS. Running with sudo..."
    sudo "$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d ${DURATION} -p $SERVER_PID
else
    "$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d ${DURATION} -p $SERVER_PID
fi

echo ""
echo "Profiling completed! Results saved to: ${OUTPUT_FILE}.svg"
echo ""
echo "To view the flame graph:"
echo "open ${OUTPUT_FILE}.svg"
148  docs/source/distributions/k8s-benchmark/run-benchmark.sh  (Executable file)

@@ -0,0 +1,148 @@
#!/usr/bin/env bash

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

set -euo pipefail

# Default values
TARGET="stack"
DURATION=60
CONCURRENT=10

# Parse command line arguments
usage() {
    echo "Usage: $0 [options]"
    echo "Options:"
    echo "  -t, --target <stack|vllm>     Target to benchmark (default: stack)"
    echo "  -d, --duration <seconds>      Duration in seconds (default: 60)"
    echo "  -c, --concurrent <users>      Number of concurrent users (default: 10)"
    echo "  -h, --help                    Show this help message"
    echo ""
    echo "Examples:"
    echo "  $0 --target vllm              # Benchmark vLLM direct"
    echo "  $0 --target stack             # Benchmark Llama Stack (default)"
    echo "  $0 -t vllm -d 120 -c 20       # vLLM with 120s duration, 20 users"
}

while [[ $# -gt 0 ]]; do
    case $1 in
        -t|--target)
            TARGET="$2"
            shift 2
            ;;
        -d|--duration)
            DURATION="$2"
            shift 2
            ;;
        -c|--concurrent)
            CONCURRENT="$2"
            shift 2
            ;;
        -h|--help)
            usage
            exit 0
            ;;
        *)
            echo "Unknown option: $1"
            usage
            exit 1
            ;;
    esac
done

# Validate target
if [[ "$TARGET" != "stack" && "$TARGET" != "vllm" ]]; then
    echo "Error: Target must be 'stack' or 'vllm'"
    usage
    exit 1
fi

# Set configuration based on target
if [[ "$TARGET" == "vllm" ]]; then
    BASE_URL="http://vllm-server:8000/v1"
    JOB_NAME="vllm-benchmark-job"
    echo "Benchmarking vLLM direct..."
else
    BASE_URL="http://llama-stack-benchmark-service:8323/v1/openai/v1"
    JOB_NAME="stack-benchmark-job"
    echo "Benchmarking Llama Stack..."
fi

echo "Configuration:"
echo "  Target: $TARGET"
echo "  Base URL: $BASE_URL"
echo "  Duration: ${DURATION}s"
echo "  Concurrent users: $CONCURRENT"
echo ""

# Create temporary job yaml
TEMP_YAML="/tmp/benchmark-job-temp-$(date +%s).yaml"
cat > "$TEMP_YAML" << EOF
apiVersion: batch/v1
kind: Job
metadata:
  name: $JOB_NAME
  namespace: default
spec:
  template:
    spec:
      containers:
      - name: benchmark
        image: python:3.11-slim
        command: ["/bin/bash"]
        args:
        - "-c"
        - |
          pip install aiohttp &&
          python3 /benchmark/benchmark.py \\
            --base-url $BASE_URL \\
            --model \${INFERENCE_MODEL} \\
            --duration $DURATION \\
            --concurrent $CONCURRENT
        env:
        - name: INFERENCE_MODEL
          value: "meta-llama/Llama-3.2-3B-Instruct"
        volumeMounts:
        - name: benchmark-script
          mountPath: /benchmark
        resources:
          requests:
            memory: "256Mi"
            cpu: "250m"
          limits:
            memory: "512Mi"
            cpu: "500m"
      volumes:
      - name: benchmark-script
        configMap:
          name: benchmark-script
      restartPolicy: Never
  backoffLimit: 3
EOF

echo "Creating benchmark ConfigMap..."
kubectl create configmap benchmark-script \
  --from-file=benchmark.py=benchmark.py \
  --dry-run=client -o yaml | kubectl apply -f -

echo "Cleaning up any existing benchmark job..."
kubectl delete job $JOB_NAME 2>/dev/null || true

echo "Deploying benchmark Job..."
kubectl apply -f "$TEMP_YAML"

echo "Waiting for job to start..."
kubectl wait --for=condition=Ready pod -l job-name=$JOB_NAME --timeout=60s

echo "Following benchmark logs..."
kubectl logs -f job/$JOB_NAME

echo "Job completed. Checking final status..."
kubectl get job $JOB_NAME

# Clean up temporary file
rm -f "$TEMP_YAML"
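The job created above installs aiohttp and drives the target with concurrent streaming chat completions. A rough sketch of that request pattern (benchmark.py itself is not shown in this diff, so the structure and names here are assumptions):

```python
import asyncio
import aiohttp

BASE_URL = "http://llama-stack-benchmark-service:8323/v1/openai/v1"  # stack target from run-benchmark.sh
MODEL = "meta-llama/Llama-3.2-3B-Instruct"

async def one_request(session: aiohttp.ClientSession) -> None:
    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Hi"}],
        "stream": True,
        "max_tokens": 100,
    }
    # Read the SSE stream until the terminating sentinel.
    async with session.post(f"{BASE_URL}/chat/completions", json=payload) as resp:
        async for line in resp.content:
            if line.strip() == b"data: [DONE]":
                break

async def main(concurrent: int = 10) -> None:
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(one_request(session) for _ in range(concurrent)))

asyncio.run(main())
```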
@@ -26,13 +26,6 @@ data:
       max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
       api_token: ${env.VLLM_API_TOKEN:=fake}
       tls_verify: ${env.VLLM_TLS_VERIFY:=true}
-    - provider_id: mock-vllm-inference
-      provider_type: remote::vllm
-      config:
-        url: http://openai-mock-service:${env.MOCK_INFERENCE_PORT}
-        max_tokens: 4096
-        api_token: fake
-        tls_verify: false
     - provider_id: sentence-transformers
       provider_type: inline::sentence-transformers
       config: {}

@@ -121,9 +114,6 @@ data:
     - model_id: ${env.SAFETY_MODEL}
       provider_id: vllm-safety
       model_type: llm
-    - model_id: ${env.MOCK_INFERENCE_MODEL}
-      provider_id: mock-vllm-inference
-      model_type: llm
     shields:
     - shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
     vector_dbs: []
@@ -44,8 +44,6 @@ spec:
           value: "${SAFETY_MODEL}"
         - name: TAVILY_SEARCH_API_KEY
           value: "${TAVILY_SEARCH_API_KEY}"
-        - name: MOCK_INFERENCE_PORT
-          value: "${MOCK_INFERENCE_PORT}"
         - name: VLLM_URL
           value: http://vllm-server.default.svc.cluster.local:8000/v1
        - name: VLLM_MAX_TOKENS

@@ -54,8 +52,6 @@ spec:
           value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
         - name: VLLM_TLS_VERIFY
           value: "false"
-        - name: MOCK_INFERENCE_MODEL
-          value: "${MOCK_INFERENCE_MODEL}"
         command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8323"]
         ports:
         - containerPort: 8323
@@ -3,7 +3,6 @@ image_name: kubernetes-benchmark-demo
 apis:
 - agents
 - inference
-- safety
 - telemetry
 - tool_runtime
 - vector_io

@@ -16,20 +15,6 @@ providers:
       max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
       api_token: ${env.VLLM_API_TOKEN:=fake}
       tls_verify: ${env.VLLM_TLS_VERIFY:=true}
-  - provider_id: vllm-safety
-    provider_type: remote::vllm
-    config:
-      url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
-      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
-      api_token: ${env.VLLM_API_TOKEN:=fake}
-      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
-  - provider_id: mock-vllm-inference
-    provider_type: remote::vllm
-    config:
-      url: http://openai-mock-service:${env.MOCK_INFERENCE_PORT}
-      max_tokens: 4096
-      api_token: fake
-      tls_verify: false
   - provider_id: sentence-transformers
     provider_type: inline::sentence-transformers
     config: {}

@@ -45,11 +30,6 @@ providers:
         db: ${env.POSTGRES_DB:=llamastack}
         user: ${env.POSTGRES_USER:=llamastack}
         password: ${env.POSTGRES_PASSWORD:=llamastack}
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config:
-      excluded_categories: []
   agents:
   - provider_id: meta-reference
     provider_type: inline::meta-reference

@@ -115,14 +95,6 @@ models:
 - model_id: ${env.INFERENCE_MODEL}
   provider_id: vllm-inference
   model_type: llm
-- model_id: ${env.SAFETY_MODEL}
-  provider_id: vllm-safety
-  model_type: llm
-- model_id: ${env.MOCK_INFERENCE_MODEL}
-  provider_id: mock-vllm-inference
-  model_type: llm
-shields:
-- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
 vector_dbs: []
 datasets: []
 scoring_fns: []
@@ -2,6 +2,15 @@

 ## Overview

+Agents API for creating and interacting with agentic systems.
+
+Main functionalities provided by this API:
+- Create agents with specific instructions and ability to use tools.
+- Interactions with agents are grouped into sessions ("threads"), and each interaction is called a "turn".
+- Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).
+- Agents can be provided with various shields (see the Safety API for more details).
+- Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details.
+
 This section contains documentation for all available providers for the **agents** API.

 ## Providers
24  docs/source/providers/batches/index.md  (Normal file)

@@ -0,0 +1,24 @@
# Batches

## Overview

The Batches API enables efficient processing of multiple requests in a single operation,
particularly useful for processing large datasets, batch evaluation workflows, and
cost-effective inference at scale.

The API is designed to allow use of openai client libraries for seamless integration.

This API provides the following extensions:
 - idempotent batch creation

Note: This API is currently under active development and may undergo changes.

This section contains documentation for all available providers for the **batches** API.

## Providers

```{toctree}
:maxdepth: 1

inline_reference
```
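Because the routes are OpenAI-compatible, the stock `openai` Python client can drive them directly. A minimal sketch; the server address, dummy API key, and the JSONL input file are assumptions, not part of this change:

```python
from openai import OpenAI

# Assumed local Llama Stack server exposing the OpenAI-compatible prefix.
client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

# Upload a JSONL file of requests, then create a batch over it.
batch_input = client.files.create(file=open("requests.jsonl", "rb"), purpose="batch")
batch = client.batches.create(
    input_file_id=batch_input.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={"run": "docs-example"},
)
print(batch.id, batch.status)
```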
23  docs/source/providers/batches/inline_reference.md  (Normal file)

@@ -0,0 +1,23 @@
# inline::reference

## Description

Reference implementation of batches API with KVStore persistence.

## Configuration

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Configuration for the key-value store backend. |
| `max_concurrent_batches` | `<class 'int'>` | No | 1 | Maximum number of concurrent batches to process simultaneously. |
| `max_concurrent_requests_per_batch` | `<class 'int'>` | No | 10 | Maximum number of concurrent requests to process per batch. |

## Sample Configuration

```yaml
kvstore:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/batches.db

```
@@ -2,6 +2,8 @@

 ## Overview

+Llama Stack Evaluation API for running evaluations on model and agent candidates.
+
 This section contains documentation for all available providers for the **eval** API.

 ## Providers
@@ -10,4 +10,5 @@ This section contains documentation for all available providers for the **files*
 :maxdepth: 1

 inline_localfs
+remote_s3
 ```
33  docs/source/providers/files/remote_s3.md  (Normal file)

@@ -0,0 +1,33 @@
# remote::s3

## Description

AWS S3-based file storage provider for scalable cloud file management with metadata persistence.

## Configuration

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `bucket_name` | `<class 'str'>` | No |  | S3 bucket name to store files |
| `region` | `<class 'str'>` | No | us-east-1 | AWS region where the bucket is located |
| `aws_access_key_id` | `str \| None` | No |  | AWS access key ID (optional if using IAM roles) |
| `aws_secret_access_key` | `str \| None` | No |  | AWS secret access key (optional if using IAM roles) |
| `endpoint_url` | `str \| None` | No |  | Custom S3 endpoint URL (for MinIO, LocalStack, etc.) |
| `auto_create_bucket` | `<class 'bool'>` | No | False | Automatically create the S3 bucket if it doesn't exist |
| `metadata_store` | `utils.sqlstore.sqlstore.SqliteSqlStoreConfig \| utils.sqlstore.sqlstore.PostgresSqlStoreConfig` | No | sqlite | SQL store configuration for file metadata |

## Sample Configuration

```yaml
bucket_name: ${env.S3_BUCKET_NAME}
region: ${env.AWS_REGION:=us-east-1}
aws_access_key_id: ${env.AWS_ACCESS_KEY_ID:=}
aws_secret_access_key: ${env.AWS_SECRET_ACCESS_KEY:=}
endpoint_url: ${env.S3_ENDPOINT_URL:=}
auto_create_bucket: ${env.S3_AUTO_CREATE_BUCKET:=false}
metadata_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/s3_files_metadata.db

```
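With this provider configured, files uploaded through the stack's OpenAI-compatible Files API land in the configured S3 bucket while their metadata goes to the metadata_store. A hedged sketch using the openai client; the server address and prefix are assumptions:

```python
from openai import OpenAI

# Assumed local Llama Stack server exposing the OpenAI-compatible prefix.
client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

# Upload a file; bytes are written to the S3 bucket, metadata to the SQL store.
uploaded = client.files.create(file=open("notes.txt", "rb"), purpose="assistants")
print(uploaded.id, uploaded.filename)
```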
@@ -2,6 +2,12 @@

 ## Overview

+Llama Stack Inference API for generating completions, chat completions, and embeddings.
+
+This API provides the raw interface to the underlying models. Two kinds of models are supported:
+- LLM models: these models generate "raw" and "chat" (conversational) completions.
+- Embedding models: these models generate embeddings to be used for semantic search.
+
 This section contains documentation for all available providers for the **inference** API.

 ## Providers
@@ -9,7 +9,9 @@ This section contains documentation for all available providers for the **post_t
 ```{toctree}
 :maxdepth: 1

-inline_huggingface
-inline_torchtune
+inline_huggingface-cpu
+inline_huggingface-gpu
+inline_torchtune-cpu
+inline_torchtune-gpu
 remote_nvidia
 ```
41  docs/source/providers/post_training/inline_huggingface-cpu.md  (Normal file)

@@ -0,0 +1,41 @@
# inline::huggingface-cpu

## Description

HuggingFace-based post-training provider for fine-tuning models using the HuggingFace ecosystem.

## Configuration

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `device` | `<class 'str'>` | No | cuda | |
| `distributed_backend` | `Literal['fsdp', 'deepspeed'` | No | | |
| `checkpoint_format` | `Literal['full_state', 'huggingface'` | No | huggingface | |
| `chat_template` | `<class 'str'>` | No | <|user|>
{input}
<|assistant|>
{output} | |
| `model_specific_config` | `<class 'dict'>` | No | {'trust_remote_code': True, 'attn_implementation': 'sdpa'} | |
| `max_seq_length` | `<class 'int'>` | No | 2048 | |
| `gradient_checkpointing` | `<class 'bool'>` | No | False | |
| `save_total_limit` | `<class 'int'>` | No | 3 | |
| `logging_steps` | `<class 'int'>` | No | 10 | |
| `warmup_ratio` | `<class 'float'>` | No | 0.1 | |
| `weight_decay` | `<class 'float'>` | No | 0.01 | |
| `dataloader_num_workers` | `<class 'int'>` | No | 4 | |
| `dataloader_pin_memory` | `<class 'bool'>` | No | True | |
| `dpo_beta` | `<class 'float'>` | No | 0.1 | |
| `use_reference_model` | `<class 'bool'>` | No | True | |
| `dpo_loss_type` | `Literal['sigmoid', 'hinge', 'ipo', 'kto_pair'` | No | sigmoid | |
| `dpo_output_dir` | `<class 'str'>` | No | | |

## Sample Configuration

```yaml
checkpoint_format: huggingface
distributed_backend: null
device: cpu
dpo_output_dir: ~/.llama/dummy/dpo_output

```
41  docs/source/providers/post_training/inline_huggingface-gpu.md  (Normal file)

@@ -0,0 +1,41 @@
# inline::huggingface-gpu

## Description

HuggingFace-based post-training provider for fine-tuning models using the HuggingFace ecosystem.

## Configuration

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `device` | `<class 'str'>` | No | cuda | |
| `distributed_backend` | `Literal['fsdp', 'deepspeed'` | No | | |
| `checkpoint_format` | `Literal['full_state', 'huggingface'` | No | huggingface | |
| `chat_template` | `<class 'str'>` | No | <|user|>
{input}
<|assistant|>
{output} | |
| `model_specific_config` | `<class 'dict'>` | No | {'trust_remote_code': True, 'attn_implementation': 'sdpa'} | |
| `max_seq_length` | `<class 'int'>` | No | 2048 | |
| `gradient_checkpointing` | `<class 'bool'>` | No | False | |
| `save_total_limit` | `<class 'int'>` | No | 3 | |
| `logging_steps` | `<class 'int'>` | No | 10 | |
| `warmup_ratio` | `<class 'float'>` | No | 0.1 | |
| `weight_decay` | `<class 'float'>` | No | 0.01 | |
| `dataloader_num_workers` | `<class 'int'>` | No | 4 | |
| `dataloader_pin_memory` | `<class 'bool'>` | No | True | |
| `dpo_beta` | `<class 'float'>` | No | 0.1 | |
| `use_reference_model` | `<class 'bool'>` | No | True | |
| `dpo_loss_type` | `Literal['sigmoid', 'hinge', 'ipo', 'kto_pair'` | No | sigmoid | |
| `dpo_output_dir` | `<class 'str'>` | No | | |

## Sample Configuration

```yaml
checkpoint_format: huggingface
distributed_backend: null
device: cpu
dpo_output_dir: ~/.llama/dummy/dpo_output

```
20  docs/source/providers/post_training/inline_torchtune-cpu.md  (Normal file)

@@ -0,0 +1,20 @@
# inline::torchtune-cpu

## Description

TorchTune-based post-training provider for fine-tuning and optimizing models using Meta's TorchTune framework.

## Configuration

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `torch_seed` | `int \| None` | No | | |
| `checkpoint_format` | `Literal['meta', 'huggingface'` | No | meta | |

## Sample Configuration

```yaml
checkpoint_format: meta

```
20  docs/source/providers/post_training/inline_torchtune-gpu.md  (Normal file)

@@ -0,0 +1,20 @@
# inline::torchtune-gpu

## Description

TorchTune-based post-training provider for fine-tuning and optimizing models using Meta's TorchTune framework.

## Configuration

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `torch_seed` | `int \| None` | No | | |
| `checkpoint_format` | `Literal['meta', 'huggingface'` | No | meta | |

## Sample Configuration

```yaml
checkpoint_format: meta

```
@@ -623,6 +623,62 @@ class OpenAIResponseObjectStreamResponseMcpCallCompleted(BaseModel):
     type: Literal["response.mcp_call.completed"] = "response.mcp_call.completed"


+@json_schema_type
+class OpenAIResponseContentPartOutputText(BaseModel):
+    type: Literal["output_text"] = "output_text"
+    text: str
+    # TODO: add annotations, logprobs, etc.
+
+
+@json_schema_type
+class OpenAIResponseContentPartRefusal(BaseModel):
+    type: Literal["refusal"] = "refusal"
+    refusal: str
+
+
+OpenAIResponseContentPart = Annotated[
+    OpenAIResponseContentPartOutputText | OpenAIResponseContentPartRefusal,
+    Field(discriminator="type"),
+]
+register_schema(OpenAIResponseContentPart, name="OpenAIResponseContentPart")
+
+
+@json_schema_type
+class OpenAIResponseObjectStreamResponseContentPartAdded(BaseModel):
+    """Streaming event for when a new content part is added to a response item.
+
+    :param response_id: Unique identifier of the response containing this content
+    :param item_id: Unique identifier of the output item containing this content part
+    :param part: The content part that was added
+    :param sequence_number: Sequential number for ordering streaming events
+    :param type: Event type identifier, always "response.content_part.added"
+    """
+
+    response_id: str
+    item_id: str
+    part: OpenAIResponseContentPart
+    sequence_number: int
+    type: Literal["response.content_part.added"] = "response.content_part.added"
+
+
+@json_schema_type
+class OpenAIResponseObjectStreamResponseContentPartDone(BaseModel):
+    """Streaming event for when a content part is completed.
+
+    :param response_id: Unique identifier of the response containing this content
+    :param item_id: Unique identifier of the output item containing this content part
+    :param part: The completed content part
+    :param sequence_number: Sequential number for ordering streaming events
+    :param type: Event type identifier, always "response.content_part.done"
+    """
+
+    response_id: str
+    item_id: str
+    part: OpenAIResponseContentPart
+    sequence_number: int
+    type: Literal["response.content_part.done"] = "response.content_part.done"
+
+
 OpenAIResponseObjectStream = Annotated[
     OpenAIResponseObjectStreamResponseCreated
     | OpenAIResponseObjectStreamResponseOutputItemAdded

@@ -642,6 +698,8 @@ OpenAIResponseObjectStream = Annotated[
     | OpenAIResponseObjectStreamResponseMcpCallInProgress
     | OpenAIResponseObjectStreamResponseMcpCallFailed
     | OpenAIResponseObjectStreamResponseMcpCallCompleted
+    | OpenAIResponseObjectStreamResponseContentPartAdded
+    | OpenAIResponseObjectStreamResponseContentPartDone
     | OpenAIResponseObjectStreamResponseCompleted,
     Field(discriminator="type"),
 ]
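A hedged sketch of a client-side consumer that reacts to the two new content-part events; treating the streamed events as plain dicts keyed by `type` is an assumption about how a caller deserializes the stream:

```python
def completed_text_parts(events) -> list[str]:
    """Collect finished content parts from a Responses stream."""
    texts: list[str] = []
    for event in events:
        if event.get("type") != "response.content_part.done":
            continue
        part = event["part"]
        if part["type"] == "output_text":
            texts.append(part["text"])
        elif part["type"] == "refusal":
            texts.append(f"[refusal] {part['refusal']}")
    return texts
```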
9  llama_stack/apis/batches/__init__.py  (Normal file)

@@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .batches import Batches, BatchObject, ListBatchesResponse

__all__ = ["Batches", "BatchObject", "ListBatchesResponse"]
95  llama_stack/apis/batches/batches.py  (Normal file)

@@ -0,0 +1,95 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Literal, Protocol, runtime_checkable

from pydantic import BaseModel, Field

from llama_stack.schema_utils import json_schema_type, webmethod

try:
    from openai.types import Batch as BatchObject
except ImportError as e:
    raise ImportError("OpenAI package is required for batches API. Please install it with: pip install openai") from e


@json_schema_type
class ListBatchesResponse(BaseModel):
    """Response containing a list of batch objects."""

    object: Literal["list"] = "list"
    data: list[BatchObject] = Field(..., description="List of batch objects")
    first_id: str | None = Field(default=None, description="ID of the first batch in the list")
    last_id: str | None = Field(default=None, description="ID of the last batch in the list")
    has_more: bool = Field(default=False, description="Whether there are more batches available")


@runtime_checkable
class Batches(Protocol):
    """
    The Batches API enables efficient processing of multiple requests in a single operation,
    particularly useful for processing large datasets, batch evaluation workflows, and
    cost-effective inference at scale.

    The API is designed to allow use of openai client libraries for seamless integration.

    This API provides the following extensions:
     - idempotent batch creation

    Note: This API is currently under active development and may undergo changes.
    """

    @webmethod(route="/openai/v1/batches", method="POST")
    async def create_batch(
        self,
        input_file_id: str,
        endpoint: str,
        completion_window: Literal["24h"],
        metadata: dict[str, str] | None = None,
        idempotency_key: str | None = None,
    ) -> BatchObject:
        """Create a new batch for processing multiple API requests.

        :param input_file_id: The ID of an uploaded file containing requests for the batch.
        :param endpoint: The endpoint to be used for all requests in the batch.
        :param completion_window: The time window within which the batch should be processed.
        :param metadata: Optional metadata for the batch.
        :param idempotency_key: Optional idempotency key. When provided, enables idempotent behavior.
        :returns: The created batch object.
        """
        ...

    @webmethod(route="/openai/v1/batches/{batch_id}", method="GET")
    async def retrieve_batch(self, batch_id: str) -> BatchObject:
        """Retrieve information about a specific batch.

        :param batch_id: The ID of the batch to retrieve.
        :returns: The batch object.
        """
        ...

    @webmethod(route="/openai/v1/batches/{batch_id}/cancel", method="POST")
    async def cancel_batch(self, batch_id: str) -> BatchObject:
        """Cancel a batch that is in progress.

        :param batch_id: The ID of the batch to cancel.
        :returns: The updated batch object.
        """
        ...

    @webmethod(route="/openai/v1/batches", method="GET")
    async def list_batches(
        self,
        after: str | None = None,
        limit: int = 20,
    ) -> ListBatchesResponse:
        """List all batches for the current user.

        :param after: A cursor for pagination; returns batches after this batch ID.
        :param limit: Number of batches to return (default 20, max 100).
        :returns: A list of batch objects.
        """
        ...
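A sketch of the idempotent-creation extension described in the docstring, using plain httpx against the routes declared above; the base URL and the exact placement of `idempotency_key` in the JSON body are assumptions:

```python
import httpx

BASE = "http://localhost:8321/v1/openai/v1"  # assumed server address

body = {
    "input_file_id": "file-abc123",  # placeholder file id
    "endpoint": "/v1/chat/completions",
    "completion_window": "24h",
    "idempotency_key": "nightly-eval-001",
}
first = httpx.post(f"{BASE}/batches", json=body).json()
second = httpx.post(f"{BASE}/batches", json=body).json()
# With the same key and identical parameters, the second call should return the same batch.
print(first["id"] == second["id"])
```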
@@ -72,3 +72,10 @@ class ModelTypeError(TypeError):
             f"Model '{model_name}' is of type '{model_type}' rather than the expected type '{expected_model_type}'"
         )
         super().__init__(message)
+
+
+class ConflictError(ValueError):
+    """raised when an operation cannot be performed due to a conflict with the current state"""
+
+    def __init__(self, message: str) -> None:
+        super().__init__(message)
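Illustrative only: one way a provider could use ConflictError, for example when an idempotency key is reused with different parameters; the import path is assumed from this diff:

```python
from llama_stack.apis.common.errors import ConflictError  # assumed module path

def check_idempotent_reuse(key: str, existing_params: dict, new_params: dict) -> None:
    # Reusing a key with different parameters conflicts with the existing state.
    if existing_params != new_params:
        raise ConflictError(f"idempotency key '{key}' was already used with different parameters")
```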
@@ -86,6 +86,7 @@ class Api(Enum, metaclass=DynamicApiMeta):
     :cvar inference: Text generation, chat completions, and embeddings
     :cvar safety: Content moderation and safety shields
     :cvar agents: Agent orchestration and execution
+    :cvar batches: Batch processing for asynchronous API requests
     :cvar vector_io: Vector database operations and queries
     :cvar datasetio: Dataset input/output operations
     :cvar scoring: Model output evaluation and scoring

@@ -108,6 +109,7 @@ class Api(Enum, metaclass=DynamicApiMeta):
     inference = "inference"
     safety = "safety"
     agents = "agents"
+    batches = "batches"
     vector_io = "vector_io"
     datasetio = "datasetio"
     scoring = "scoring"
@@ -22,6 +22,7 @@ class OpenAIFilePurpose(StrEnum):
     """

     ASSISTANTS = "assistants"
+    BATCH = "batch"
     # TODO: Add other purposes as needed
@@ -473,6 +473,28 @@ class EmbeddingsResponse(BaseModel):
     embeddings: list[list[float]]


+@json_schema_type
+class RerankData(BaseModel):
+    """A single rerank result from a reranking response.
+
+    :param index: The original index of the document in the input list
+    :param relevance_score: The relevance score from the model output. Values are inverted when applicable so that higher scores indicate greater relevance.
+    """
+
+    index: int
+    relevance_score: float
+
+
+@json_schema_type
+class RerankResponse(BaseModel):
+    """Response from a reranking request.
+
+    :param data: List of rerank result objects, sorted by relevance score (descending)
+    """
+
+    data: list[RerankData]
+
+
 @json_schema_type
 class OpenAIChatCompletionContentPartTextParam(BaseModel):
     """Text content part for OpenAI-compatible chat completion messages.

@@ -1046,6 +1068,7 @@ class InferenceProvider(Protocol):
         :returns: A BatchCompletionResponse with the full completions.
         """
         raise NotImplementedError("Batch completion is not implemented")
+        return  # this is so mypy's safe-super rule will consider the method concrete

     @webmethod(route="/inference/chat-completion", method="POST")
     async def chat_completion(

@@ -1110,6 +1133,7 @@ class InferenceProvider(Protocol):
         :returns: A BatchChatCompletionResponse with the full completions.
         """
         raise NotImplementedError("Batch chat completion is not implemented")
+        return  # this is so mypy's safe-super rule will consider the method concrete

     @webmethod(route="/inference/embeddings", method="POST")
     async def embeddings(

@@ -1131,6 +1155,25 @@ class InferenceProvider(Protocol):
         """
         ...

+    @webmethod(route="/inference/rerank", method="POST", experimental=True)
+    async def rerank(
+        self,
+        model: str,
+        query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
+        items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
+        max_num_results: int | None = None,
+    ) -> RerankResponse:
+        """Rerank a list of documents based on their relevance to a query.
+
+        :param model: The identifier of the reranking model to use.
+        :param query: The search query to rank items against. Can be a string, text content part, or image content part. The input must not exceed the model's max input token length.
+        :param items: List of items to rerank. Each item can be a string, text content part, or image content part. Each input must not exceed the model's max input token length.
+        :param max_num_results: (Optional) Maximum number of results to return. Default: returns all.
+        :returns: RerankResponse with indices sorted by relevance score (descending).
+        """
+        raise NotImplementedError("Reranking is not implemented")
+        return  # this is so mypy's safe-super rule will consider the method concrete
+
     @webmethod(route="/openai/v1/completions", method="POST")
     async def openai_completion(
         self,
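A hedged sketch of calling the new experimental rerank route over HTTP. The request and response shapes follow the signature above; the server address, the wire format, and the model id are assumptions:

```python
import httpx

resp = httpx.post(
    "http://localhost:8321/v1/inference/rerank",  # assumed server address
    json={
        "model": "example/reranker",  # placeholder model id
        "query": "How do I restart the server?",
        "items": [
            "Restart with `llama stack run` after editing the config.",
            "The moon is about 384,000 km from Earth.",
        ],
        "max_num_results": 1,
    },
)
# Results come back sorted by relevance score, highest first.
for item in resp.json()["data"]:
    print(item["index"], item["relevance_score"])
```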
@@ -386,6 +386,7 @@ class MetricDataPoint(BaseModel):

     timestamp: int
     value: float
+    unit: str


 @json_schema_type

@@ -518,7 +519,7 @@ class Telemetry(Protocol):
         metric_name: str,
         start_time: int,
         end_time: int | None = None,
-        granularity: str | None = "1d",
+        granularity: str | None = None,
         query_type: MetricQueryType = MetricQueryType.RANGE,
         label_matchers: list[MetricLabelMatcher] | None = None,
     ) -> QueryMetricsResponse:
@@ -15,7 +15,7 @@ from llama_stack.log import get_logger

 REPO_ROOT = Path(__file__).parent.parent.parent.parent

-logger = get_logger(name=__name__, category="server")
+logger = get_logger(name=__name__, category="cli")


 class StackRun(Subcommand):
@@ -5,7 +5,6 @@
 # the root directory of this source tree.

 import importlib.resources
-import logging
 import sys

 from pydantic import BaseModel

@@ -17,9 +16,10 @@ from llama_stack.core.external import load_external_apis
 from llama_stack.core.utils.exec import run_command
 from llama_stack.core.utils.image_types import LlamaStackImageType
 from llama_stack.distributions.template import DistributionTemplate
+from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import Api

-log = logging.getLogger(__name__)
+log = get_logger(name=__name__, category="core")

 # These are the dependencies needed by the distribution server.
 # `llama-stack` is automatically installed by the installation script.
@ -1,207 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
||||||
# All rights reserved.
|
|
||||||
#
|
|
||||||
# This source code is licensed under the terms described in the LICENSE file in
|
|
||||||
# the root directory of this source tree.
|
|
||||||
|
|
||||||
LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
|
|
||||||
LLAMA_STACK_CLIENT_DIR=${LLAMA_STACK_CLIENT_DIR:-}
|
|
||||||
TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}
|
|
||||||
PYPI_VERSION=${PYPI_VERSION:-}
|
|
||||||
# This timeout (in seconds) is necessary when installing PyTorch via uv since it's likely to time out
|
|
||||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
|
||||||
UV_HTTP_TIMEOUT=${UV_HTTP_TIMEOUT:-500}
|
|
||||||
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# Define color codes
|
|
||||||
RED='\033[0;31m'
|
|
||||||
GREEN='\033[0;32m'
|
|
||||||
NC='\033[0m' # No Color
|
|
||||||
|
|
||||||
SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
|
|
||||||
source "$SCRIPT_DIR/common.sh"
|
|
||||||
|
|
||||||
# Usage function
|
|
||||||
usage() {
|
|
||||||
echo "Usage: $0 --env-name <conda_env_name> --build-file-path <build_file_path> --normal-deps <pip_dependencies> [--external-provider-deps <external_provider_deps>] [--optional-deps <special_pip_deps>]"
|
|
||||||
echo "Example: $0 --env-name my-conda-env --build-file-path ./my-stack-build.yaml --normal-deps 'numpy pandas scipy' --external-provider-deps 'foo' --optional-deps 'bar'"
|
|
||||||
exit 1
|
|
||||||
}
|
|
||||||
|
|
||||||
# Parse arguments
|
|
||||||
env_name=""
|
|
||||||
build_file_path=""
|
|
||||||
normal_deps=""
|
|
||||||
external_provider_deps=""
|
|
||||||
optional_deps=""
|
|
||||||
|
|
||||||
while [[ $# -gt 0 ]]; do
|
|
||||||
key="$1"
|
|
||||||
case "$key" in
|
|
||||||
--env-name)
|
|
||||||
if [[ -z "$2" || "$2" == --* ]]; then
|
|
||||||
echo "Error: --env-name requires a string value" >&2
|
|
||||||
usage
|
|
||||||
fi
|
|
||||||
env_name="$2"
|
|
||||||
shift 2
|
|
||||||
;;
|
|
||||||
--build-file-path)
|
|
||||||
if [[ -z "$2" || "$2" == --* ]]; then
|
|
||||||
echo "Error: --build-file-path requires a string value" >&2
|
|
||||||
usage
|
|
||||||
fi
|
|
||||||
build_file_path="$2"
|
|
||||||
shift 2
|
|
||||||
;;
|
|
||||||
--normal-deps)
|
|
||||||
if [[ -z "$2" || "$2" == --* ]]; then
|
|
||||||
echo "Error: --normal-deps requires a string value" >&2
|
|
||||||
usage
|
|
||||||
fi
|
|
||||||
normal_deps="$2"
|
|
||||||
shift 2
|
|
||||||
;;
|
|
||||||
--external-provider-deps)
|
|
||||||
if [[ -z "$2" || "$2" == --* ]]; then
|
|
||||||
echo "Error: --external-provider-deps requires a string value" >&2
|
|
||||||
usage
|
|
||||||
fi
|
|
||||||
external_provider_deps="$2"
|
|
||||||
shift 2
|
|
||||||
;;
|
|
||||||
--optional-deps)
|
|
||||||
if [[ -z "$2" || "$2" == --* ]]; then
|
|
||||||
echo "Error: --optional-deps requires a string value" >&2
|
|
||||||
usage
|
|
||||||
fi
|
|
||||||
optional_deps="$2"
|
|
||||||
shift 2
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
echo "Unknown option: $1" >&2
|
|
||||||
usage
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
done
|
|
||||||
|
|
||||||
# Check required arguments
|
|
||||||
if [[ -z "$env_name" || -z "$build_file_path" || -z "$normal_deps" ]]; then
|
|
||||||
echo "Error: --env-name, --build-file-path, and --normal-deps are required." >&2
|
|
||||||
usage
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ -n "$LLAMA_STACK_DIR" ]; then
|
|
||||||
echo "Using llama-stack-dir=$LLAMA_STACK_DIR"
|
|
||||||
fi
|
|
||||||
if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
|
|
||||||
echo "Using llama-stack-client-dir=$LLAMA_STACK_CLIENT_DIR"
|
|
||||||
fi
|
|
||||||
|
|
||||||
ensure_conda_env_python310() {
|
|
||||||
# Use only global variables set by flag parser
|
|
||||||
local python_version="3.12"
|
|
||||||
|
|
||||||
if ! is_command_available conda; then
|
|
||||||
printf "${RED}Error: conda command not found. Is Conda installed and in your PATH?${NC}" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
if conda env list | grep -q "^${env_name} "; then
|
|
||||||
printf "Conda environment '${env_name}' exists. Checking Python version...\n"
|
|
||||||
current_version=$(conda run -n "${env_name}" python --version 2>&1 | cut -d' ' -f2 | cut -d'.' -f1,2)
|
|
||||||
if [ "$current_version" = "$python_version" ]; then
|
|
||||||
printf "Environment '${env_name}' already has Python ${python_version}. No action needed.\n"
|
|
||||||
else
|
|
||||||
printf "Updating environment '${env_name}' to Python ${python_version}...\n"
|
|
||||||
conda install -n "${env_name}" python="${python_version}" -y
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
printf "Conda environment '${env_name}' does not exist. Creating with Python ${python_version}...\n"
|
|
||||||
conda create -n "${env_name}" python="${python_version}" -y
|
|
||||||
fi
|
|
||||||
|
|
||||||
eval "$(conda shell.bash hook)"
|
|
||||||
conda deactivate && conda activate "${env_name}"
|
|
||||||
"$CONDA_PREFIX"/bin/pip install uv
|
|
||||||
|
|
||||||
if [ -n "$TEST_PYPI_VERSION" ]; then
|
|
||||||
uv pip install fastapi libcst
|
|
||||||
uv pip install --extra-index-url https://test.pypi.org/simple/ \
|
|
||||||
llama-stack=="$TEST_PYPI_VERSION" \
|
|
||||||
"$normal_deps"
|
|
||||||
if [ -n "$optional_deps" ]; then
|
|
||||||
IFS='#' read -ra parts <<<"$optional_deps"
|
|
||||||
for part in "${parts[@]}"; do
|
|
||||||
echo "$part"
|
|
||||||
uv pip install $part
|
|
||||||
done
|
|
||||||
fi
|
|
||||||
if [ -n "$external_provider_deps" ]; then
|
|
||||||
IFS='#' read -ra parts <<<"$external_provider_deps"
|
|
||||||
for part in "${parts[@]}"; do
|
|
||||||
echo "$part"
|
|
||||||
uv pip install "$part"
|
|
||||||
done
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
if [ -n "$LLAMA_STACK_DIR" ]; then
|
|
||||||
if [ ! -d "$LLAMA_STACK_DIR" ]; then
|
|
||||||
printf "${RED}Warning: LLAMA_STACK_DIR is set but directory does not exist: $LLAMA_STACK_DIR${NC}\n" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
printf "Installing from LLAMA_STACK_DIR: $LLAMA_STACK_DIR\n"
|
|
||||||
uv pip install --no-cache-dir -e "$LLAMA_STACK_DIR"
|
|
||||||
else
|
|
||||||
PYPI_VERSION="${PYPI_VERSION:-}"
|
|
||||||
if [ -n "$PYPI_VERSION" ]; then
|
|
||||||
SPEC_VERSION="llama-stack==${PYPI_VERSION}"
|
|
||||||
else
|
|
||||||
SPEC_VERSION="llama-stack"
|
|
||||||
fi
|
|
||||||
uv pip install --no-cache-dir "$SPEC_VERSION"
|
|
||||||
fi
|
|
||||||
if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
|
|
||||||
if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ]; then
|
|
||||||
printf "${RED}Warning: LLAMA_STACK_CLIENT_DIR is set but directory does not exist: $LLAMA_STACK_CLIENT_DIR${NC}\n" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
printf "Installing from LLAMA_STACK_CLIENT_DIR: $LLAMA_STACK_CLIENT_DIR\n"
|
|
||||||
uv pip install --no-cache-dir -e "$LLAMA_STACK_CLIENT_DIR"
|
|
||||||
fi
|
|
||||||
printf "Installing pip dependencies\n"
|
|
||||||
uv pip install $normal_deps
|
|
||||||
if [ -n "$optional_deps" ]; then
|
|
||||||
IFS='#' read -ra parts <<<"$optional_deps"
|
|
||||||
for part in "${parts[@]}"; do
|
|
||||||
echo "$part"
|
|
||||||
uv pip install $part
|
|
||||||
done
|
|
||||||
fi
|
|
||||||
if [ -n "$external_provider_deps" ]; then
|
|
||||||
IFS='#' read -ra parts <<<"$external_provider_deps"
|
|
||||||
for part in "${parts[@]}"; do
|
|
||||||
echo "Getting provider spec for module: $part and installing dependencies"
|
|
||||||
package_name=$(echo "$part" | sed 's/[<>=!].*//')
|
|
||||||
python3 -c "
|
|
||||||
import importlib
|
|
||||||
import sys
|
|
||||||
try:
|
|
||||||
module = importlib.import_module(f'$package_name.provider')
|
|
||||||
spec = module.get_provider_spec()
|
|
||||||
if hasattr(spec, 'pip_packages') and spec.pip_packages:
|
|
||||||
print('\\n'.join(spec.pip_packages))
|
|
||||||
except Exception as e:
|
|
||||||
print(f'Error getting provider spec for $package_name: {e}', file=sys.stderr)
|
|
||||||
" | uv pip install -r -
|
|
||||||
done
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
mv "$build_file_path" "$CONDA_PREFIX"/llamastack-build.yaml
|
|
||||||
echo "Build spec configuration saved at $CONDA_PREFIX/llamastack-build.yaml"
|
|
||||||
}
|
|
||||||
|
|
||||||
ensure_conda_env_python310 "$env_name" "$build_file_path" "$normal_deps" "$optional_deps" "$external_provider_deps"
|
|
|
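For context, the external-provider branch above relies on exactly two things from each module listed in external_provider_deps: an importable `<package>.provider` submodule and a `get_provider_spec()` function whose return value exposes a `pip_packages` list. A minimal sketch of such a module follows; the package name and the spec stand-in are illustrative only, not the real ProviderSpec constructor.

# my_custom_provider/provider.py  (hypothetical external provider package)
from dataclasses import dataclass, field


@dataclass
class _Spec:
    # stand-in for llama_stack.providers.datatypes.ProviderSpec;
    # only pip_packages is read by the build script above
    pip_packages: list[str] = field(default_factory=list)


def get_provider_spec() -> _Spec:
    # the build script prints these and pipes them into `uv pip install -r -`
    return _Spec(pip_packages=["httpx>=0.27", "pydantic>=2"])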
@@ -151,23 +151,37 @@ run() {
     fi
   else
     if [ -n "$LLAMA_STACK_DIR" ]; then
-      if [ ! -d "$LLAMA_STACK_DIR" ]; then
+      # only warn if DIR does not start with "git+"
+      if [ ! -d "$LLAMA_STACK_DIR" ] && [[ "$LLAMA_STACK_DIR" != git+* ]]; then
         printf "${RED}Warning: LLAMA_STACK_DIR is set but directory does not exist: %s${NC}\n" "$LLAMA_STACK_DIR" >&2
         exit 1
       fi
       printf "Installing from LLAMA_STACK_DIR: %s\n" "$LLAMA_STACK_DIR"
-      uv pip install --no-cache-dir -e "$LLAMA_STACK_DIR"
+      # editable only if LLAMA_STACK_DIR does not start with "git+"
+      if [[ "$LLAMA_STACK_DIR" != git+* ]]; then
+        EDITABLE="-e"
+      else
+        EDITABLE=""
+      fi
+      uv pip install --no-cache-dir $EDITABLE "$LLAMA_STACK_DIR"
     else
       uv pip install --no-cache-dir llama-stack
     fi

     if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
-      if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ]; then
+      # only warn if DIR does not start with "git+"
+      if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ] && [[ "$LLAMA_STACK_CLIENT_DIR" != git+* ]]; then
         printf "${RED}Warning: LLAMA_STACK_CLIENT_DIR is set but directory does not exist: %s${NC}\n" "$LLAMA_STACK_CLIENT_DIR" >&2
         exit 1
       fi
       printf "Installing from LLAMA_STACK_CLIENT_DIR: %s\n" "$LLAMA_STACK_CLIENT_DIR"
-      uv pip install --no-cache-dir -e "$LLAMA_STACK_CLIENT_DIR"
+      # editable only if LLAMA_STACK_CLIENT_DIR does not start with "git+"
+      if [[ "$LLAMA_STACK_CLIENT_DIR" != git+* ]]; then
+        EDITABLE="-e"
+      else
+        EDITABLE=""
+      fi
+      uv pip install --no-cache-dir $EDITABLE "$LLAMA_STACK_CLIENT_DIR"
     fi

     printf "Installing pip dependencies\n"
@@ -3,7 +3,6 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-import logging
 import textwrap
 from typing import Any

@@ -21,9 +20,10 @@ from llama_stack.core.stack import cast_image_name_to_string, replace_env_vars
 from llama_stack.core.utils.config_dirs import EXTERNAL_PROVIDERS_DIR
 from llama_stack.core.utils.dynamic import instantiate_class_type
 from llama_stack.core.utils.prompt_for_config import prompt_for_config
+from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import Api, ProviderSpec

-logger = logging.getLogger(__name__)
+logger = get_logger(name=__name__, category="core")


 def configure_single_provider(registry: dict[str, ProviderSpec], provider: Provider) -> Provider:
@@ -318,6 +318,41 @@ class QuotaConfig(BaseModel):
     period: QuotaPeriod = Field(default=QuotaPeriod.DAY, description="Quota period to set")


+class CORSConfig(BaseModel):
+    allow_origins: list[str] = Field(default_factory=list)
+    allow_origin_regex: str | None = Field(default=None)
+    allow_methods: list[str] = Field(default=["OPTIONS"])
+    allow_headers: list[str] = Field(default_factory=list)
+    allow_credentials: bool = Field(default=False)
+    expose_headers: list[str] = Field(default_factory=list)
+    max_age: int = Field(default=600, ge=0)
+
+    @model_validator(mode="after")
+    def validate_credentials_config(self) -> Self:
+        if self.allow_credentials and (self.allow_origins == ["*"] or "*" in self.allow_origins):
+            raise ValueError("Cannot use wildcard origins with credentials enabled")
+        return self
+
+
+def process_cors_config(cors_config: bool | CORSConfig | None) -> CORSConfig | None:
+    if cors_config is False or cors_config is None:
+        return None
+
+    if cors_config is True:
+        # dev mode: allow localhost on any port
+        return CORSConfig(
+            allow_origins=[],
+            allow_origin_regex=r"https?://localhost:\d+",
+            allow_methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"],
+            allow_headers=["Content-Type", "Authorization", "X-Requested-With"],
+        )
+
+    if isinstance(cors_config, CORSConfig):
+        return cors_config
+
+    raise ValueError(f"Expected bool or CORSConfig, got {type(cors_config).__name__}")
+
+
 class ServerConfig(BaseModel):
     port: int = Field(
         default=8321,
@@ -349,6 +384,12 @@ class ServerConfig(BaseModel):
         default=None,
         description="Per client quota request configuration",
     )
+    cors: bool | CORSConfig | None = Field(
+        default=None,
+        description="CORS configuration for cross-origin requests. Can be:\n"
+        "- true: Enable localhost CORS for development\n"
+        "- {allow_origins: [...], allow_methods: [...], ...}: Full configuration",
+    )


 class StackRunConfig(BaseModel):
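A quick sketch of how the new config type behaves, based only on the code in the hunk above (the import path is confirmed by the server.py hunk later in this diff; run it against a build that includes this change):

from llama_stack.core.datatypes import CORSConfig, process_cors_config

# `cors: true` in run.yaml -> permissive localhost-only defaults for development
dev = process_cors_config(True)
assert dev is not None and dev.allow_origin_regex == r"https?://localhost:\d+"

# `cors: false` or unset -> no middleware is installed
assert process_cors_config(False) is None

# a full configuration passes through unchanged
explicit = CORSConfig(allow_origins=["https://example.com"], allow_methods=["GET"])
assert process_cors_config(explicit) is explicit

# wildcard origins together with credentials are rejected by the validator
try:
    CORSConfig(allow_origins=["*"], allow_credentials=True)
except Exception as e:
    print("rejected:", e)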
@@ -7,7 +7,7 @@
 import asyncio
 import inspect
 import json
-import logging
+import logging  # allow-direct-logging
 import os
 import sys
 from concurrent.futures import ThreadPoolExecutor
@@ -48,6 +48,7 @@ from llama_stack.core.stack import (
 from llama_stack.core.utils.config import redact_sensitive_fields
 from llama_stack.core.utils.context import preserve_contexts_async_generator
 from llama_stack.core.utils.exec import in_notebook
+from llama_stack.log import get_logger
 from llama_stack.providers.utils.telemetry.tracing import (
     CURRENT_TRACE_CONTEXT,
     end_trace,
@@ -55,7 +56,7 @@ from llama_stack.providers.utils.telemetry.tracing import (
     start_trace,
 )

-logger = logging.getLogger(__name__)
+logger = get_logger(name=__name__, category="core")

 T = TypeVar("T")

@@ -145,39 +146,26 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
     ):
         super().__init__()
         self.async_client = AsyncLlamaStackAsLibraryClient(
-            config_path_or_distro_name, custom_provider_registry, provider_data
+            config_path_or_distro_name, custom_provider_registry, provider_data, skip_logger_removal
         )
         self.pool_executor = ThreadPoolExecutor(max_workers=4)
-        self.skip_logger_removal = skip_logger_removal
         self.provider_data = provider_data

         self.loop = asyncio.new_event_loop()

-    def initialize(self):
-        if in_notebook():
-            import nest_asyncio
-
-            nest_asyncio.apply()
-        if not self.skip_logger_removal:
-            self._remove_root_logger_handlers()
-
         # use a new event loop to avoid interfering with the main event loop
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
         try:
-            return loop.run_until_complete(self.async_client.initialize())
+            loop.run_until_complete(self.async_client.initialize())
         finally:
             asyncio.set_event_loop(None)

-    def _remove_root_logger_handlers(self):
+    def initialize(self):
         """
-        Remove all handlers from the root logger. Needed to avoid polluting the console with logs.
+        Deprecated method for backward compatibility.
         """
-        root_logger = logging.getLogger()
-
-        for handler in root_logger.handlers[:]:
-            root_logger.removeHandler(handler)
-            logger.info(f"Removed handler {handler.__class__.__name__} from root logger")
+        pass

     def request(self, *args, **kwargs):
         loop = self.loop
@@ -215,6 +203,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
         config_path_or_distro_name: str,
         custom_provider_registry: ProviderRegistry | None = None,
         provider_data: dict[str, Any] | None = None,
+        skip_logger_removal: bool = False,
     ):
         super().__init__()
         # when using the library client, we should not log to console since many
@@ -222,6 +211,13 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
         current_sinks = os.environ.get("TELEMETRY_SINKS", "sqlite").split(",")
         os.environ["TELEMETRY_SINKS"] = ",".join(sink for sink in current_sinks if sink != "console")

+        if in_notebook():
+            import nest_asyncio
+
+            nest_asyncio.apply()
+        if not skip_logger_removal:
+            self._remove_root_logger_handlers()
+
         if config_path_or_distro_name.endswith(".yaml"):
             config_path = Path(config_path_or_distro_name)
             if not config_path.exists():
@@ -238,7 +234,24 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
         self.provider_data = provider_data
         self.route_impls: RouteImpls | None = None  # Initialize to None to prevent AttributeError

+    def _remove_root_logger_handlers(self):
+        """
+        Remove all handlers from the root logger. Needed to avoid polluting the console with logs.
+        """
+        root_logger = logging.getLogger()
+
+        for handler in root_logger.handlers[:]:
+            root_logger.removeHandler(handler)
+            logger.info(f"Removed handler {handler.__class__.__name__} from root logger")
+
     async def initialize(self) -> bool:
+        """
+        Initialize the async client.
+
+        Returns:
+            bool: True if initialization was successful
+        """
+
         try:
             self.route_impls = None
             self.impls = await construct_stack(self.config, self.custom_provider_registry)
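With this change the sync wrapper is fully initialized by its constructor, and the old two-step construct-then-initialize() pattern survives only as a no-op. A minimal usage sketch follows; the import path is assumed from the core package layout seen elsewhere in this diff, and the distribution name "starter" is a placeholder rather than something this hunk mandates. The listing call at the end uses the llama-stack-client surface the wrapper inherits.

from llama_stack.core.library_client import LlamaStackAsLibraryClient

# construction now builds the stack eagerly on a private event loop;
# pass skip_logger_removal=True to keep your own root-logger handlers
client = LlamaStackAsLibraryClient("starter", skip_logger_removal=True)

# still allowed for backward compatibility, but it is a no-op now
client.initialize()

models = client.models.list()
print([m.identifier for m in models])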
@@ -6,15 +6,15 @@

 import contextvars
 import json
-import logging
 from contextlib import AbstractContextManager
 from typing import Any

 from llama_stack.core.datatypes import User
+from llama_stack.log import get_logger

 from .utils.dynamic import instantiate_class_type

-log = logging.getLogger(__name__)
+log = get_logger(name=__name__, category="core")

 # Context variable for request provider data and auth attributes
 PROVIDER_DATA_VAR = contextvars.ContextVar("provider_data", default=None)
@@ -8,6 +8,7 @@ import inspect
 from typing import Any

 from llama_stack.apis.agents import Agents
+from llama_stack.apis.batches import Batches
 from llama_stack.apis.benchmarks import Benchmarks
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
@@ -75,6 +76,7 @@ def api_protocol_map(external_apis: dict[Api, ExternalApiSpec] | None = None) ->
         Api.agents: Agents,
         Api.inference: Inference,
         Api.inspect: Inspect,
+        Api.batches: Batches,
         Api.vector_io: VectorIO,
         Api.vector_dbs: VectorDBs,
         Api.models: Models,
@@ -12,7 +12,7 @@ from llama_stack.apis.datasets import DatasetPurpose, DataSource
 from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import RoutingTable

-logger = get_logger(name=__name__, category="core")
+logger = get_logger(name=__name__, category="core::routers")


 class DatasetIORouter(DatasetIO):
@@ -16,7 +16,7 @@ from llama_stack.apis.scoring import (
 from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import RoutingTable

-logger = get_logger(name=__name__, category="core")
+logger = get_logger(name=__name__, category="core::routers")


 class ScoringRouter(Scoring):
@@ -65,7 +65,7 @@ from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable
 from llama_stack.providers.utils.inference.inference_store import InferenceStore
 from llama_stack.providers.utils.telemetry.tracing import get_current_span

-logger = get_logger(name=__name__, category="inference")
+logger = get_logger(name=__name__, category="core::routers")


 class InferenceRouter(Inference):
@@ -6,16 +6,14 @@

 from typing import Any

-from llama_stack.apis.inference import (
-    Message,
-)
+from llama_stack.apis.inference import Message
 from llama_stack.apis.safety import RunShieldResponse, Safety
 from llama_stack.apis.safety.safety import ModerationObject
 from llama_stack.apis.shields import Shield
 from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import RoutingTable

-logger = get_logger(name=__name__, category="core")
+logger = get_logger(name=__name__, category="core::routers")


 class SafetyRouter(Safety):
@@ -68,6 +66,7 @@ class SafetyRouter(Safety):
         list_shields_response = await self.routing_table.list_shields()

         matches = [s.identifier for s in list_shields_response.data if model == s.provider_resource_id]
+
         if not matches:
             raise ValueError(f"No shield associated with provider_resource id {model}")
         if len(matches) > 1:
@@ -22,7 +22,7 @@ from llama_stack.log import get_logger

 from ..routing_tables.toolgroups import ToolGroupsRoutingTable

-logger = get_logger(name=__name__, category="core")
+logger = get_logger(name=__name__, category="core::routers")


 class ToolRuntimeRouter(ToolRuntime):
@@ -30,7 +30,7 @@ from llama_stack.apis.vector_io import (
 from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable

-logger = get_logger(name=__name__, category="core")
+logger = get_logger(name=__name__, category="core::routers")


 class VectorIORouter(VectorIO):
@@ -14,7 +14,7 @@ from llama_stack.log import get_logger

 from .common import CommonRoutingTableImpl

-logger = get_logger(name=__name__, category="core")
+logger = get_logger(name=__name__, category="core::routing_tables")


 class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):
@@ -23,7 +23,7 @@ from llama_stack.core.store import DistributionRegistry
 from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import Api, RoutingTable

-logger = get_logger(name=__name__, category="core")
+logger = get_logger(name=__name__, category="core::routing_tables")


 def get_impl_api(p: Any) -> Api:
@@ -26,7 +26,7 @@ from llama_stack.log import get_logger

 from .common import CommonRoutingTableImpl

-logger = get_logger(name=__name__, category="core")
+logger = get_logger(name=__name__, category="core::routing_tables")


 class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets):
@@ -17,7 +17,7 @@ from llama_stack.log import get_logger

 from .common import CommonRoutingTableImpl, lookup_model

-logger = get_logger(name=__name__, category="core")
+logger = get_logger(name=__name__, category="core::routing_tables")


 class ModelsRoutingTable(CommonRoutingTableImpl, Models):
@@ -19,7 +19,7 @@ from llama_stack.log import get_logger

 from .common import CommonRoutingTableImpl

-logger = get_logger(name=__name__, category="core")
+logger = get_logger(name=__name__, category="core::routing_tables")


 class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, ScoringFunctions):
@@ -15,7 +15,7 @@ from llama_stack.log import get_logger

 from .common import CommonRoutingTableImpl

-logger = get_logger(name=__name__, category="core")
+logger = get_logger(name=__name__, category="core::routing_tables")


 class ShieldsRoutingTable(CommonRoutingTableImpl, Shields):
@@ -14,7 +14,7 @@ from llama_stack.log import get_logger

 from .common import CommonRoutingTableImpl

-logger = get_logger(name=__name__, category="core")
+logger = get_logger(name=__name__, category="core::routing_tables")


 def parse_toolgroup_from_toolgroup_name_pair(toolgroup_name_with_maybe_tool_name: str) -> str | None:
@@ -30,7 +30,7 @@ from llama_stack.log import get_logger

 from .common import CommonRoutingTableImpl, lookup_model

-logger = get_logger(name=__name__, category="core")
+logger = get_logger(name=__name__, category="core::routing_tables")


 class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
@@ -15,7 +15,7 @@ from llama_stack.core.server.auth_providers import create_auth_provider
 from llama_stack.core.server.routes import find_matching_route, initialize_route_impls
 from llama_stack.log import get_logger

-logger = get_logger(name=__name__, category="auth")
+logger = get_logger(name=__name__, category="core::auth")


 class AuthenticationMiddleware:
@@ -23,7 +23,7 @@ from llama_stack.core.datatypes import (
 )
 from llama_stack.log import get_logger

-logger = get_logger(name=__name__, category="auth")
+logger = get_logger(name=__name__, category="core::auth")


 class AuthResponse(BaseModel):
@@ -15,7 +15,7 @@ from llama_stack.providers.utils.kvstore.api import KVStore
 from llama_stack.providers.utils.kvstore.config import KVStoreConfig, SqliteKVStoreConfig
 from llama_stack.providers.utils.kvstore.kvstore import kvstore_impl

-logger = get_logger(name=__name__, category="quota")
+logger = get_logger(name=__name__, category="core::server")


 class QuotaMiddleware:
@@ -9,7 +9,7 @@ import asyncio
 import functools
 import inspect
 import json
-import logging
+import logging  # allow-direct-logging
 import os
 import ssl
 import sys
@@ -28,10 +28,12 @@ from aiohttp import hdrs
 from fastapi import Body, FastAPI, HTTPException, Request, Response
 from fastapi import Path as FastapiPath
 from fastapi.exceptions import RequestValidationError
+from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, StreamingResponse
 from openai import BadRequestError
 from pydantic import BaseModel, ValidationError

+from llama_stack.apis.common.errors import ConflictError, ResourceNotFoundError
 from llama_stack.apis.common.responses import PaginatedResponse
 from llama_stack.cli.utils import add_config_distro_args, get_config_from_args
 from llama_stack.core.access_control.access_control import AccessDeniedError
@@ -39,6 +41,7 @@ from llama_stack.core.datatypes import (
     AuthenticationRequiredError,
     LoggingConfig,
     StackRunConfig,
+    process_cors_config,
 )
 from llama_stack.core.distribution import builtin_automatically_routed_apis
 from llama_stack.core.external import ExternalApiSpec, load_external_apis
@@ -81,7 +84,7 @@ from .quota import QuotaMiddleware

 REPO_ROOT = Path(__file__).parent.parent.parent.parent

-logger = get_logger(name=__name__, category="server")
+logger = get_logger(name=__name__, category="core::server")


 def warn_with_traceback(message, category, filename, lineno, file=None, line=None):
@@ -128,6 +131,10 @@ def translate_exception(exc: Exception) -> HTTPException | RequestValidationError:
                 ]
             },
         )
+    elif isinstance(exc, ConflictError):
+        return HTTPException(status_code=409, detail=str(exc))
+    elif isinstance(exc, ResourceNotFoundError):
+        return HTTPException(status_code=404, detail=str(exc))
     elif isinstance(exc, ValueError):
         return HTTPException(status_code=httpx.codes.BAD_REQUEST, detail=f"Invalid value: {str(exc)}")
     elif isinstance(exc, BadRequestError):
@@ -408,7 +415,7 @@ def main(args: argparse.Namespace | None = None):
         config_contents = yaml.safe_load(fp)
         if isinstance(config_contents, dict) and (cfg := config_contents.get("logging_config")):
             logger_config = LoggingConfig(**cfg)
-    logger = get_logger(name=__name__, category="server", config=logger_config)
+    logger = get_logger(name=__name__, category="core::server", config=logger_config)
     if args.env:
         for env_pair in args.env:
             try:
@@ -478,6 +485,12 @@ def main(args: argparse.Namespace | None = None):
             window_seconds=window_seconds,
         )

+    if config.server.cors:
+        logger.info("Enabling CORS")
+        cors_config = process_cors_config(config.server.cors)
+        if cors_config:
+            app.add_middleware(CORSMiddleware, **cors_config.model_dump())
+
     if Api.telemetry in impls:
         setup_logger(impls[Api.telemetry])
     else:
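Taken together with the datatypes change earlier in this diff, enabling `cors: true` in a run config hands the dev-mode CORSConfig to FastAPI's CORSMiddleware as keyword arguments. A small standalone sketch of what that wiring amounts to, using a throwaway FastAPI app; the keyword names come straight from CORSConfig.model_dump():

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

from llama_stack.core.datatypes import process_cors_config

app = FastAPI()

cors = process_cors_config(True)  # same call server.main() now makes
if cors:
    # keys: allow_origins, allow_origin_regex, allow_methods, allow_headers,
    # allow_credentials, expose_headers, max_age
    app.add_middleware(CORSMiddleware, **cors.model_dump())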
@@ -16,7 +16,7 @@ from llama_stack.log import get_logger
 from llama_stack.providers.utils.kvstore import KVStore, kvstore_impl
 from llama_stack.providers.utils.kvstore.config import KVStoreConfig, SqliteKVStoreConfig

-logger = get_logger(__name__, category="core")
+logger = get_logger(__name__, category="core::registry")


 class DistributionRegistry(Protocol):
@@ -10,7 +10,7 @@ from pathlib import Path
 from llama_stack.core.utils.config_dirs import DISTRIBS_BASE_DIR
 from llama_stack.log import get_logger

-logger = get_logger(name=__name__, category="config_resolution")
+logger = get_logger(name=__name__, category="core")


 DISTRO_DIR = Path(__file__).parent.parent.parent.parent / "llama_stack" / "distributions"
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-import logging
+import importlib
 import os
 import signal
 import subprocess
@@ -12,9 +12,9 @@ import sys

 from termcolor import cprint

-log = logging.getLogger(__name__)
+from llama_stack.log import get_logger

-import importlib
+log = get_logger(name=__name__, category="core")


 def formulate_run_args(image_type: str, image_name: str) -> list:
@@ -6,7 +6,6 @@

 import inspect
 import json
-import logging
 from enum import Enum
 from typing import Annotated, Any, Literal, Union, get_args, get_origin

@@ -14,7 +13,9 @@ from pydantic import BaseModel
 from pydantic.fields import FieldInfo
 from pydantic_core import PydanticUndefinedType

-log = logging.getLogger(__name__)
+from llama_stack.log import get_logger
+
+log = get_logger(name=__name__, category="core")


 def is_list_of_primitives(field_type):
@@ -28,12 +28,13 @@ distribution_spec:
     - provider_type: inline::localfs
     safety:
     - provider_type: inline::llama-guard
+    - provider_type: inline::code-scanner
     agents:
     - provider_type: inline::meta-reference
     telemetry:
     - provider_type: inline::meta-reference
     post_training:
-    - provider_type: inline::huggingface
+    - provider_type: inline::huggingface-cpu
     eval:
     - provider_type: inline::meta-reference
     datasetio:
@@ -48,6 +49,8 @@ distribution_spec:
     - provider_type: remote::tavily-search
     - provider_type: inline::rag-runtime
     - provider_type: remote::model-context-protocol
+    batches:
+    - provider_type: inline::reference
   image_type: venv
   additional_pip_packages:
   - aiosqlite
@@ -2,6 +2,7 @@ version: 2
 image_name: ci-tests
 apis:
 - agents
+- batches
 - datasetio
 - eval
 - files
@@ -134,6 +135,8 @@ providers:
     provider_type: inline::llama-guard
     config:
       excluded_categories: []
+  - provider_id: code-scanner
+    provider_type: inline::code-scanner
   agents:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -153,8 +156,8 @@ providers:
       sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/trace_store.db
      otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}
   post_training:
-  - provider_id: huggingface
-    provider_type: inline::huggingface
+  - provider_id: huggingface-cpu
+    provider_type: inline::huggingface-cpu
     config:
       checkpoint_format: huggingface
       distributed_backend: null
@@ -204,6 +207,13 @@ providers:
     provider_type: inline::rag-runtime
   - provider_id: model-context-protocol
     provider_type: remote::model-context-protocol
+  batches:
+  - provider_id: reference
+    provider_type: inline::reference
+    config:
+      kvstore:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/batches.db
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/registry.db
@@ -215,6 +225,9 @@ shields:
 - shield_id: llama-guard
   provider_id: ${env.SAFETY_MODEL:+llama-guard}
   provider_shield_id: ${env.SAFETY_MODEL:=}
+- shield_id: code-scanner
+  provider_id: ${env.CODE_SCANNER_MODEL:+code-scanner}
+  provider_shield_id: ${env.CODE_SCANNER_MODEL:=}
 vector_dbs: []
 datasets: []
 scoring_fns: []
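The shield and provider entries above lean on the stack's env-substitution forms: `${env.X:=default}` resolves to the variable or a default, while `${env.X:+value}` yields the value only when the variable is set. A rough Python sketch of that behavior as read from these configs; it is an illustration, not the stack's actual resolver, and the real implementation may treat unset versus empty variables differently.

import os
import re

_PAT = re.compile(r"\$\{env\.(\w+):(=|\+)([^}]*)\}")


def substitute(text: str) -> str:
    # ${env.X:=default} -> value of X, or the default when X is missing/empty
    # ${env.X:+value}   -> "value" when X is set, otherwise an empty string
    def repl(m: re.Match) -> str:
        var, op, arg = m.group(1), m.group(2), m.group(3)
        val = os.environ.get(var, "")
        if op == "=":
            return val if val else arg
        return arg if val else ""

    return _PAT.sub(repl, text)


# with CODE_SCANNER_MODEL unset, the shield's provider_id resolves to ""
print(substitute("${env.CODE_SCANNER_MODEL:+code-scanner}"))
print(substitute("${env.OLLAMA_URL:=http://localhost:11434}"))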
llama_stack/distributions/starter-gpu/__init__.py (new file)
@@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .starter_gpu import get_distribution_template  # noqa: F401

llama_stack/distributions/starter-gpu/build.yaml (new file)
@@ -0,0 +1,59 @@
version: 2
distribution_spec:
  description: Quick start template for running Llama Stack with several popular providers.
    This distribution is intended for GPU-enabled environments.
  providers:
    inference:
    - provider_type: remote::cerebras
    - provider_type: remote::ollama
    - provider_type: remote::vllm
    - provider_type: remote::tgi
    - provider_type: remote::fireworks
    - provider_type: remote::together
    - provider_type: remote::bedrock
    - provider_type: remote::nvidia
    - provider_type: remote::openai
    - provider_type: remote::anthropic
    - provider_type: remote::gemini
    - provider_type: remote::vertexai
    - provider_type: remote::groq
    - provider_type: remote::sambanova
    - provider_type: inline::sentence-transformers
    vector_io:
    - provider_type: inline::faiss
    - provider_type: inline::sqlite-vec
    - provider_type: inline::milvus
    - provider_type: remote::chromadb
    - provider_type: remote::pgvector
    files:
    - provider_type: inline::localfs
    safety:
    - provider_type: inline::llama-guard
    - provider_type: inline::code-scanner
    agents:
    - provider_type: inline::meta-reference
    telemetry:
    - provider_type: inline::meta-reference
    post_training:
    - provider_type: inline::torchtune-gpu
    eval:
    - provider_type: inline::meta-reference
    datasetio:
    - provider_type: remote::huggingface
    - provider_type: inline::localfs
    scoring:
    - provider_type: inline::basic
    - provider_type: inline::llm-as-judge
    - provider_type: inline::braintrust
    tool_runtime:
    - provider_type: remote::brave-search
    - provider_type: remote::tavily-search
    - provider_type: inline::rag-runtime
    - provider_type: remote::model-context-protocol
    batches:
    - provider_type: inline::reference
  image_type: venv
  additional_pip_packages:
  - aiosqlite
  - asyncpg
  - sqlalchemy[asyncio]

llama_stack/distributions/starter-gpu/run.yaml (new file)
@@ -0,0 +1,238 @@
version: 2
image_name: starter-gpu
apis:
- agents
- batches
- datasetio
- eval
- files
- inference
- post_training
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
providers:
  inference:
  - provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
    provider_type: remote::cerebras
    config:
      base_url: https://api.cerebras.ai
      api_key: ${env.CEREBRAS_API_KEY:=}
  - provider_id: ${env.OLLAMA_URL:+ollama}
    provider_type: remote::ollama
    config:
      url: ${env.OLLAMA_URL:=http://localhost:11434}
  - provider_id: ${env.VLLM_URL:+vllm}
    provider_type: remote::vllm
    config:
      url: ${env.VLLM_URL:=}
      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
      api_token: ${env.VLLM_API_TOKEN:=fake}
      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
  - provider_id: ${env.TGI_URL:+tgi}
    provider_type: remote::tgi
    config:
      url: ${env.TGI_URL:=}
  - provider_id: fireworks
    provider_type: remote::fireworks
    config:
      url: https://api.fireworks.ai/inference/v1
      api_key: ${env.FIREWORKS_API_KEY:=}
  - provider_id: together
    provider_type: remote::together
    config:
      url: https://api.together.xyz/v1
      api_key: ${env.TOGETHER_API_KEY:=}
  - provider_id: bedrock
    provider_type: remote::bedrock
  - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
    provider_type: remote::nvidia
    config:
      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
      api_key: ${env.NVIDIA_API_KEY:=}
      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
  - provider_id: openai
    provider_type: remote::openai
    config:
      api_key: ${env.OPENAI_API_KEY:=}
      base_url: ${env.OPENAI_BASE_URL:=https://api.openai.com/v1}
  - provider_id: anthropic
    provider_type: remote::anthropic
    config:
      api_key: ${env.ANTHROPIC_API_KEY:=}
  - provider_id: gemini
    provider_type: remote::gemini
    config:
      api_key: ${env.GEMINI_API_KEY:=}
  - provider_id: ${env.VERTEX_AI_PROJECT:+vertexai}
    provider_type: remote::vertexai
    config:
      project: ${env.VERTEX_AI_PROJECT:=}
      location: ${env.VERTEX_AI_LOCATION:=us-central1}
  - provider_id: groq
    provider_type: remote::groq
    config:
      url: https://api.groq.com
      api_key: ${env.GROQ_API_KEY:=}
  - provider_id: sambanova
    provider_type: remote::sambanova
    config:
      url: https://api.sambanova.ai/v1
      api_key: ${env.SAMBANOVA_API_KEY:=}
  - provider_id: sentence-transformers
    provider_type: inline::sentence-transformers
  vector_io:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      kvstore:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/faiss_store.db
  - provider_id: sqlite-vec
    provider_type: inline::sqlite-vec
    config:
      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec.db
      kvstore:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec_registry.db
  - provider_id: ${env.MILVUS_URL:+milvus}
    provider_type: inline::milvus
    config:
      db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter}/milvus.db
      kvstore:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/milvus_registry.db
  - provider_id: ${env.CHROMADB_URL:+chromadb}
    provider_type: remote::chromadb
    config:
      url: ${env.CHROMADB_URL:=}
      kvstore:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter/}/chroma_remote_registry.db
  - provider_id: ${env.PGVECTOR_DB:+pgvector}
    provider_type: remote::pgvector
    config:
      host: ${env.PGVECTOR_HOST:=localhost}
      port: ${env.PGVECTOR_PORT:=5432}
      db: ${env.PGVECTOR_DB:=}
      user: ${env.PGVECTOR_USER:=}
      password: ${env.PGVECTOR_PASSWORD:=}
      kvstore:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/pgvector_registry.db
  files:
  - provider_id: meta-reference-files
    provider_type: inline::localfs
    config:
      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
      metadata_store:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config:
      excluded_categories: []
  - provider_id: code-scanner
    provider_type: inline::code-scanner
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/agents_store.db
      responses_store:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/responses_store.db
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
      sinks: ${env.TELEMETRY_SINKS:=console,sqlite}
      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/trace_store.db
      otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}
  post_training:
  - provider_id: torchtune-gpu
    provider_type: inline::torchtune-gpu
    config:
      checkpoint_format: meta
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      kvstore:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/meta_reference_eval.db
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config:
      kvstore:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/huggingface_datasetio.db
  - provider_id: localfs
    provider_type: inline::localfs
    config:
      kvstore:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/localfs_datasetio.db
  scoring:
  - provider_id: basic
    provider_type: inline::basic
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
  - provider_id: braintrust
    provider_type: inline::braintrust
    config:
      openai_api_key: ${env.OPENAI_API_KEY:=}
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search
    config:
      api_key: ${env.BRAVE_SEARCH_API_KEY:=}
      max_results: 3
  - provider_id: tavily-search
    provider_type: remote::tavily-search
    config:
      api_key: ${env.TAVILY_SEARCH_API_KEY:=}
      max_results: 3
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
  - provider_id: model-context-protocol
    provider_type: remote::model-context-protocol
  batches:
  - provider_id: reference
    provider_type: inline::reference
    config:
      kvstore:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/batches.db
metadata_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/registry.db
inference_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/inference_store.db
models: []
shields:
- shield_id: llama-guard
  provider_id: ${env.SAFETY_MODEL:+llama-guard}
  provider_shield_id: ${env.SAFETY_MODEL:=}
- shield_id: code-scanner
  provider_id: ${env.CODE_SCANNER_MODEL:+code-scanner}
  provider_shield_id: ${env.CODE_SCANNER_MODEL:=}
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
  provider_id: tavily-search
- toolgroup_id: builtin::rag
  provider_id: rag-runtime
server:
  port: 8321
Some files were not shown because too many files have changed in this diff.