Compare commits

..

1 commit

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| github-actions[bot] | e136739a7f | Release candidate 0.2.17rc1 | 2025-08-05 01:31:58 +00:00 |
386 changed files with 22764 additions and 24955 deletions

.github/TRIAGERS.md vendored

@@ -1,2 +1,2 @@
 # This file documents Triage members in the Llama Stack community
-@franciscojavierarceo @bbrowning
+@franciscojavierarceo @leseb


@ -2,13 +2,9 @@ name: 'Run and Record Tests'
description: 'Run integration tests and handle recording/artifact upload' description: 'Run integration tests and handle recording/artifact upload'
inputs: inputs:
test-subdirs: test-types:
description: 'Comma-separated list of test subdirectories to run' description: 'JSON array of test types to run'
required: true required: true
test-pattern:
description: 'Regex pattern to pass to pytest -k'
required: false
default: ''
stack-config: stack-config:
description: 'Stack configuration to use' description: 'Stack configuration to use'
required: true required: true
@ -36,14 +32,12 @@ runs:
- name: Run Integration Tests - name: Run Integration Tests
shell: bash shell: bash
run: | run: |
uv run --no-sync ./scripts/integration-tests.sh \ ./scripts/integration-tests.sh \
--stack-config '${{ inputs.stack-config }}' \ --stack-config '${{ inputs.stack-config }}' \
--provider '${{ inputs.provider }}' \ --provider '${{ inputs.provider }}' \
--test-subdirs '${{ inputs.test-subdirs }}' \ --test-types '${{ inputs.test-types }}' \
--test-pattern '${{ inputs.test-pattern }}' \
--inference-mode '${{ inputs.inference-mode }}' \ --inference-mode '${{ inputs.inference-mode }}' \
${{ inputs.run-vision-tests == 'true' && '--run-vision-tests' || '' }} \ ${{ inputs.run-vision-tests == 'true' && '--run-vision-tests' || '' }}
| tee pytest-${{ inputs.inference-mode }}.log
- name: Commit and push recordings - name: Commit and push recordings
@ -63,10 +57,10 @@ runs:
git commit -m "Recordings update from CI" git commit -m "Recordings update from CI"
fi fi
git fetch origin ${{ github.ref_name }} git fetch origin ${{ github.event.pull_request.head.ref }}
git rebase origin/${{ github.ref_name }} git rebase origin/${{ github.event.pull_request.head.ref }}
echo "Rebased successfully" echo "Rebased successfully"
git push origin HEAD:${{ github.ref_name }} git push origin HEAD:${{ github.event.pull_request.head.ref }}
echo "Pushed successfully" echo "Pushed successfully"
else else
echo "No recording changes" echo "No recording changes"


@ -16,21 +16,19 @@ runs:
uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1 uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
with: with:
python-version: ${{ inputs.python-version }} python-version: ${{ inputs.python-version }}
activate-environment: true
version: 0.7.6 version: 0.7.6
- name: Install dependencies - name: Install dependencies
shell: bash shell: bash
run: | run: |
echo "Updating project dependencies via uv sync"
uv sync --all-groups uv sync --all-groups
uv pip install ollama faiss-cpu
echo "Installing ad-hoc dependencies"
uv pip install faiss-cpu
# Install llama-stack-client-python based on the client-version input # Install llama-stack-client-python based on the client-version input
if [ "${{ inputs.client-version }}" = "latest" ]; then if [ "${{ inputs.client-version }}" = "latest" ]; then
echo "Installing latest llama-stack-client-python from main branch" echo "Installing latest llama-stack-client-python from main branch"
uv pip install git+https://github.com/llamastack/llama-stack-client-python.git@main uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
elif [ "${{ inputs.client-version }}" = "published" ]; then elif [ "${{ inputs.client-version }}" = "published" ]; then
echo "Installing published llama-stack-client-python from PyPI" echo "Installing published llama-stack-client-python from PyPI"
uv pip install llama-stack-client uv pip install llama-stack-client
@ -39,5 +37,4 @@ runs:
exit 1 exit 1
fi fi
echo "Installed llama packages" uv pip install -e .
uv pip list | grep llama


@ -42,22 +42,7 @@ runs:
- name: Build Llama Stack - name: Build Llama Stack
shell: bash shell: bash
run: | run: |
# Install llama-stack-client-python based on the client-version input uv run llama stack build --template ci-tests --image-type venv
if [ "${{ inputs.client-version }}" = "latest" ]; then
echo "Installing latest llama-stack-client-python from main branch"
export LLAMA_STACK_CLIENT_DIR=git+https://github.com/llamastack/llama-stack-client-python.git@main
elif [ "${{ inputs.client-version }}" = "published" ]; then
echo "Installing published llama-stack-client-python from PyPI"
unset LLAMA_STACK_CLIENT_DIR
else
echo "Invalid client-version: ${{ inputs.client-version }}"
exit 1
fi
echo "Building Llama Stack"
LLAMA_STACK_DIR=. \
uv run --no-sync llama stack build --template ci-tests --image-type venv
- name: Configure git for commits - name: Configure git for commits
shell: bash shell: bash


@ -9,7 +9,6 @@ updates:
day: "saturday" day: "saturday"
commit-message: commit-message:
prefix: chore(github-deps) prefix: chore(github-deps)
- package-ecosystem: "uv" - package-ecosystem: "uv"
directory: "/" directory: "/"
schedule: schedule:
@ -20,14 +19,3 @@ updates:
- python - python
commit-message: commit-message:
prefix: chore(python-deps) prefix: chore(python-deps)
- package-ecosystem: npm
directory: "/llama_stack/ui"
schedule:
interval: "weekly"
day: "saturday"
labels:
- type/dependencies
- javascript
commit-message:
prefix: chore(ui-deps)


@ -18,6 +18,5 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a tabl
| Close stale issues and PRs | [stale_bot.yml](stale_bot.yml) | Run the Stale Bot action | | Close stale issues and PRs | [stale_bot.yml](stale_bot.yml) | Run the Stale Bot action |
| Test External Providers Installed via Module | [test-external-provider-module.yml](test-external-provider-module.yml) | Test External Provider installation via Python module | | Test External Providers Installed via Module | [test-external-provider-module.yml](test-external-provider-module.yml) | Test External Provider installation via Python module |
| Test External API and Providers | [test-external.yml](test-external.yml) | Test the External API and Provider mechanisms | | Test External API and Providers | [test-external.yml](test-external.yml) | Test the External API and Provider mechanisms |
| UI Tests | [ui-unit-tests.yml](ui-unit-tests.yml) | Run the UI test suite |
| Unit Tests | [unit-tests.yml](unit-tests.yml) | Run the unit test suite | | Unit Tests | [unit-tests.yml](unit-tests.yml) | Run the unit test suite |
| Update ReadTheDocs | [update-readthedocs.yml](update-readthedocs.yml) | Update the Llama Stack ReadTheDocs site | | Update ReadTheDocs | [update-readthedocs.yml](update-readthedocs.yml) | Update the Llama Stack ReadTheDocs site |


@ -17,7 +17,7 @@ jobs:
pull-requests: write # for peter-evans/create-pull-request to create a PR pull-requests: write # for peter-evans/create-pull-request to create a PR
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with: with:
ref: main ref: main
fetch-depth: 0 fetch-depth: 0


@ -16,22 +16,21 @@ jobs:
lint: lint:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # 5.0.0 - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
- name: Run ShellCheck on install.sh - name: Run ShellCheck on install.sh
run: shellcheck scripts/install.sh run: shellcheck scripts/install.sh
smoke-test-on-dev: smoke-test-on-dev:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies - name: Install dependencies
uses: ./.github/actions/setup-runner uses: ./.github/actions/setup-runner
- name: Build a single provider - name: Build a single provider
run: | run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync \ USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --template starter --image-type container --image-name test
llama stack build --template starter --image-type container --image-name test
- name: Run installer end-to-end - name: Run installer end-to-end
run: | run: |


@ -10,7 +10,6 @@ on:
paths: paths:
- 'distributions/**' - 'distributions/**'
- 'llama_stack/**' - 'llama_stack/**'
- '!llama_stack/ui/**'
- 'tests/integration/**' - 'tests/integration/**'
- 'uv.lock' - 'uv.lock'
- 'pyproject.toml' - 'pyproject.toml'
@ -31,7 +30,7 @@ jobs:
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies - name: Install dependencies
uses: ./.github/actions/setup-runner uses: ./.github/actions/setup-runner


@ -44,7 +44,7 @@ jobs:
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies - name: Install dependencies
uses: ./.github/actions/setup-runner uses: ./.github/actions/setup-runner


@ -10,7 +10,6 @@ on:
types: [opened, synchronize, reopened] types: [opened, synchronize, reopened]
paths: paths:
- 'llama_stack/**' - 'llama_stack/**'
- '!llama_stack/ui/**'
- 'tests/**' - 'tests/**'
- 'uv.lock' - 'uv.lock'
- 'pyproject.toml' - 'pyproject.toml'
@ -32,14 +31,6 @@ on:
description: 'Test against a specific provider' description: 'Test against a specific provider'
type: string type: string
default: 'ollama' default: 'ollama'
test-subdirs:
description: 'Comma-separated list of test subdirectories to run'
type: string
default: ''
test-pattern:
description: 'Regex pattern to pass to pytest -k'
type: string
default: ''
concurrency: concurrency:
# Skip concurrency for pushes to main - each commit should be tested independently # Skip concurrency for pushes to main - each commit should be tested independently
@ -47,8 +38,27 @@ concurrency:
cancel-in-progress: true cancel-in-progress: true
jobs: jobs:
discover-tests:
runs-on: ubuntu-latest
outputs:
test-types: ${{ steps.generate-test-types.outputs.test-types }}
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Generate test types
id: generate-test-types
run: |
# Get test directories dynamically, excluding non-test directories
# NOTE: we are excluding post_training since the tests take too long
TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" |
grep -Ev "^(__pycache__|fixtures|test_cases|recordings|post_training)$" |
sort | jq -R -s -c 'split("\n")[:-1]')
echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT
run-replay-mode-tests: run-replay-mode-tests:
needs: discover-tests
runs-on: ubuntu-latest runs-on: ubuntu-latest
name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, vision={4})', matrix.client-type, matrix.provider, matrix.python-version, matrix.client-version, matrix.run-vision-tests) }} name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, vision={4})', matrix.client-type, matrix.provider, matrix.python-version, matrix.client-version, matrix.run-vision-tests) }}
@ -65,7 +75,7 @@ jobs:
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Setup test environment - name: Setup test environment
uses: ./.github/actions/setup-test-environment uses: ./.github/actions/setup-test-environment
@ -79,8 +89,7 @@ jobs:
- name: Run tests - name: Run tests
uses: ./.github/actions/run-and-record-tests uses: ./.github/actions/run-and-record-tests
with: with:
test-subdirs: ${{ inputs.test-subdirs }} test-types: ${{ needs.discover-tests.outputs.test-types }}
test-pattern: ${{ inputs.test-pattern }}
stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }} stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }}
provider: ${{ matrix.provider }} provider: ${{ matrix.provider }}
inference-mode: 'replay' inference-mode: 'replay'


@ -9,17 +9,14 @@ on:
branches: [ main ] branches: [ main ]
paths: paths:
- 'llama_stack/**' - 'llama_stack/**'
- '!llama_stack/ui/**'
- 'tests/integration/vector_io/**' - 'tests/integration/vector_io/**'
- 'uv.lock' - 'uv.lock'
- 'pyproject.toml' - 'pyproject.toml'
- 'requirements.txt' - 'requirements.txt'
- '.github/workflows/integration-vector-io-tests.yml' # This workflow - '.github/workflows/integration-vector-io-tests.yml' # This workflow
schedule:
- cron: '0 0 * * *' # (test on python 3.13) Daily at 12 AM UTC
concurrency: concurrency:
group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }} group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true cancel-in-progress: true
jobs: jobs:
@ -28,12 +25,12 @@ jobs:
strategy: strategy:
matrix: matrix:
vector-io-provider: ["inline::faiss", "inline::sqlite-vec", "inline::milvus", "remote::chromadb", "remote::pgvector", "remote::weaviate", "remote::qdrant"] vector-io-provider: ["inline::faiss", "inline::sqlite-vec", "inline::milvus", "remote::chromadb", "remote::pgvector", "remote::weaviate", "remote::qdrant"]
python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }} python-version: ["3.12", "3.13"]
fail-fast: false # we want to run all tests regardless of failure fail-fast: false # we want to run all tests regardless of failure
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies - name: Install dependencies
uses: ./.github/actions/setup-runner uses: ./.github/actions/setup-runner
@ -144,7 +141,7 @@ jobs:
- name: Build Llama Stack - name: Build Llama Stack
run: | run: |
uv run --no-sync llama stack build --template ci-tests --image-type venv uv run llama stack build --template ci-tests --image-type venv
- name: Check Storage and Memory Available Before Tests - name: Check Storage and Memory Available Before Tests
if: ${{ always() }} if: ${{ always() }}
@ -167,10 +164,9 @@ jobs:
ENABLE_WEAVIATE: ${{ matrix.vector-io-provider == 'remote::weaviate' && 'true' || '' }} ENABLE_WEAVIATE: ${{ matrix.vector-io-provider == 'remote::weaviate' && 'true' || '' }}
WEAVIATE_CLUSTER_URL: ${{ matrix.vector-io-provider == 'remote::weaviate' && 'localhost:8080' || '' }} WEAVIATE_CLUSTER_URL: ${{ matrix.vector-io-provider == 'remote::weaviate' && 'localhost:8080' || '' }}
run: | run: |
uv run --no-sync \ uv run pytest -sv --stack-config="inference=inline::sentence-transformers,vector_io=${{ matrix.vector-io-provider }}" \
pytest -sv --stack-config="files=inline::localfs,inference=inline::sentence-transformers,vector_io=${{ matrix.vector-io-provider }}" \
tests/integration/vector_io \ tests/integration/vector_io \
--embedding-model inline::sentence-transformers/all-MiniLM-L6-v2 --embedding-model sentence-transformers/all-MiniLM-L6-v2
- name: Check Storage and Memory Available After Tests - name: Check Storage and Memory Available After Tests
if: ${{ always() }} if: ${{ always() }}


@ -20,7 +20,7 @@ jobs:
steps: steps:
- name: Checkout code - name: Checkout code
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with: with:
# For dependabot PRs, we need to checkout with a token that can push changes # For dependabot PRs, we need to checkout with a token that can push changes
token: ${{ github.actor == 'dependabot[bot]' && secrets.GITHUB_TOKEN || github.token }} token: ${{ github.actor == 'dependabot[bot]' && secrets.GITHUB_TOKEN || github.token }}
@ -36,21 +36,6 @@ jobs:
**/requirements*.txt **/requirements*.txt
.pre-commit-config.yaml .pre-commit-config.yaml
# npm ci may fail -
# npm error `npm ci` can only install packages when your package.json and package-lock.json or npm-shrinkwrap.json are in sync. Please update your lock file with `npm install` before continuing.
# npm error Invalid: lock file's llama-stack-client@0.2.17 does not satisfy llama-stack-client@0.2.18
# - name: Set up Node.js
# uses: actions/setup-node@39370e3970a6d050c480ffad4ff0ed4d3fdee5af # v4.1.0
# with:
# node-version: '20'
# cache: 'npm'
# cache-dependency-path: 'llama_stack/ui/'
# - name: Install npm dependencies
# run: npm ci
# working-directory: llama_stack/ui
- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
continue-on-error: true continue-on-error: true
env: env:


@ -36,7 +36,7 @@ jobs:
distros: ${{ steps.set-matrix.outputs.distros }} distros: ${{ steps.set-matrix.outputs.distros }}
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Generate Distribution List - name: Generate Distribution List
id: set-matrix id: set-matrix
@ -55,7 +55,7 @@ jobs:
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies - name: Install dependencies
uses: ./.github/actions/setup-runner uses: ./.github/actions/setup-runner
@ -79,7 +79,7 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies - name: Install dependencies
uses: ./.github/actions/setup-runner uses: ./.github/actions/setup-runner
@ -92,7 +92,7 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies - name: Install dependencies
uses: ./.github/actions/setup-runner uses: ./.github/actions/setup-runner
@ -117,7 +117,7 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies - name: Install dependencies
uses: ./.github/actions/setup-runner uses: ./.github/actions/setup-runner


@ -9,8 +9,6 @@ on:
pull_request: pull_request:
branches: branches:
- main - main
paths-ignore:
- 'llama_stack/ui/**'
jobs: jobs:
build: build:
@ -21,10 +19,10 @@ jobs:
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install uv - name: Install uv
uses: astral-sh/setup-uv@d9e0f98d3fc6adb07d1e3d37f3043649ddad06a1 # v6.5.0 uses: astral-sh/setup-uv@e92bafb6253dcd438e0484186d7669ea7a8ca1cc # v6.4.3
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
activate-environment: true activate-environment: true


@ -1,53 +1,93 @@
# This workflow should be run manually when needing to re-record tests. This happens when you have
# - added a new test
# - or changed an existing test such that a new inference call is made
# You should make a PR and then run this workflow on that PR branch. The workflow will re-record the
# tests and commit the recordings to the PR branch.
name: Integration Tests (Record) name: Integration Tests (Record)
run-name: Run the integration test suite from tests/integration run-name: Run the integration test suite from tests/integration
on: on:
pull_request:
branches: [ main ]
types: [opened, synchronize, labeled]
paths:
- 'llama_stack/**'
- 'tests/**'
- 'uv.lock'
- 'pyproject.toml'
- '.github/workflows/record-integration-tests.yml' # This workflow
- '.github/actions/setup-ollama/action.yml'
- '.github/actions/setup-test-environment/action.yml'
- '.github/actions/run-and-record-tests/action.yml'
workflow_dispatch: workflow_dispatch:
inputs: inputs:
test-subdirs:
description: 'Comma-separated list of test subdirectories to run'
type: string
default: ''
test-provider: test-provider:
description: 'Test against a specific provider' description: 'Test against a specific provider'
type: string type: string
default: 'ollama' default: 'ollama'
run-vision-tests:
description: 'Whether to run vision tests' concurrency:
type: boolean group: ${{ github.workflow }}-${{ github.ref }}
default: false cancel-in-progress: true
test-pattern:
description: 'Regex pattern to pass to pytest -k'
type: string
default: ''
jobs: jobs:
discover-tests:
if: contains(github.event.pull_request.labels.*.name, 're-record-tests') ||
contains(github.event.pull_request.labels.*.name, 're-record-vision-tests')
runs-on: ubuntu-latest
outputs:
test-types: ${{ steps.generate-test-types.outputs.test-types }}
matrix-modes: ${{ steps.generate-test-types.outputs.matrix-modes }}
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Generate test types
id: generate-test-types
run: |
# Get test directories dynamically, excluding non-test directories
TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" |
grep -Ev "^(__pycache__|fixtures|test_cases|recordings|post_training)$" |
sort | jq -R -s -c 'split("\n")[:-1]')
echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT
labels=$(gh pr view ${{ github.event.pull_request.number }} --json labels --jq '.labels[].name')
echo "labels=$labels"
modes_array=()
if [[ $labels == *"re-record-vision-tests"* ]]; then
modes_array+=("vision")
fi
if [[ $labels == *"re-record-tests"* ]]; then
modes_array+=("non-vision")
fi
# Convert to JSON array
if [ ${#modes_array[@]} -eq 0 ]; then
matrix_modes="[]"
else
matrix_modes=$(printf '%s\n' "${modes_array[@]}" | jq -R -s -c 'split("\n")[:-1]')
fi
echo "matrix_modes=$matrix_modes"
echo "matrix-modes=$matrix_modes" >> $GITHUB_OUTPUT
env:
GH_TOKEN: ${{ github.token }}
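The label handling above can be exercised locally, as in this minimal sketch; the label string is a made-up example and everything else mirrors the step verbatim.

```bash
# Hypothetical PR label set; with both re-record labels present the
# matrix expands to two modes.
labels="re-record-tests re-record-vision-tests"

modes_array=()
if [[ $labels == *"re-record-vision-tests"* ]]; then modes_array+=("vision"); fi
if [[ $labels == *"re-record-tests"* ]]; then modes_array+=("non-vision"); fi

if [ ${#modes_array[@]} -eq 0 ]; then
  matrix_modes="[]"
else
  matrix_modes=$(printf '%s\n' "${modes_array[@]}" | jq -R -s -c 'split("\n")[:-1]')
fi

echo "$matrix_modes"   # prints ["vision","non-vision"]
```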
record-tests: record-tests:
needs: discover-tests
runs-on: ubuntu-latest runs-on: ubuntu-latest
permissions: permissions:
contents: write contents: write
steps: strategy:
- name: Echo workflow inputs fail-fast: false
run: | matrix:
echo "::group::Workflow Inputs" mode: ${{ fromJSON(needs.discover-tests.outputs.matrix-modes) }}
echo "test-subdirs: ${{ inputs.test-subdirs }}"
echo "test-provider: ${{ inputs.test-provider }}"
echo "run-vision-tests: ${{ inputs.run-vision-tests }}"
echo "test-pattern: ${{ inputs.test-pattern }}"
echo "branch: ${{ github.ref_name }}"
echo "::endgroup::"
steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with: with:
ref: ${{ github.event.pull_request.head.ref }}
fetch-depth: 0 fetch-depth: 0
- name: Setup test environment - name: Setup test environment
@ -56,15 +96,14 @@ jobs:
python-version: "3.12" # Use single Python version for recording python-version: "3.12" # Use single Python version for recording
client-version: "latest" client-version: "latest"
provider: ${{ inputs.test-provider || 'ollama' }} provider: ${{ inputs.test-provider || 'ollama' }}
run-vision-tests: ${{ inputs.run-vision-tests }} run-vision-tests: ${{ matrix.mode == 'vision' && 'true' || 'false' }}
inference-mode: 'record' inference-mode: 'record'
- name: Run and record tests - name: Run and record tests
uses: ./.github/actions/run-and-record-tests uses: ./.github/actions/run-and-record-tests
with: with:
test-pattern: ${{ inputs.test-pattern }} test-types: ${{ needs.discover-tests.outputs.test-types }}
test-subdirs: ${{ inputs.test-subdirs }}
stack-config: 'server:ci-tests' # recording must be done with server since more tests are run stack-config: 'server:ci-tests' # recording must be done with server since more tests are run
provider: ${{ inputs.test-provider || 'ollama' }} provider: ${{ inputs.test-provider || 'ollama' }}
inference-mode: 'record' inference-mode: 'record'
run-vision-tests: ${{ inputs.run-vision-tests }} run-vision-tests: ${{ matrix.mode == 'vision' && 'true' || 'false' }}


@ -11,7 +11,7 @@ on:
- synchronize - synchronize
concurrency: concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number }} group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true cancel-in-progress: true
permissions: permissions:
@ -22,6 +22,6 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Check PR Title's semantic conformance - name: Check PR Title's semantic conformance
uses: amannn/action-semantic-pull-request@7f33ba792281b034f64e96f4c0b5496782dd3b37 # v6.1.0 uses: amannn/action-semantic-pull-request@0723387faaf9b38adef4775cd42cfd5155ed6017 # v5.5.3
env: env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}


@ -27,7 +27,7 @@ jobs:
# container and point 'uv pip install' to the correct path... # container and point 'uv pip install' to the correct path...
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies - name: Install dependencies
uses: ./.github/actions/setup-runner uses: ./.github/actions/setup-runner


@ -9,7 +9,6 @@ on:
branches: [ main ] branches: [ main ]
paths: paths:
- 'llama_stack/**' - 'llama_stack/**'
- '!llama_stack/ui/**'
- 'tests/integration/**' - 'tests/integration/**'
- 'uv.lock' - 'uv.lock'
- 'pyproject.toml' - 'pyproject.toml'
@ -27,7 +26,7 @@ jobs:
# container and point 'uv pip install' to the correct path... # container and point 'uv pip install' to the correct path...
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies - name: Install dependencies
uses: ./.github/actions/setup-runner uses: ./.github/actions/setup-runner
@ -44,11 +43,11 @@ jobs:
- name: Print distro dependencies - name: Print distro dependencies
run: | run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml --print-deps-only USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external/build.yaml --print-deps-only
- name: Build distro from config file - name: Build distro from config file
run: | run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external/build.yaml
- name: Start Llama Stack server in background - name: Start Llama Stack server in background
if: ${{ matrix.image-type }} == 'venv' if: ${{ matrix.image-type }} == 'venv'


@ -1,55 +0,0 @@
name: UI Tests
run-name: Run the UI test suite
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
paths:
- 'llama_stack/ui/**'
- '.github/workflows/ui-unit-tests.yml' # This workflow
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
ui-tests:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
node-version: [22]
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Setup Node.js
uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
with:
node-version: ${{ matrix.node-version }}
cache: 'npm'
cache-dependency-path: 'llama_stack/ui/package-lock.json'
- name: Install dependencies
working-directory: llama_stack/ui
run: npm ci
- name: Run linting
working-directory: llama_stack/ui
run: npm run lint
- name: Run format check
working-directory: llama_stack/ui
run: npm run format:check
- name: Run unit tests
working-directory: llama_stack/ui
env:
CI: true
run: npm test -- --coverage --watchAll=false --passWithNoTests


@ -9,7 +9,6 @@ on:
branches: [ main ] branches: [ main ]
paths: paths:
- 'llama_stack/**' - 'llama_stack/**'
- '!llama_stack/ui/**'
- 'tests/unit/**' - 'tests/unit/**'
- 'uv.lock' - 'uv.lock'
- 'pyproject.toml' - 'pyproject.toml'
@ -32,7 +31,7 @@ jobs:
- "3.13" - "3.13"
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies - name: Install dependencies
uses: ./.github/actions/setup-runner uses: ./.github/actions/setup-runner


@ -37,7 +37,7 @@ jobs:
TOKEN: ${{ secrets.READTHEDOCS_TOKEN }} TOKEN: ${{ secrets.READTHEDOCS_TOKEN }}
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies - name: Install dependencies
uses: ./.github/actions/setup-runner uses: ./.github/actions/setup-runner


@ -2,7 +2,6 @@ exclude: 'build/'
default_language_version: default_language_version:
python: python3.12 python: python3.12
node: "22"
repos: repos:
- repo: https://github.com/pre-commit/pre-commit-hooks - repo: https://github.com/pre-commit/pre-commit-hooks
@ -146,50 +145,6 @@ repos:
pass_filenames: false pass_filenames: false
require_serial: true require_serial: true
files: ^.github/workflows/.*$ files: ^.github/workflows/.*$
# ui-prettier and ui-eslint are disabled until we can avoid `npm ci`, which is slow and may fail -
# npm error `npm ci` can only install packages when your package.json and package-lock.json or npm-shrinkwrap.json are in sync. Please update your lock file with `npm install` before continuing.
# npm error Invalid: lock file's llama-stack-client@0.2.17 does not satisfy llama-stack-client@0.2.18
# and until we have infra for installing prettier and next via npm -
# Lint UI code with ESLint.....................................................Failed
# - hook id: ui-eslint
# - exit code: 127
# > ui@0.1.0 lint
# > next lint --fix --quiet
# sh: line 1: next: command not found
#
# - id: ui-prettier
# name: Format UI code with Prettier
# entry: bash -c 'cd llama_stack/ui && npm ci && npm run format'
# language: system
# files: ^llama_stack/ui/.*\.(ts|tsx)$
# pass_filenames: false
# require_serial: true
# - id: ui-eslint
# name: Lint UI code with ESLint
# entry: bash -c 'cd llama_stack/ui && npm run lint -- --fix --quiet'
# language: system
# files: ^llama_stack/ui/.*\.(ts|tsx)$
# pass_filenames: false
# require_serial: true
- id: check-log-usage
name: Ensure 'llama_stack.log' usage for logging
entry: bash
language: system
types: [python]
pass_filenames: true
args:
- -c
- |
matches=$(grep -EnH '^[^#]*\b(import\s+logging|from\s+logging\b)' "$@" | grep -v -e '#\s*allow-direct-logging' || true)
if [ -n "$matches" ]; then
# GitHub Actions annotation format
while IFS=: read -r file line_num rest; do
echo "::error file=$file,line=$line_num::Do not use 'import logging' or 'from logging import' in $file. Use the custom log instead: from llama_stack.log import get_logger; logger = get_logger(). If direct logging is truly needed, add: # allow-direct-logging"
done <<< "$matches"
exit 1
fi
exit 0
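For contributors, a minimal sketch of how to exercise this hook outside a commit; the hook id comes from the config above, and the Python file path is a placeholder.

```bash
# Run only the log-usage hook across the repository (pre-commit is available
# via uv, as described in CONTRIBUTING.md).
uv run pre-commit run check-log-usage --all-files

# Or reproduce the underlying grep check against a single (hypothetical) file.
matches=$(grep -EnH '^[^#]*\b(import\s+logging|from\s+logging\b)' llama_stack/example.py \
  | grep -v -e '#\s*allow-direct-logging' || true)
if [ -z "$matches" ]; then echo "OK"; else echo "$matches"; fi
```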
ci: ci:
autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks


@ -1,82 +1,13 @@
# Contributing to Llama Stack # Contributing to Llama-Stack
We want to make contributing to this project as easy and transparent as We want to make contributing to this project as easy and transparent as
possible. possible.
## Set up your development environment
We use [uv](https://github.com/astral-sh/uv) to manage python dependencies and virtual environments.
You can install `uv` by following this [guide](https://docs.astral.sh/uv/getting-started/installation/).
You can install the dependencies by running:
```bash
cd llama-stack
uv sync --group dev
uv pip install -e .
source .venv/bin/activate
```
```{note}
You can use a specific version of Python with `uv` by adding the `--python <version>` flag (e.g. `--python 3.12`).
Otherwise, `uv` will automatically select a Python version according to the `requires-python` section of the `pyproject.toml`.
For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/).
```
Note that you can create a dotenv file `.env` that includes necessary environment variables:
```
LLAMA_STACK_BASE_URL=http://localhost:8321
LLAMA_STACK_CLIENT_LOG=debug
LLAMA_STACK_PORT=8321
LLAMA_STACK_CONFIG=<provider-name>
TAVILY_SEARCH_API_KEY=
BRAVE_SEARCH_API_KEY=
```
And then use this dotenv file when running client SDK tests via the following:
```bash
uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py --text-model=meta-llama/Llama-3.1-8B-Instruct
```
### Pre-commit Hooks
We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running:
```bash
uv run pre-commit install
```
After that, pre-commit hooks will run automatically before each commit.
Alternatively, if you don't want to install the pre-commit hooks, you can run the checks manually by running:
```bash
uv run pre-commit run --all-files
```
```{caution}
Before pushing your changes, make sure that the pre-commit hooks have passed successfully.
```
## Discussions -> Issues -> Pull Requests ## Discussions -> Issues -> Pull Requests
We actively welcome your pull requests. However, please read the following. This is heavily inspired by [Ghostty](https://github.com/ghostty-org/ghostty/blob/main/CONTRIBUTING.md). We actively welcome your pull requests. However, please read the following. This is heavily inspired by [Ghostty](https://github.com/ghostty-org/ghostty/blob/main/CONTRIBUTING.md).
If in doubt, please open a [discussion](https://github.com/meta-llama/llama-stack/discussions); we can always convert that to an issue later. If in doubt, please open a [discussion](https://github.com/meta-llama/llama-stack/discussions); we can always convert that to an issue later.
### Issues
We use GitHub issues to track public bugs. Please ensure your description is
clear and has sufficient instructions to be able to reproduce the issue.
Meta has a [bounty program](http://facebook.com/whitehat/info) for the safe
disclosure of security bugs. In those cases, please go through the process
outlined on that page and do not file a public issue.
### Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need
to do this once to work on any of Meta's open source projects.
Complete your CLA here: <https://code.facebook.com/cla>
**I'd like to contribute!** **I'd like to contribute!**
If you are new to the project, start by looking at the issues tagged with "good first issue". If you're interested If you are new to the project, start by looking at the issues tagged with "good first issue". If you're interested
@ -120,15 +51,93 @@ Please avoid picking up too many issues at once. This helps you stay focused and
Please keep pull requests (PRs) small and focused. If you have a large set of changes, consider splitting them into logically grouped, smaller PRs to facilitate review and testing. Please keep pull requests (PRs) small and focused. If you have a large set of changes, consider splitting them into logically grouped, smaller PRs to facilitate review and testing.
```{tip} > [!TIP]
As a general guideline: > As a general guideline:
- Experienced contributors should try to keep no more than 5 open PRs at a time. > - Experienced contributors should try to keep no more than 5 open PRs at a time.
- New contributors are encouraged to have only one open PR at a time until theyre familiar with the codebase and process. > - New contributors are encouraged to have only one open PR at a time until theyre familiar with the codebase and process.
## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need
to do this once to work on any of Meta's open source projects.
Complete your CLA here: <https://code.facebook.com/cla>
## Issues
We use GitHub issues to track public bugs. Please ensure your description is
clear and has sufficient instructions to be able to reproduce the issue.
Meta has a [bounty program](http://facebook.com/whitehat/info) for the safe
disclosure of security bugs. In those cases, please go through the process
outlined on that page and do not file a public issue.
## Set up your development environment
We use [uv](https://github.com/astral-sh/uv) to manage python dependencies and virtual environments.
You can install `uv` by following this [guide](https://docs.astral.sh/uv/getting-started/installation/).
You can install the dependencies by running:
```bash
cd llama-stack
uv sync --group dev
uv pip install -e .
source .venv/bin/activate
``` ```
## Repository guidelines > [!NOTE]
> You can use a specific version of Python with `uv` by adding the `--python <version>` flag (e.g. `--python 3.12`)
> Otherwise, `uv` will automatically select a Python version according to the `requires-python` section of the `pyproject.toml`.
> For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/).
### Coding Style Note that you can create a dotenv file `.env` that includes necessary environment variables:
```
LLAMA_STACK_BASE_URL=http://localhost:8321
LLAMA_STACK_CLIENT_LOG=debug
LLAMA_STACK_PORT=8321
LLAMA_STACK_CONFIG=<provider-name>
TAVILY_SEARCH_API_KEY=
BRAVE_SEARCH_API_KEY=
```
And then use this dotenv file when running client SDK tests via the following:
```bash
uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py --text-model=meta-llama/Llama-3.1-8B-Instruct
```
## Pre-commit Hooks
We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running:
```bash
uv run pre-commit install
```
After that, pre-commit hooks will run automatically before each commit.
Alternatively, if you don't want to install the pre-commit hooks, you can run the checks manually by running:
```bash
uv run pre-commit run --all-files
```
> [!CAUTION]
> Before pushing your changes, make sure that the pre-commit hooks have passed successfully.
## Running tests
You can find the Llama Stack testing documentation [here](https://github.com/meta-llama/llama-stack/blob/main/tests/README.md).
## Adding a new dependency to the project
To add a new dependency to the project, you can use the `uv` command. For example, to add `foo` to the project, you can run:
```bash
uv add foo
uv sync
```
## Coding Style
* Comments should provide meaningful insights into the code. Avoid filler comments that simply * Comments should provide meaningful insights into the code. Avoid filler comments that simply
describe the next step, as they create unnecessary clutter, same goes for docstrings. describe the next step, as they create unnecessary clutter, same goes for docstrings.
@ -148,11 +157,6 @@ As a general guideline:
that describes the configuration. These descriptions will be used to generate the provider that describes the configuration. These descriptions will be used to generate the provider
documentation. documentation.
* When possible, use keyword arguments only when calling functions. * When possible, use keyword arguments only when calling functions.
* Llama Stack utilizes [custom Exception classes](llama_stack/apis/common/errors.py) for certain Resources that should be used where applicable.
### License
By contributing to Llama, you agree that your contributions will be licensed
under the LICENSE file in the root directory of this source tree.
## Common Tasks ## Common Tasks
@ -206,3 +210,7 @@ uv run ./docs/openapi_generator/run_openapi_generator.sh
``` ```
The generated API documentation will be available in `docs/_static/`. Make sure to review the changes before committing. The generated API documentation will be available in `docs/_static/`. Make sure to review the changes before committing.
## License
By contributing to Llama, you agree that your contributions will be licensed
under the LICENSE file in the root directory of this source tree.


@ -9,7 +9,6 @@
[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack) [**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)
### ✨🎉 Llama 4 Support 🎉✨ ### ✨🎉 Llama 4 Support 🎉✨
We released [Version 0.2.0](https://github.com/meta-llama/llama-stack/releases/tag/v0.2.0) with support for the Llama 4 herd of models released by Meta. We released [Version 0.2.0](https://github.com/meta-llama/llama-stack/releases/tag/v0.2.0) with support for the Llama 4 herd of models released by Meta.
@ -180,17 +179,3 @@ Please checkout our [Documentation](https://llama-stack.readthedocs.io/en/latest
Check out our client SDKs for connecting to a Llama Stack server in your preferred language, you can choose from [python](https://github.com/meta-llama/llama-stack-client-python), [typescript](https://github.com/meta-llama/llama-stack-client-typescript), [swift](https://github.com/meta-llama/llama-stack-client-swift), and [kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) programming languages to quickly build your applications. Check out our client SDKs for connecting to a Llama Stack server in your preferred language, you can choose from [python](https://github.com/meta-llama/llama-stack-client-python), [typescript](https://github.com/meta-llama/llama-stack-client-typescript), [swift](https://github.com/meta-llama/llama-stack-client-swift), and [kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) programming languages to quickly build your applications.
You can find more example scripts with client SDKs to talk with the Llama Stack server in our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repo. You can find more example scripts with client SDKs to talk with the Llama Stack server in our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repo.
## 🌟 GitHub Star History
## Star History
[![Star History Chart](https://api.star-history.com/svg?repos=meta-llama/llama-stack&type=Date)](https://www.star-history.com/#meta-llama/llama-stack&Date)
## ✨ Contributors
Thanks to all of our amazing contributors!
<a href="https://github.com/meta-llama/llama-stack/graphs/contributors">
<img src="https://contrib.rocks/image?repo=meta-llama/llama-stack" />
</a>


@ -1,14 +0,0 @@
document.addEventListener('keydown', function(event) {
// command+K or ctrl+K
if ((event.metaKey || event.ctrlKey) && event.key === 'k') {
event.preventDefault();
document.querySelector('.search-input, .search-field, input[name="q"]').focus();
}
// forward slash
if (event.key === '/' &&
!event.target.matches('input, textarea, select')) {
event.preventDefault();
document.querySelector('.search-input, .search-field, input[name="q"]').focus();
}
});


@ -1452,40 +1452,6 @@
} }
} }
] ]
},
"delete": {
"responses": {
"200": {
"description": "OK"
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Shields"
],
"description": "Unregister a shield.",
"parameters": [
{
"name": "identifier",
"in": "path",
"description": "The identifier of the shield to unregister.",
"required": true,
"schema": {
"type": "string"
}
}
]
} }
}, },
"/v1/telemetry/traces/{trace_id}/spans/{span_id}": { "/v1/telemetry/traces/{trace_id}/spans/{span_id}": {
@ -4734,49 +4700,6 @@
} }
} }
}, },
"/v1/openai/v1/moderations": {
"post": {
"responses": {
"200": {
"description": "A moderation object.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ModerationObject"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Safety"
],
"description": "Classifies if text and/or image inputs are potentially harmful.",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/RunModerationRequest"
}
}
},
"required": true
}
}
},
"/v1/safety/run-shield": { "/v1/safety/run-shield": {
"post": { "post": {
"responses": { "responses": {
@ -8293,60 +8216,28 @@
"type": "array", "type": "array",
"items": { "items": {
"type": "object", "type": "object",
"properties": { "additionalProperties": {
"attributes": { "oneOf": [
"type": "object", {
"additionalProperties": { "type": "null"
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}, },
"description": "(Optional) Key-value attributes associated with the file" {
}, "type": "boolean"
"file_id": { },
"type": "string", {
"description": "Unique identifier of the file containing the result" "type": "number"
}, },
"filename": { {
"type": "string", "type": "string"
"description": "Name of the file containing the result" },
}, {
"score": { "type": "array"
"type": "number", },
"description": "Relevance score for this search result (between 0 and 1)" {
}, "type": "object"
"text": { }
"type": "string", ]
"description": "Text content of the search result" }
}
},
"additionalProperties": false,
"required": [
"attributes",
"file_id",
"filename",
"score",
"text"
],
"title": "OpenAIResponseOutputMessageFileSearchToolCallResults",
"description": "Search results returned by the file search operation."
}, },
"description": "(Optional) Search results returned by the file search operation" "description": "(Optional) Search results returned by the file search operation"
} }
@ -8547,13 +8438,6 @@
"$ref": "#/components/schemas/OpenAIResponseInputTool" "$ref": "#/components/schemas/OpenAIResponseInputTool"
} }
}, },
"include": {
"type": "array",
"items": {
"type": "string"
},
"description": "(Optional) Additional fields to include in the response."
},
"max_infer_iters": { "max_infer_iters": {
"type": "integer" "type": "integer"
} }
@ -8821,61 +8705,6 @@
"title": "OpenAIResponseOutputMessageMCPListTools", "title": "OpenAIResponseOutputMessageMCPListTools",
"description": "MCP list tools output message containing available tools from an MCP server." "description": "MCP list tools output message containing available tools from an MCP server."
}, },
"OpenAIResponseContentPart": {
"oneOf": [
{
"$ref": "#/components/schemas/OpenAIResponseContentPartOutputText"
},
{
"$ref": "#/components/schemas/OpenAIResponseContentPartRefusal"
}
],
"discriminator": {
"propertyName": "type",
"mapping": {
"output_text": "#/components/schemas/OpenAIResponseContentPartOutputText",
"refusal": "#/components/schemas/OpenAIResponseContentPartRefusal"
}
}
},
"OpenAIResponseContentPartOutputText": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "output_text",
"default": "output_text"
},
"text": {
"type": "string"
}
},
"additionalProperties": false,
"required": [
"type",
"text"
],
"title": "OpenAIResponseContentPartOutputText"
},
"OpenAIResponseContentPartRefusal": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "refusal",
"default": "refusal"
},
"refusal": {
"type": "string"
}
},
"additionalProperties": false,
"required": [
"type",
"refusal"
],
"title": "OpenAIResponseContentPartRefusal"
},
"OpenAIResponseObjectStream": { "OpenAIResponseObjectStream": {
"oneOf": [ "oneOf": [
{ {
@ -8932,12 +8761,6 @@
{ {
"$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted" "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted"
}, },
{
"$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseContentPartAdded"
},
{
"$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseContentPartDone"
},
{ {
"$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted" "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted"
} }
@ -8963,8 +8786,6 @@
"response.mcp_call.in_progress": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress", "response.mcp_call.in_progress": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress",
"response.mcp_call.failed": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed", "response.mcp_call.failed": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed",
"response.mcp_call.completed": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted", "response.mcp_call.completed": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted",
"response.content_part.added": "#/components/schemas/OpenAIResponseObjectStreamResponseContentPartAdded",
"response.content_part.done": "#/components/schemas/OpenAIResponseObjectStreamResponseContentPartDone",
"response.completed": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted" "response.completed": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted"
} }
} }
@ -8991,80 +8812,6 @@
"title": "OpenAIResponseObjectStreamResponseCompleted", "title": "OpenAIResponseObjectStreamResponseCompleted",
"description": "Streaming event indicating a response has been completed." "description": "Streaming event indicating a response has been completed."
}, },
"OpenAIResponseObjectStreamResponseContentPartAdded": {
"type": "object",
"properties": {
"response_id": {
"type": "string",
"description": "Unique identifier of the response containing this content"
},
"item_id": {
"type": "string",
"description": "Unique identifier of the output item containing this content part"
},
"part": {
"$ref": "#/components/schemas/OpenAIResponseContentPart",
"description": "The content part that was added"
},
"sequence_number": {
"type": "integer",
"description": "Sequential number for ordering streaming events"
},
"type": {
"type": "string",
"const": "response.content_part.added",
"default": "response.content_part.added",
"description": "Event type identifier, always \"response.content_part.added\""
}
},
"additionalProperties": false,
"required": [
"response_id",
"item_id",
"part",
"sequence_number",
"type"
],
"title": "OpenAIResponseObjectStreamResponseContentPartAdded",
"description": "Streaming event for when a new content part is added to a response item."
},
"OpenAIResponseObjectStreamResponseContentPartDone": {
"type": "object",
"properties": {
"response_id": {
"type": "string",
"description": "Unique identifier of the response containing this content"
},
"item_id": {
"type": "string",
"description": "Unique identifier of the output item containing this content part"
},
"part": {
"$ref": "#/components/schemas/OpenAIResponseContentPart",
"description": "The completed content part"
},
"sequence_number": {
"type": "integer",
"description": "Sequential number for ordering streaming events"
},
"type": {
"type": "string",
"const": "response.content_part.done",
"default": "response.content_part.done",
"description": "Event type identifier, always \"response.content_part.done\""
}
},
"additionalProperties": false,
"required": [
"response_id",
"item_id",
"part",
"sequence_number",
"type"
],
"title": "OpenAIResponseObjectStreamResponseContentPartDone",
"description": "Streaming event for when a content part is completed."
},
"OpenAIResponseObjectStreamResponseCreated": { "OpenAIResponseObjectStreamResponseCreated": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -14767,8 +14514,7 @@
"OpenAIFilePurpose": { "OpenAIFilePurpose": {
"type": "string", "type": "string",
"enum": [ "enum": [
"assistants", "assistants"
"batch"
], ],
"title": "OpenAIFilePurpose", "title": "OpenAIFilePurpose",
"description": "Valid purpose values for OpenAI Files API." "description": "Valid purpose values for OpenAI Files API."
@ -14845,8 +14591,7 @@
"purpose": { "purpose": {
"type": "string", "type": "string",
"enum": [ "enum": [
"assistants", "assistants"
"batch"
], ],
"description": "The intended purpose of the file" "description": "The intended purpose of the file"
} }
@ -16622,131 +16367,6 @@
], ],
"title": "RunEvalRequest" "title": "RunEvalRequest"
}, },
"RunModerationRequest": {
"type": "object",
"properties": {
"input": {
"oneOf": [
{
"type": "string"
},
{
"type": "array",
"items": {
"type": "string"
}
}
],
"description": "Input (or inputs) to classify. Can be a single string, an array of strings, or an array of multi-modal input objects similar to other models."
},
"model": {
"type": "string",
"description": "The content moderation model you would like to use."
}
},
"additionalProperties": false,
"required": [
"input",
"model"
],
"title": "RunModerationRequest"
},
"ModerationObject": {
"type": "object",
"properties": {
"id": {
"type": "string",
"description": "The unique identifier for the moderation request."
},
"model": {
"type": "string",
"description": "The model used to generate the moderation results."
},
"results": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ModerationObjectResults"
},
"description": "A list of moderation objects"
}
},
"additionalProperties": false,
"required": [
"id",
"model",
"results"
],
"title": "ModerationObject",
"description": "A moderation object."
},
"ModerationObjectResults": {
"type": "object",
"properties": {
"flagged": {
"type": "boolean",
"description": "Whether any of the below categories are flagged."
},
"categories": {
"type": "object",
"additionalProperties": {
"type": "boolean"
},
"description": "A list of the categories, and whether they are flagged or not."
},
"category_applied_input_types": {
"type": "object",
"additionalProperties": {
"type": "array",
"items": {
"type": "string"
}
},
"description": "A list of the categories along with the input type(s) that the score applies to."
},
"category_scores": {
"type": "object",
"additionalProperties": {
"type": "number"
},
"description": "A list of the categories along with their scores as predicted by model."
},
"user_message": {
"type": "string"
},
"metadata": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
}
},
"additionalProperties": false,
"required": [
"flagged",
"metadata"
],
"title": "ModerationObjectResults",
"description": "A moderation object."
},
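To make the RunModerationRequest and ModerationObject schemas above concrete, a hedged sketch of a call to the endpoint defined in this spec; the base URL reuses the LLAMA_STACK_PORT default (8321) mentioned elsewhere in this compare, and the model name and response values are invented.

```bash
# POST a moderation request (path taken from the spec in this compare).
curl -s http://localhost:8321/v1/openai/v1/moderations \
  -H 'Content-Type: application/json' \
  -d '{"input": "text to classify", "model": "example-moderation-model"}'

# A body conforming to ModerationObject / ModerationObjectResults could look like:
# {"id": "modr-123", "model": "example-moderation-model",
#  "results": [{"flagged": false, "categories": {}, "category_scores": {},
#               "category_applied_input_types": {}, "metadata": {}}]}
```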
"RunShieldRequest": { "RunShieldRequest": {
"type": "object", "type": "object",
"properties": { "properties": {


@ -999,31 +999,6 @@ paths:
required: true required: true
schema: schema:
type: string type: string
delete:
responses:
'200':
description: OK
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Shields
description: Unregister a shield.
parameters:
- name: identifier
in: path
description: >-
The identifier of the shield to unregister.
required: true
schema:
type: string
/v1/telemetry/traces/{trace_id}/spans/{span_id}: /v1/telemetry/traces/{trace_id}/spans/{span_id}:
get: get:
responses: responses:
@ -3358,36 +3333,6 @@ paths:
schema: schema:
$ref: '#/components/schemas/RunEvalRequest' $ref: '#/components/schemas/RunEvalRequest'
required: true required: true
/v1/openai/v1/moderations:
post:
responses:
'200':
description: A moderation object.
content:
application/json:
schema:
$ref: '#/components/schemas/ModerationObject'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Safety
description: >-
Classifies if text and/or image inputs are potentially harmful.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/RunModerationRequest'
required: true
/v1/safety/run-shield: /v1/safety/run-shield:
post: post:
responses: responses:
@ -6021,44 +5966,14 @@ components:
type: array type: array
items: items:
type: object type: object
properties: additionalProperties:
attributes: oneOf:
type: object - type: 'null'
additionalProperties: - type: boolean
oneOf: - type: number
- type: 'null' - type: string
- type: boolean - type: array
- type: number - type: object
- type: string
- type: array
- type: object
description: >-
(Optional) Key-value attributes associated with the file
file_id:
type: string
description: >-
Unique identifier of the file containing the result
filename:
type: string
description: Name of the file containing the result
score:
type: number
description: >-
Relevance score for this search result (between 0 and 1)
text:
type: string
description: Text content of the search result
additionalProperties: false
required:
- attributes
- file_id
- filename
- score
- text
title: >-
OpenAIResponseOutputMessageFileSearchToolCallResults
description: >-
Search results returned by the file search operation.
description: >- description: >-
(Optional) Search results returned by the file search operation (Optional) Search results returned by the file search operation
additionalProperties: false additionalProperties: false
@ -6218,12 +6133,6 @@ components:
type: array type: array
items: items:
$ref: '#/components/schemas/OpenAIResponseInputTool' $ref: '#/components/schemas/OpenAIResponseInputTool'
include:
type: array
items:
type: string
description: >-
(Optional) Additional fields to include in the response.
max_infer_iters: max_infer_iters:
type: integer type: integer
additionalProperties: false additionalProperties: false
@ -6441,43 +6350,6 @@ components:
title: OpenAIResponseOutputMessageMCPListTools title: OpenAIResponseOutputMessageMCPListTools
description: >- description: >-
MCP list tools output message containing available tools from an MCP server. MCP list tools output message containing available tools from an MCP server.
OpenAIResponseContentPart:
oneOf:
- $ref: '#/components/schemas/OpenAIResponseContentPartOutputText'
- $ref: '#/components/schemas/OpenAIResponseContentPartRefusal'
discriminator:
propertyName: type
mapping:
output_text: '#/components/schemas/OpenAIResponseContentPartOutputText'
refusal: '#/components/schemas/OpenAIResponseContentPartRefusal'
OpenAIResponseContentPartOutputText:
type: object
properties:
type:
type: string
const: output_text
default: output_text
text:
type: string
additionalProperties: false
required:
- type
- text
title: OpenAIResponseContentPartOutputText
OpenAIResponseContentPartRefusal:
type: object
properties:
type:
type: string
const: refusal
default: refusal
refusal:
type: string
additionalProperties: false
required:
- type
- refusal
title: OpenAIResponseContentPartRefusal
OpenAIResponseObjectStream: OpenAIResponseObjectStream:
oneOf: oneOf:
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated' - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated'
@ -6498,8 +6370,6 @@ components:
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress' - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress'
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed' - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed'
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted' - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted'
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseContentPartAdded'
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseContentPartDone'
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted' - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted'
discriminator: discriminator:
propertyName: type propertyName: type
@ -6522,8 +6392,6 @@ components:
response.mcp_call.in_progress: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress' response.mcp_call.in_progress: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress'
response.mcp_call.failed: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed' response.mcp_call.failed: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed'
response.mcp_call.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted' response.mcp_call.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted'
response.content_part.added: '#/components/schemas/OpenAIResponseObjectStreamResponseContentPartAdded'
response.content_part.done: '#/components/schemas/OpenAIResponseObjectStreamResponseContentPartDone'
response.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted' response.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted'
"OpenAIResponseObjectStreamResponseCompleted": "OpenAIResponseObjectStreamResponseCompleted":
type: object type: object
@ -6545,76 +6413,6 @@ components:
OpenAIResponseObjectStreamResponseCompleted OpenAIResponseObjectStreamResponseCompleted
description: >- description: >-
Streaming event indicating a response has been completed. Streaming event indicating a response has been completed.
"OpenAIResponseObjectStreamResponseContentPartAdded":
type: object
properties:
response_id:
type: string
description: >-
Unique identifier of the response containing this content
item_id:
type: string
description: >-
Unique identifier of the output item containing this content part
part:
$ref: '#/components/schemas/OpenAIResponseContentPart'
description: The content part that was added
sequence_number:
type: integer
description: >-
Sequential number for ordering streaming events
type:
type: string
const: response.content_part.added
default: response.content_part.added
description: >-
Event type identifier, always "response.content_part.added"
additionalProperties: false
required:
- response_id
- item_id
- part
- sequence_number
- type
title: >-
OpenAIResponseObjectStreamResponseContentPartAdded
description: >-
Streaming event for when a new content part is added to a response item.
"OpenAIResponseObjectStreamResponseContentPartDone":
type: object
properties:
response_id:
type: string
description: >-
Unique identifier of the response containing this content
item_id:
type: string
description: >-
Unique identifier of the output item containing this content part
part:
$ref: '#/components/schemas/OpenAIResponseContentPart'
description: The completed content part
sequence_number:
type: integer
description: >-
Sequential number for ordering streaming events
type:
type: string
const: response.content_part.done
default: response.content_part.done
description: >-
Event type identifier, always "response.content_part.done"
additionalProperties: false
required:
- response_id
- item_id
- part
- sequence_number
- type
title: >-
OpenAIResponseObjectStreamResponseContentPartDone
description: >-
Streaming event for when a content part is completed.
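For reference, a hedged illustration of one of these removed content-part events, expressed as a Python dict; every concrete value is invented and only the field names come from the schema above.

```python
# Hypothetical `response.content_part.added` stream event shaped per the removed schema.
content_part_added_event = {
    "type": "response.content_part.added",
    "response_id": "resp_123",   # response containing this content (illustrative id)
    "item_id": "item_456",       # output item containing this content part (illustrative id)
    "sequence_number": 3,        # ordering of streaming events
    "part": {                    # an OpenAIResponseContentPartOutputText variant
        "type": "output_text",
        "text": "Hello",
    },
}
```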
"OpenAIResponseObjectStreamResponseCreated": "OpenAIResponseObjectStreamResponseCreated":
type: object type: object
properties: properties:
@ -10951,7 +10749,6 @@ components:
type: string type: string
enum: enum:
- assistants - assistants
- batch
title: OpenAIFilePurpose title: OpenAIFilePurpose
description: >- description: >-
Valid purpose values for OpenAI Files API. Valid purpose values for OpenAI Files API.
@ -11020,7 +10817,6 @@ components:
type: string type: string
enum: enum:
- assistants - assistants
- batch
description: The intended purpose of the file description: The intended purpose of the file
additionalProperties: false additionalProperties: false
required: required:
@ -12363,96 +12159,6 @@ components:
required: required:
- benchmark_config - benchmark_config
title: RunEvalRequest title: RunEvalRequest
RunModerationRequest:
type: object
properties:
input:
oneOf:
- type: string
- type: array
items:
type: string
description: >-
Input (or inputs) to classify. Can be a single string, an array of strings,
or an array of multi-modal input objects similar to other models.
model:
type: string
description: >-
The content moderation model you would like to use.
additionalProperties: false
required:
- input
- model
title: RunModerationRequest
ModerationObject:
type: object
properties:
id:
type: string
description: >-
The unique identifier for the moderation request.
model:
type: string
description: >-
The model used to generate the moderation results.
results:
type: array
items:
$ref: '#/components/schemas/ModerationObjectResults'
description: A list of moderation objects
additionalProperties: false
required:
- id
- model
- results
title: ModerationObject
description: A moderation object.
ModerationObjectResults:
type: object
properties:
flagged:
type: boolean
description: >-
Whether any of the below categories are flagged.
categories:
type: object
additionalProperties:
type: boolean
description: >-
A list of the categories, and whether they are flagged or not.
category_applied_input_types:
type: object
additionalProperties:
type: array
items:
type: string
description: >-
A list of the categories along with the input type(s) that the score applies
to.
category_scores:
type: object
additionalProperties:
type: number
description: >-
        A list of the categories along with their scores as predicted by the model.
user_message:
type: string
metadata:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
additionalProperties: false
required:
- flagged
- metadata
title: ModerationObjectResults
description: A moderation object.
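For reference, a hedged illustration of a request/response pair matching the removed moderation schemas above, expressed as Python dicts; all concrete values are invented for illustration.

```python
# Hypothetical body for POST /v1/openai/v1/moderations (RunModerationRequest).
moderation_request = {
    "model": "llama-guard",            # illustrative moderation model id
    "input": ["Is this text safe?"],   # a single string or an array of strings
}

# Hypothetical ModerationObject returned by the endpoint.
moderation_response = {
    "id": "modr-1234",
    "model": "llama-guard",
    "results": [
        {
            "flagged": False,
            "categories": {"violence": False},
            "category_applied_input_types": {"violence": ["text"]},
            "category_scores": {"violence": 0.01},
            "metadata": {},
        }
    ],
}
```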
RunShieldRequest: RunShieldRequest:
type: object type: object
properties: properties:

View file

@ -111,7 +111,7 @@ name = "llama-stack-api-weather"
version = "0.1.0" version = "0.1.0"
description = "Weather API for Llama Stack" description = "Weather API for Llama Stack"
readme = "README.md" readme = "README.md"
requires-python = ">=3.12" requires-python = ">=3.10"
dependencies = ["llama-stack", "pydantic"] dependencies = ["llama-stack", "pydantic"]
[build-system] [build-system]
@ -231,7 +231,7 @@ name = "llama-stack-provider-kaze"
version = "0.1.0" version = "0.1.0"
description = "Kaze weather provider for Llama Stack" description = "Kaze weather provider for Llama Stack"
readme = "README.md" readme = "README.md"
requires-python = ">=3.12" requires-python = ">=3.10"
dependencies = ["llama-stack", "pydantic", "aiohttp"] dependencies = ["llama-stack", "pydantic", "aiohttp"]
[build-system] [build-system]

View file

@ -2,9 +2,7 @@
Llama Stack (LLS) provides two different APIs for building AI applications with tool calling capabilities: the **Agents API** and the **OpenAI Responses API**. While both enable AI systems to use tools, and maintain full conversation history, they serve different use cases and have distinct characteristics. Llama Stack (LLS) provides two different APIs for building AI applications with tool calling capabilities: the **Agents API** and the **OpenAI Responses API**. While both enable AI systems to use tools, and maintain full conversation history, they serve different use cases and have distinct characteristics.
```{note} > **Note:** For simple and basic inferencing, you may want to use the [Chat Completions API](https://llama-stack.readthedocs.io/en/latest/providers/index.html#chat-completions) directly, before progressing to Agents or Responses API.
For simple and basic inferencing, you may want to use the [Chat Completions API](https://llama-stack.readthedocs.io/en/latest/providers/index.html#chat-completions) directly, before progressing to Agents or Responses API.
```
## Overview ## Overview

View file

@ -76,9 +76,7 @@ Features:
- Context retrieval with token limits - Context retrieval with token limits
```{note} > **Note:** By default, llama stack run.yaml defines toolgroups for web search, wolfram alpha and rag, that are provided by tavily-search, wolfram-alpha and rag providers.
By default, llama stack run.yaml defines toolgroups for web search, wolfram alpha and rag, that are provided by tavily-search, wolfram-alpha and rag providers.
```
## Model Context Protocol (MCP) ## Model Context Protocol (MCP)

View file

@ -18,4 +18,3 @@ We are working on adding a few more APIs to complete the application lifecycle.
- **Batch Inference**: run inference on a dataset of inputs - **Batch Inference**: run inference on a dataset of inputs
- **Batch Agents**: run agents on a dataset of inputs - **Batch Agents**: run agents on a dataset of inputs
- **Synthetic Data Generation**: generate synthetic data for model development - **Synthetic Data Generation**: generate synthetic data for model development
- **Batches**: OpenAI-compatible batch management for inference

View file

@ -131,7 +131,6 @@ html_static_path = ["../_static"]
def setup(app): def setup(app):
app.add_css_file("css/my_theme.css") app.add_css_file("css/my_theme.css")
app.add_js_file("js/detect_theme.js") app.add_js_file("js/detect_theme.js")
app.add_js_file("js/keyboard_shortcuts.js")
def dockerhub_role(name, rawtext, text, lineno, inliner, options={}, content=[]): def dockerhub_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
url = f"https://hub.docker.com/r/llamastack/{text}" url = f"https://hub.docker.com/r/llamastack/{text}"

View file

@ -2,38 +2,14 @@
```{include} ../../../CONTRIBUTING.md ```{include} ../../../CONTRIBUTING.md
``` ```
## Adding a New Provider See the [Adding a New API Provider](new_api_provider.md) which describes how to add new API providers to the Stack.
See:
- [Adding a New API Provider Page](new_api_provider.md) which describes how to add new API providers to the Stack.
- [Vector Database Page](new_vector_database.md) which describes how to add a new vector databases with Llama Stack.
- [External Provider Page](../providers/external/index.md) which describes how to add external providers to the Stack.
```{toctree} ```{toctree}
:maxdepth: 1 :maxdepth: 1
:hidden: :hidden:
new_api_provider new_api_provider
new_vector_database testing
```
## Testing
```{include} ../../../tests/README.md
```
## Advanced Topics
For developers who need deeper understanding of the testing system internals:
```{toctree}
:maxdepth: 1
testing/record-replay
```
### Benchmarking
```{include} ../../../docs/source/distributions/k8s-benchmark/README.md
``` ```

View file

@ -1,75 +0,0 @@
# Adding a New Vector Database
This guide will walk you through the process of adding a new vector database to Llama Stack.
> **_NOTE:_** Here's an example Pull Request of the [Milvus Vector Database Provider](https://github.com/meta-llama/llama-stack/pull/1467).
Vector Database providers are used to store and retrieve vector embeddings. Vector databases are not limited to vector
search but can support keyword and hybrid search. Additionally, vector databases can also support operations like
filtering, sorting, and aggregating vectors.
## Steps to Add a New Vector Database Provider
1. **Choose the Database Type**: Determine if your vector database is a remote service, inline, or both.
- Remote databases make requests to external services, while inline databases execute locally. Some providers support both.
2. **Implement the Provider**: Create a new provider class that inherits from `VectorDatabaseProvider` and implements the required methods.
   - Implement methods for vector storage, retrieval, search, and any additional features your database supports (a minimal skeleton of these classes is sketched at the end of this guide).
- You will need to implement the following methods for `YourVectorIndex`:
- `YourVectorIndex.create()`
- `YourVectorIndex.initialize()`
- `YourVectorIndex.add_chunks()`
- `YourVectorIndex.delete_chunk()`
- `YourVectorIndex.query_vector()`
- `YourVectorIndex.query_keyword()`
- `YourVectorIndex.query_hybrid()`
- You will need to implement the following methods for `YourVectorIOAdapter`:
- `YourVectorIOAdapter.initialize()`
- `YourVectorIOAdapter.shutdown()`
- `YourVectorIOAdapter.list_vector_dbs()`
- `YourVectorIOAdapter.register_vector_db()`
- `YourVectorIOAdapter.unregister_vector_db()`
- `YourVectorIOAdapter.insert_chunks()`
- `YourVectorIOAdapter.query_chunks()`
- `YourVectorIOAdapter.delete_chunks()`
3. **Add to Registry**: Register your provider in the appropriate registry file.
- Update {repopath}`llama_stack/providers/registry/vector_io.py` to include your new provider.
```python
from llama_stack.providers.registry.specs import InlineProviderSpec
from llama_stack.providers.registry.api import Api
InlineProviderSpec(
api=Api.vector_io,
provider_type="inline::milvus",
pip_packages=["pymilvus>=2.4.10"],
module="llama_stack.providers.inline.vector_io.milvus",
config_class="llama_stack.providers.inline.vector_io.milvus.MilvusVectorIOConfig",
api_dependencies=[Api.inference],
optional_api_dependencies=[Api.files],
description="",
),
```
4. **Add Tests**: Create unit tests and integration tests for your provider in the `tests/` directory.
- Unit Tests
- By following the structure of the class methods, you will be able to easily run unit and integration tests for your database.
       1. You have to configure the tests for your provider in `/tests/unit/providers/vector_io/conftest.py`.
       2. Update the `vector_provider` fixture to include your provider if it is an inline provider.
3. Create a `your_vectorprovider_index` fixture that initializes your vector index.
4. Create a `your_vectorprovider_adapter` fixture that initializes your vector adapter.
5. Add your provider to the `vector_io_providers` fixture dictionary.
- Please follow the naming convention of `your_vectorprovider_index` and `your_vectorprovider_adapter` as the tests require this to execute properly.
- Integration Tests
   - Integration tests are located in {repopath}`tests/integration`. These tests use the Python client SDK APIs (from the `llama_stack_client` package) to test functionality.
   - The two sets of integration tests are:
- `tests/integration/vector_io/test_vector_io.py`: This file tests registration, insertion, and retrieval.
- `tests/integration/vector_io/test_openai_vector_stores.py`: These tests are for OpenAI-compatible vector stores and test the OpenAI API compatibility.
- You will need to update `skip_if_provider_doesnt_support_openai_vector_stores` to include your provider as well as `skip_if_provider_doesnt_support_openai_vector_stores_search` to test the appropriate search functionality.
- Running the tests in the GitHub CI
- You will need to update the `.github/workflows/integration-vector-io-tests.yml` file to include your provider.
- If your provider is a remote provider, you will also have to add a container to spin up and run it in the action.
   - Updating the pyproject.toml
- If you are adding tests for the `inline` provider you will have to update the `unit` group.
- `uv add new_pip_package --group unit`
- If you are adding tests for the `remote` provider you will have to update the `test` group, which is used in the GitHub CI for integration tests.
- `uv add new_pip_package --group test`
5. **Update Documentation**: Please update the documentation for end users
- Generate the provider documentation by running {repopath}`./scripts/provider_codegen.py`.
- Update the autogenerated content in the registry/vector_io.py file with information about your provider. Please see other providers for examples.
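To tie steps 2 and 3 together, here is a hedged skeleton of the two classes referenced in step 2. Only the class and method names come from this guide; the signatures and bodies are illustrative placeholders, not the real interface.

```python
class YourVectorIndex:
    """Illustrative stub; real signatures come from the provider interfaces in llama_stack."""

    @classmethod
    async def create(cls, *args, **kwargs) -> "YourVectorIndex": ...

    async def initialize(self) -> None: ...
    async def add_chunks(self, chunks) -> None: ...
    async def delete_chunk(self, chunk_id: str) -> None: ...
    async def query_vector(self, embedding, k: int = 10): ...
    async def query_keyword(self, query: str, k: int = 10): ...
    async def query_hybrid(self, embedding, query: str, k: int = 10): ...


class YourVectorIOAdapter:
    """Illustrative stub for the adapter-level operations listed in step 2."""

    async def initialize(self) -> None: ...
    async def shutdown(self) -> None: ...
    async def list_vector_dbs(self): ...
    async def register_vector_db(self, vector_db) -> None: ...
    async def unregister_vector_db(self, vector_db_id: str) -> None: ...
    async def insert_chunks(self, vector_db_id: str, chunks) -> None: ...
    async def query_chunks(self, vector_db_id: str, query, params=None): ...
    async def delete_chunks(self, vector_db_id: str, chunk_ids) -> None: ...
```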

View file

@ -0,0 +1,6 @@
# Testing Llama Stack
Tests are of three different kinds:
- Unit tests
- Provider focused integration tests
- Client SDK tests

View file

@ -1,234 +0,0 @@
# Record-Replay System
Understanding how Llama Stack captures and replays API interactions for testing.
## Overview
The record-replay system solves a fundamental challenge in AI testing: how do you test against expensive, non-deterministic APIs without breaking the bank or dealing with flaky tests?
The solution: intercept API calls, store real responses, and replay them later. This gives you real API behavior without the cost or variability.
## How It Works
### Request Hashing
Every API request gets converted to a deterministic hash for lookup:
```python
def normalize_request(method: str, url: str, headers: dict, body: dict) -> str:
normalized = {
"method": method.upper(),
"endpoint": urlparse(url).path, # Just the path, not full URL
"body": body, # Request parameters
}
return hashlib.sha256(json.dumps(normalized, sort_keys=True).encode()).hexdigest()
```
**Key insight:** The hashing is intentionally precise. Different whitespace, float precision, or parameter order produces different hashes. This prevents subtle bugs from false cache hits.
```python
# These produce DIFFERENT hashes:
{"content": "Hello world"}
{"content": "Hello world\n"}
{"temperature": 0.7}
{"temperature": 0.7000001}
```
### Client Interception
The system patches OpenAI and Ollama client methods to intercept calls before they leave your application. This happens transparently - your test code doesn't change.
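A minimal sketch of that idea, assuming a `recorder` object that dispatches between record and replay; `recorder.handle` is a hypothetical stand-in, not the real internal API.

```python
import functools


def patch_chat_completions(client, recorder):
    # Wrap the client's create() so every call flows through the recorder, which
    # either replays a stored response or performs and records the real call.
    original = client.chat.completions.create

    @functools.wraps(original)
    async def wrapper(*args, **kwargs):
        return await recorder.handle(original, args, kwargs)  # hypothetical dispatch

    client.chat.completions.create = wrapper
```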
### Storage Architecture
Recordings use a two-tier storage system optimized for both speed and debuggability:
```
recordings/
├── index.sqlite # Fast lookup by request hash
└── responses/
├── abc123def456.json # Individual response files
└── def789ghi012.json
```
**SQLite index** enables O(log n) hash lookups and metadata queries without loading response bodies.
**JSON files** store complete request/response pairs in human-readable format for debugging.
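A hedged sketch of a replay lookup over this layout; the `request_hash` column name is an assumption, while the `recordings` table and hash-named JSON files match what is shown in this document.

```python
import json
import sqlite3
from pathlib import Path


def load_recording(storage_dir: str, request_hash: str):
    # Consult the index first -- fast, and no response bodies are loaded.
    index = sqlite3.connect(Path(storage_dir) / "index.sqlite")
    row = index.execute(
        "SELECT endpoint, model, timestamp FROM recordings WHERE request_hash = ?",
        (request_hash,),
    ).fetchone()
    if row is None:
        return None  # REPLAY mode treats this as "no recording found"
    # Then read the full request/response pair from the hash-named JSON file.
    with open(Path(storage_dir) / "responses" / f"{request_hash}.json") as f:
        return json.load(f)
```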
## Recording Modes
### LIVE Mode
Direct API calls with no recording or replay:
```python
with inference_recording(mode=InferenceMode.LIVE):
response = await client.chat.completions.create(...)
```
Use for initial development and debugging against real APIs.
### RECORD Mode
Captures API interactions while passing through real responses:
```python
with inference_recording(mode=InferenceMode.RECORD, storage_dir="./recordings"):
response = await client.chat.completions.create(...)
# Real API call made, response captured AND returned
```
The recording process (a hedged sketch of this path follows the list):
1. Request intercepted and hashed
2. Real API call executed
3. Response captured and serialized
4. Recording stored to disk
5. Original response returned to caller
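A hedged sketch of that path for a non-streaming call; `call_real_api` and the `storage` parameter are stand-ins, while `normalize_request`, `_serialize_response`, and `store_recording` appear elsewhere in this document.

```python
async def handle_record(method, url, headers, body, call_real_api, storage):
    # 1. hash the request with the same normalization shown earlier
    request_hash = normalize_request(method, url, headers, body)
    # 2. make the real API call
    response = await call_real_api(body)
    # 3.-4. serialize the response and persist the request/response pair
    storage.store_recording(
        request_hash,
        {"method": method, "url": url, "body": body},
        {"body": _serialize_response(response), "is_streaming": False},
    )
    # 5. return the original response to the caller
    return response
```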
### REPLAY Mode
Returns stored responses instead of making API calls:
```python
with inference_recording(mode=InferenceMode.REPLAY, storage_dir="./recordings"):
response = await client.chat.completions.create(...)
# No API call made, cached response returned instantly
```
The replay process:
1. Request intercepted and hashed
2. Hash looked up in SQLite index
3. Response loaded from JSON file
4. Response deserialized and returned
5. Error if no recording found
## Streaming Support
Streaming APIs present a unique challenge: how do you capture an async generator?
### The Problem
```python
# How do you record this?
async for chunk in client.chat.completions.create(stream=True):
process(chunk)
```
### The Solution
The system captures all chunks immediately before yielding any:
```python
async def handle_streaming_record(response):
# Capture complete stream first
chunks = []
async for chunk in response:
chunks.append(chunk)
# Store complete recording
storage.store_recording(
request_hash, request_data, {"body": chunks, "is_streaming": True}
)
# Return generator that replays captured chunks
async def replay_stream():
for chunk in chunks:
yield chunk
return replay_stream()
```
This ensures:
- **Complete capture** - The entire stream is saved atomically
- **Interface preservation** - The returned object behaves like the original API
- **Deterministic replay** - Same chunks in the same order every time
## Serialization
API responses contain complex Pydantic objects that need careful serialization:
```python
def _serialize_response(response):
if hasattr(response, "model_dump"):
# Preserve type information for proper deserialization
return {
"__type__": f"{response.__class__.__module__}.{response.__class__.__qualname__}",
"__data__": response.model_dump(mode="json"),
}
return response
```
This preserves type safety - when replayed, you get the same Pydantic objects with all their validation and methods.
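The replay path presumably reverses this. A hedged sketch, with the helper name chosen to mirror `_serialize_response`; the real code may differ.

```python
import importlib


def _deserialize_response(data):
    # Rebuild the original Pydantic object from the stored type path and payload.
    if isinstance(data, dict) and "__type__" in data:
        module_name, _, class_name = data["__type__"].rpartition(".")
        cls = getattr(importlib.import_module(module_name), class_name)
        return cls.model_validate(data["__data__"])  # assumes a top-level Pydantic class
    return data
```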
## Environment Integration
### Environment Variables
Control recording behavior globally:
```bash
export LLAMA_STACK_TEST_INFERENCE_MODE=replay
export LLAMA_STACK_TEST_RECORDING_DIR=/path/to/recordings
pytest tests/integration/
```
### Pytest Integration
The system integrates automatically based on environment variables, requiring no changes to test code.
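A hedged sketch of what that glue could look like in a `conftest.py`; the fixture name is invented, and `inference_recording` / `InferenceMode` are the helpers shown earlier (their import path is omitted here).

```python
import os

import pytest


@pytest.fixture(autouse=True)
def _recorded_inference():
    # Assumed wiring: read the env vars and enter the recording context around each test.
    mode = os.environ.get("LLAMA_STACK_TEST_INFERENCE_MODE", "live").upper()
    storage_dir = os.environ.get("LLAMA_STACK_TEST_RECORDING_DIR", "recordings")
    with inference_recording(mode=InferenceMode[mode], storage_dir=storage_dir):
        yield
```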
## Debugging Recordings
### Inspecting Storage
```bash
# See what's recorded
sqlite3 recordings/index.sqlite "SELECT endpoint, model, timestamp FROM recordings LIMIT 10;"
# View specific response
cat recordings/responses/abc123def456.json | jq '.response.body'
# Find recordings by endpoint
sqlite3 recordings/index.sqlite "SELECT * FROM recordings WHERE endpoint='/v1/chat/completions';"
```
### Common Issues
**Hash mismatches:** Request parameters changed slightly between record and replay
```bash
# Compare request details
cat recordings/responses/abc123.json | jq '.request'
```
**Serialization errors:** Response types changed between versions
```bash
# Re-record with updated types
rm recordings/responses/failing_hash.json
LLAMA_STACK_TEST_INFERENCE_MODE=record pytest test_failing.py
```
**Missing recordings:** New test or changed parameters
```bash
# Record the missing interaction
LLAMA_STACK_TEST_INFERENCE_MODE=record pytest test_new.py
```
## Design Decisions
### Why Not Mocks?
Traditional mocking breaks down with AI APIs because:
- Response structures are complex and evolve frequently
- Streaming behavior is hard to mock correctly
- Edge cases in real APIs get missed
- Mocks become brittle maintenance burdens
### Why Precise Hashing?
Loose hashing (normalizing whitespace, rounding floats) seems convenient but hides bugs. If a test changes slightly, you want to know about it rather than accidentally getting the wrong cached response.
### Why JSON + SQLite?
- **JSON** - Human readable, diff-friendly, easy to inspect and modify
- **SQLite** - Fast indexed lookups without loading response bodies
- **Hybrid** - Best of both worlds for different use cases
This system provides reliable, fast testing against real AI APIs while maintaining the ability to debug issues when they arise.

View file

@ -53,31 +53,24 @@ The main points to consider are:
``` ```
llama stack build -h llama stack build -h
usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--distro DISTRIBUTION] [--list-distros] [--image-type {container,venv}] [--image-name IMAGE_NAME] [--print-deps-only] usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--list-templates] [--image-type {container,venv}] [--image-name IMAGE_NAME] [--print-deps-only] [--run]
[--run] [--providers PROVIDERS]
Build a Llama stack container Build a Llama stack container
options: options:
-h, --help show this help message and exit -h, --help show this help message and exit
--config CONFIG Path to a config file to use for the build. You can find example configs in llama_stack.cores/**/build.yaml. If this argument is not provided, you will be prompted to --config CONFIG Path to a config file to use for the build. You can find example configs in llama_stack.cores/**/build.yaml. If this argument is not provided, you will
enter information interactively (default: None) be prompted to enter information interactively (default: None)
--template TEMPLATE (deprecated) Name of the example template config to use for build. You may use `llama stack build --list-distros` to check out the available distributions (default: --template TEMPLATE Name of the example template config to use for build. You may use `llama stack build --list-templates` to check out the available templates (default: None)
None) --list-templates Show the available templates for building a Llama Stack distribution (default: False)
--distro DISTRIBUTION, --distribution DISTRIBUTION
Name of the distribution to use for build. You may use `llama stack build --list-distros` to check out the available distributions (default: None)
--list-distros, --list-distributions
Show the available distributions for building a Llama Stack distribution (default: False)
--image-type {container,venv} --image-type {container,venv}
Image Type to use for the build. If not specified, will use the image type from the template config. (default: None) Image Type to use for the build. If not specified, will use the image type from the template config. (default: None)
--image-name IMAGE_NAME --image-name IMAGE_NAME
[for image-type=container|venv] Name of the virtual environment to use for the build. If not specified, currently active environment will be used if found. (default: [for image-type=container|venv] Name of the virtual environment to use for the build. If not specified, currently active environment will be used if
None) found. (default: None)
--print-deps-only Print the dependencies for the stack only, without building the stack (default: False) --print-deps-only Print the dependencies for the stack only, without building the stack (default: False)
--run Run the stack after building using the same image type, name, and other applicable arguments (default: False) --run Run the stack after building using the same image type, name, and other applicable arguments (default: False)
--providers PROVIDERS
Build a config for a list of providers and only those providers. This list is formatted like: api1=provider1,api2=provider2. Where there can be multiple providers per
API. (default: None)
``` ```
After this step is complete, a file named `<name>-build.yaml` and template file `<name>-run.yaml` will be generated and saved at the output file path specified at the end of the command. After this step is complete, a file named `<name>-build.yaml` and template file `<name>-run.yaml` will be generated and saved at the output file path specified at the end of the command.

View file

@ -1,156 +0,0 @@
# Llama Stack Benchmark Suite on Kubernetes
## Motivation
Performance benchmarking is critical for understanding the overhead and characteristics of the Llama Stack abstraction layer compared to direct inference engines like vLLM.
### Why This Benchmark Suite Exists
**Performance Validation**: The Llama Stack provides a unified API layer across multiple inference providers, but this abstraction introduces potential overhead. This benchmark suite quantifies the performance impact by comparing:
- Llama Stack inference (with vLLM backend)
- Direct vLLM inference calls
- Both under identical Kubernetes deployment conditions
**Production Readiness Assessment**: Real-world deployments require understanding performance characteristics under load. This suite simulates concurrent user scenarios with configurable parameters (duration, concurrency, request patterns) to validate production readiness.
**Regression Detection (TODO)**: As the Llama Stack evolves, this benchmark provides automated regression detection for performance changes. CI/CD pipelines can leverage these benchmarks to catch performance degradations before production deployments.
**Resource Planning**: By measuring throughput, latency percentiles, and resource utilization patterns, teams can make informed decisions about:
- Kubernetes resource allocation (CPU, memory, GPU)
- Auto-scaling configurations
- Cost optimization strategies
### Key Metrics Captured
The benchmark suite measures critical performance indicators:
- **Throughput**: Requests per second under sustained load
- **Latency Distribution**: P50, P95, P99 response times
- **Time to First Token (TTFT)**: Critical for streaming applications
- **Error Rates**: Request failures and timeout analysis
This data enables data-driven architectural decisions and performance optimization efforts.
## Setup
**1. Deploy base k8s infrastructure:**
```bash
cd ../k8s
./apply.sh
```
**2. Deploy benchmark components:**
```bash
cd ../k8s-benchmark
./apply.sh
```
**3. Verify deployment:**
```bash
kubectl get pods
# Should see: llama-stack-benchmark-server, vllm-server, etc.
```
## Quick Start
### Basic Benchmarks
**Benchmark Llama Stack (default):**
```bash
cd docs/source/distributions/k8s-benchmark/
./run-benchmark.sh
```
**Benchmark vLLM direct:**
```bash
./run-benchmark.sh --target vllm
```
### Custom Configuration
**Extended benchmark with high concurrency:**
```bash
./run-benchmark.sh --target vllm --duration 120 --concurrent 20
```
**Short test run:**
```bash
./run-benchmark.sh --target stack --duration 30 --concurrent 5
```
## Command Reference
### run-benchmark.sh Options
```bash
./run-benchmark.sh [options]
Options:
-t, --target <stack|vllm> Target to benchmark (default: stack)
-d, --duration <seconds> Duration in seconds (default: 60)
-c, --concurrent <users> Number of concurrent users (default: 10)
-h, --help Show help message
Examples:
./run-benchmark.sh --target vllm # Benchmark vLLM direct
./run-benchmark.sh --target stack # Benchmark Llama Stack
./run-benchmark.sh -t vllm -d 120 -c 20 # vLLM with 120s, 20 users
```
## Local Testing
### Running Benchmark Locally
For local development without Kubernetes:
**1. Start OpenAI mock server:**
```bash
uv run python openai-mock-server.py --port 8080
```
**2. Run benchmark against mock server:**
```bash
uv run python benchmark.py \
--base-url http://localhost:8080/v1 \
--model mock-inference \
--duration 30 \
--concurrent 5
```
**3. Test against local vLLM server:**
```bash
# If you have vLLM running locally on port 8000
uv run python benchmark.py \
--base-url http://localhost:8000/v1 \
--model meta-llama/Llama-3.2-3B-Instruct \
--duration 30 \
--concurrent 5
```
**4. Profile the running server:**
```bash
./profile_running_server.sh
```
### OpenAI Mock Server
The `openai-mock-server.py` provides:
- **OpenAI-compatible API** for testing without real models
- **Configurable streaming delay** via `STREAM_DELAY_SECONDS` env var
- **Consistent responses** for reproducible benchmarks
- **Lightweight testing** without GPU requirements
**Mock server usage:**
```bash
uv run python openai-mock-server.py --port 8080
```
The mock server is also deployed in k8s as `openai-mock-service:8080` and can be used by changing the Llama Stack configuration to use the `mock-vllm-inference` provider.
## Files in this Directory
- `benchmark.py` - Core benchmark script with async streaming support
- `run-benchmark.sh` - Main script with target selection and configuration
- `openai-mock-server.py` - Mock OpenAI API server for local testing
- `README.md` - This documentation file

View file

@ -1,36 +0,0 @@
#!/usr/bin/env bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Deploys the benchmark-specific components on top of the base k8s deployment (../k8s/apply.sh).
export STREAM_DELAY_SECONDS=0.005
export POSTGRES_USER=llamastack
export POSTGRES_DB=llamastack
export POSTGRES_PASSWORD=llamastack
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
export MOCK_INFERENCE_MODEL=mock-inference
export MOCK_INFERENCE_URL=openai-mock-service:8080
export BENCHMARK_INFERENCE_MODEL=$INFERENCE_MODEL
set -euo pipefail
set -x
# Deploy benchmark-specific components
kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
--dry-run=client -o yaml > stack-configmap.yaml
kubectl apply --validate=false -f stack-configmap.yaml
# Deploy our custom llama stack server (overriding the base one)
envsubst < stack-k8s.yaml.template | kubectl apply --validate=false -f -

View file

@ -1,267 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""
Simple benchmark script for Llama Stack with OpenAI API compatibility.
"""
import argparse
import asyncio
import os
import random
import statistics
import time
from typing import Tuple
import aiohttp
class BenchmarkStats:
def __init__(self):
self.response_times = []
self.ttft_times = []
self.chunks_received = []
self.errors = []
self.success_count = 0
self.total_requests = 0
self.concurrent_users = 0
self.start_time = None
self.end_time = None
self._lock = asyncio.Lock()
async def add_result(self, response_time: float, chunks: int, ttft: float = None, error: str = None):
async with self._lock:
self.total_requests += 1
if error:
self.errors.append(error)
else:
self.success_count += 1
self.response_times.append(response_time)
self.chunks_received.append(chunks)
if ttft is not None:
self.ttft_times.append(ttft)
def print_summary(self):
if not self.response_times:
print("No successful requests to report")
if self.errors:
print(f"Total errors: {len(self.errors)}")
print("First 5 errors:")
for error in self.errors[:5]:
print(f" {error}")
return
total_time = self.end_time - self.start_time
success_rate = (self.success_count / self.total_requests) * 100
print(f"\n{'='*60}")
print(f"BENCHMARK RESULTS")
print(f"{'='*60}")
print(f"Total time: {total_time:.2f}s")
print(f"Concurrent users: {self.concurrent_users}")
print(f"Total requests: {self.total_requests}")
print(f"Successful requests: {self.success_count}")
print(f"Failed requests: {len(self.errors)}")
print(f"Success rate: {success_rate:.1f}%")
print(f"Requests per second: {self.success_count / total_time:.2f}")
print(f"\nResponse Time Statistics:")
print(f" Mean: {statistics.mean(self.response_times):.3f}s")
print(f" Median: {statistics.median(self.response_times):.3f}s")
print(f" Min: {min(self.response_times):.3f}s")
print(f" Max: {max(self.response_times):.3f}s")
if len(self.response_times) > 1:
print(f" Std Dev: {statistics.stdev(self.response_times):.3f}s")
percentiles = [50, 90, 95, 99]
sorted_times = sorted(self.response_times)
print(f"\nPercentiles:")
for p in percentiles:
idx = int(len(sorted_times) * p / 100) - 1
idx = max(0, min(idx, len(sorted_times) - 1))
print(f" P{p}: {sorted_times[idx]:.3f}s")
if self.ttft_times:
print(f"\nTime to First Token (TTFT) Statistics:")
print(f" Mean: {statistics.mean(self.ttft_times):.3f}s")
print(f" Median: {statistics.median(self.ttft_times):.3f}s")
print(f" Min: {min(self.ttft_times):.3f}s")
print(f" Max: {max(self.ttft_times):.3f}s")
if len(self.ttft_times) > 1:
print(f" Std Dev: {statistics.stdev(self.ttft_times):.3f}s")
sorted_ttft = sorted(self.ttft_times)
print(f"\nTTFT Percentiles:")
for p in percentiles:
idx = int(len(sorted_ttft) * p / 100) - 1
idx = max(0, min(idx, len(sorted_ttft) - 1))
print(f" P{p}: {sorted_ttft[idx]:.3f}s")
if self.chunks_received:
print(f"\nStreaming Statistics:")
print(f" Mean chunks per response: {statistics.mean(self.chunks_received):.1f}")
print(f" Total chunks received: {sum(self.chunks_received)}")
if self.errors:
print(f"\nErrors (showing first 5):")
for error in self.errors[:5]:
print(f" {error}")
class LlamaStackBenchmark:
def __init__(self, base_url: str, model_id: str):
self.base_url = base_url.rstrip('/')
self.model_id = model_id
self.headers = {"Content-Type": "application/json"}
self.test_messages = [
[{"role": "user", "content": "Hi"}],
[{"role": "user", "content": "What is the capital of France?"}],
[{"role": "user", "content": "Explain quantum physics in simple terms."}],
[{"role": "user", "content": "Write a short story about a robot learning to paint."}],
[
{"role": "user", "content": "What is machine learning?"},
{"role": "assistant", "content": "Machine learning is a subset of AI..."},
{"role": "user", "content": "Can you give me a practical example?"}
]
]
async def make_async_streaming_request(self) -> Tuple[float, int, float | None, str | None]:
"""Make a single async streaming chat completion request."""
messages = random.choice(self.test_messages)
payload = {
"model": self.model_id,
"messages": messages,
"stream": True,
"max_tokens": 100
}
start_time = time.time()
chunks_received = 0
ttft = None
error = None
session = aiohttp.ClientSession()
try:
async with session.post(
f"{self.base_url}/chat/completions",
headers=self.headers,
json=payload,
timeout=aiohttp.ClientTimeout(total=30)
) as response:
if response.status == 200:
async for line in response.content:
if line:
line_str = line.decode('utf-8').strip()
if line_str.startswith('data: '):
chunks_received += 1
if ttft is None:
ttft = time.time() - start_time
if line_str == 'data: [DONE]':
break
if chunks_received == 0:
error = "No streaming chunks received"
else:
text = await response.text()
error = f"HTTP {response.status}: {text[:100]}"
except Exception as e:
error = f"Request error: {str(e)}"
finally:
await session.close()
response_time = time.time() - start_time
return response_time, chunks_received, ttft, error
async def run_benchmark(self, duration: int, concurrent_users: int) -> BenchmarkStats:
"""Run benchmark using async requests for specified duration."""
stats = BenchmarkStats()
stats.concurrent_users = concurrent_users
stats.start_time = time.time()
print(f"Starting benchmark: {duration}s duration, {concurrent_users} concurrent users")
print(f"Target URL: {self.base_url}/chat/completions")
print(f"Model: {self.model_id}")
connector = aiohttp.TCPConnector(limit=concurrent_users)
async with aiohttp.ClientSession(connector=connector) as session:
async def worker(worker_id: int):
"""Worker that sends requests sequentially until canceled."""
request_count = 0
while True:
try:
response_time, chunks, ttft, error = await self.make_async_streaming_request()
await stats.add_result(response_time, chunks, ttft, error)
request_count += 1
except asyncio.CancelledError:
break
except Exception as e:
await stats.add_result(0, 0, None, f"Worker {worker_id} error: {str(e)}")
# Progress reporting task
async def progress_reporter():
last_report_time = time.time()
while True:
try:
await asyncio.sleep(1) # Report every second
if time.time() >= last_report_time + 10: # Report every 10 seconds
elapsed = time.time() - stats.start_time
print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s")
last_report_time = time.time()
except asyncio.CancelledError:
break
# Spawn concurrent workers
tasks = [asyncio.create_task(worker(i)) for i in range(concurrent_users)]
progress_task = asyncio.create_task(progress_reporter())
tasks.append(progress_task)
# Wait for duration then cancel all tasks
await asyncio.sleep(duration)
for task in tasks:
task.cancel()
# Wait for all tasks to complete
await asyncio.gather(*tasks, return_exceptions=True)
stats.end_time = time.time()
return stats
def main():
parser = argparse.ArgumentParser(description="Llama Stack Benchmark Tool")
parser.add_argument("--base-url", default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)")
parser.add_argument("--model", default=os.getenv("INFERENCE_MODEL", "test-model"),
help="Model ID to use for requests")
parser.add_argument("--duration", type=int, default=60,
help="Duration in seconds to run benchmark (default: 60)")
parser.add_argument("--concurrent", type=int, default=10,
help="Number of concurrent users (default: 10)")
args = parser.parse_args()
benchmark = LlamaStackBenchmark(args.base_url, args.model)
try:
stats = asyncio.run(benchmark.run_benchmark(args.duration, args.concurrent))
stats.print_summary()
except KeyboardInterrupt:
print("\nBenchmark interrupted by user")
except Exception as e:
print(f"Benchmark failed: {e}")
if __name__ == "__main__":
main()

View file

@ -1,190 +0,0 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""
OpenAI-compatible mock server that returns:
- Hardcoded /models response for consistent validation
- Valid OpenAI-formatted chat completion responses with dynamic content
"""
from flask import Flask, request, jsonify, Response
import time
import random
import uuid
import json
import argparse
import os
app = Flask(__name__)
# Models from environment variables
def get_models():
models_str = os.getenv("MOCK_MODELS", "meta-llama/Llama-3.2-3B-Instruct")
model_ids = [m.strip() for m in models_str.split(",") if m.strip()]
return {
"object": "list",
"data": [
{
"id": model_id,
"object": "model",
"created": 1234567890,
"owned_by": "vllm"
}
for model_id in model_ids
]
}
def generate_random_text(length=50):
"""Generate random but coherent text for responses."""
words = [
"Hello", "there", "I'm", "an", "AI", "assistant", "ready", "to", "help", "you",
"with", "your", "questions", "and", "tasks", "today", "Let", "me","know", "what",
"you'd", "like", "to", "discuss", "or", "explore", "together", "I", "can", "assist",
"with", "various", "topics", "including", "coding", "writing", "analysis", "and", "more"
]
return " ".join(random.choices(words, k=length))
@app.route('/v1/models', methods=['GET'])
def list_models():
models = get_models()
print(f"[MOCK] Returning models: {[m['id'] for m in models['data']]}")
return jsonify(models)
@app.route('/v1/chat/completions', methods=['POST'])
def chat_completions():
"""Return OpenAI-formatted chat completion responses."""
data = request.get_json()
default_model = get_models()['data'][0]['id']
model = data.get('model', default_model)
messages = data.get('messages', [])
stream = data.get('stream', False)
print(f"[MOCK] Chat completion request - model: {model}, stream: {stream}")
if stream:
return handle_streaming_completion(model, messages)
else:
return handle_non_streaming_completion(model, messages)
def handle_non_streaming_completion(model, messages):
response_text = generate_random_text(random.randint(20, 80))
# Calculate realistic token counts
prompt_tokens = sum(len(str(msg.get('content', '')).split()) for msg in messages)
completion_tokens = len(response_text.split())
response = {
"id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
"object": "chat.completion",
"created": int(time.time()),
"model": model,
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": response_text
},
"finish_reason": "stop"
}
],
"usage": {
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"total_tokens": prompt_tokens + completion_tokens
}
}
return jsonify(response)
def handle_streaming_completion(model, messages):
def generate_stream():
# Generate response text
full_response = generate_random_text(random.randint(30, 100))
words = full_response.split()
# Send initial chunk
initial_chunk = {
"id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
"object": "chat.completion.chunk",
"created": int(time.time()),
"model": model,
"choices": [
{
"index": 0,
"delta": {"role": "assistant", "content": ""}
}
]
}
yield f"data: {json.dumps(initial_chunk)}\n\n"
# Send word by word
for i, word in enumerate(words):
chunk = {
"id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
"object": "chat.completion.chunk",
"created": int(time.time()),
"model": model,
"choices": [
{
"index": 0,
"delta": {"content": f"{word} " if i < len(words) - 1 else word}
}
]
}
yield f"data: {json.dumps(chunk)}\n\n"
# Configurable delay to simulate realistic streaming
stream_delay = float(os.getenv("STREAM_DELAY_SECONDS", "0.005"))
time.sleep(stream_delay)
# Send final chunk
final_chunk = {
"id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
"object": "chat.completion.chunk",
"created": int(time.time()),
"model": model,
"choices": [
{
"index": 0,
"delta": {"content": ""},
"finish_reason": "stop"
}
]
}
yield f"data: {json.dumps(final_chunk)}\n\n"
yield "data: [DONE]\n\n"
return Response(
generate_stream(),
mimetype='text/event-stream',
headers={
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Access-Control-Allow-Origin': '*',
}
)
@app.route('/health', methods=['GET'])
def health():
return jsonify({"status": "healthy", "type": "openai-mock"})
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='OpenAI-compatible mock server')
parser.add_argument('--port', type=int, default=8081,
help='Port to run the server on (default: 8081)')
args = parser.parse_args()
port = args.port
models = get_models()
print("Starting OpenAI-compatible mock server...")
print(f"- /models endpoint with: {[m['id'] for m in models['data']]}")
print("- OpenAI-formatted chat/completion responses with dynamic content")
print("- Streaming support with valid SSE format")
print(f"- Listening on: http://0.0.0.0:{port}")
app.run(host='0.0.0.0', port=port, debug=False)

View file

@ -1,52 +0,0 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Script to profile an already running Llama Stack server
# Usage: ./profile_running_server.sh [duration_seconds] [output_file]
DURATION=${1:-60} # Default 60 seconds
OUTPUT_FILE=${2:-"llama_stack_profile"} # Default output file
echo "Looking for running Llama Stack server..."
# Find the server PID
SERVER_PID=$(ps aux | grep "llama_stack.core.server.server" | grep -v grep | awk '{print $2}' | head -1)
if [ -z "$SERVER_PID" ]; then
echo "Error: No running Llama Stack server found"
echo "Please start your server first with:"
echo "LLAMA_STACK_LOGGING=\"all=ERROR\" MOCK_INFERENCE_URL=http://localhost:8080 SAFETY_MODEL=llama-guard3:1b uv run --with llama-stack python -m llama_stack.core.server.server docs/source/distributions/k8s-benchmark/stack_run_config.yaml"
exit 1
fi
echo "Found Llama Stack server with PID: $SERVER_PID"
# Start py-spy profiling
echo "Starting py-spy profiling for ${DURATION} seconds..."
echo "Output will be saved to: ${OUTPUT_FILE}.svg"
echo ""
echo "You can now run your load test..."
echo ""
# Get the full path to py-spy
PYSPY_PATH=$(which py-spy)
# Check if running as root, if not, use sudo
if [ "$EUID" -ne 0 ]; then
echo "py-spy requires root permissions on macOS. Running with sudo..."
sudo "$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d ${DURATION} -p $SERVER_PID
else
"$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d ${DURATION} -p $SERVER_PID
fi
echo ""
echo "Profiling completed! Results saved to: ${OUTPUT_FILE}.svg"
echo ""
echo "To view the flame graph:"
echo "open ${OUTPUT_FILE}.svg"

View file

@ -1,148 +0,0 @@
#!/usr/bin/env bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
set -euo pipefail
# Default values
TARGET="stack"
DURATION=60
CONCURRENT=10
# Parse command line arguments
usage() {
echo "Usage: $0 [options]"
echo "Options:"
echo " -t, --target <stack|vllm> Target to benchmark (default: stack)"
echo " -d, --duration <seconds> Duration in seconds (default: 60)"
echo " -c, --concurrent <users> Number of concurrent users (default: 10)"
echo " -h, --help Show this help message"
echo ""
echo "Examples:"
echo " $0 --target vllm # Benchmark vLLM direct"
echo " $0 --target stack # Benchmark Llama Stack (default)"
echo " $0 -t vllm -d 120 -c 20 # vLLM with 120s duration, 20 users"
}
while [[ $# -gt 0 ]]; do
case $1 in
-t|--target)
TARGET="$2"
shift 2
;;
-d|--duration)
DURATION="$2"
shift 2
;;
-c|--concurrent)
CONCURRENT="$2"
shift 2
;;
-h|--help)
usage
exit 0
;;
*)
echo "Unknown option: $1"
usage
exit 1
;;
esac
done
# Validate target
if [[ "$TARGET" != "stack" && "$TARGET" != "vllm" ]]; then
echo "Error: Target must be 'stack' or 'vllm'"
usage
exit 1
fi
# Set configuration based on target
if [[ "$TARGET" == "vllm" ]]; then
BASE_URL="http://vllm-server:8000/v1"
JOB_NAME="vllm-benchmark-job"
echo "Benchmarking vLLM direct..."
else
BASE_URL="http://llama-stack-benchmark-service:8323/v1/openai/v1"
JOB_NAME="stack-benchmark-job"
echo "Benchmarking Llama Stack..."
fi
echo "Configuration:"
echo " Target: $TARGET"
echo " Base URL: $BASE_URL"
echo " Duration: ${DURATION}s"
echo " Concurrent users: $CONCURRENT"
echo ""
# Create temporary job yaml
TEMP_YAML="/tmp/benchmark-job-temp-$(date +%s).yaml"
cat > "$TEMP_YAML" << EOF
apiVersion: batch/v1
kind: Job
metadata:
name: $JOB_NAME
namespace: default
spec:
template:
spec:
containers:
- name: benchmark
image: python:3.11-slim
command: ["/bin/bash"]
args:
- "-c"
- |
pip install aiohttp &&
python3 /benchmark/benchmark.py \\
--base-url $BASE_URL \\
--model \${INFERENCE_MODEL} \\
--duration $DURATION \\
--concurrent $CONCURRENT
env:
- name: INFERENCE_MODEL
value: "meta-llama/Llama-3.2-3B-Instruct"
volumeMounts:
- name: benchmark-script
mountPath: /benchmark
resources:
requests:
memory: "256Mi"
cpu: "250m"
limits:
memory: "512Mi"
cpu: "500m"
volumes:
- name: benchmark-script
configMap:
name: benchmark-script
restartPolicy: Never
backoffLimit: 3
EOF
echo "Creating benchmark ConfigMap..."
kubectl create configmap benchmark-script \
--from-file=benchmark.py=benchmark.py \
--dry-run=client -o yaml | kubectl apply -f -
echo "Cleaning up any existing benchmark job..."
kubectl delete job $JOB_NAME 2>/dev/null || true
echo "Deploying benchmark Job..."
kubectl apply -f "$TEMP_YAML"
echo "Waiting for job to start..."
kubectl wait --for=condition=Ready pod -l job-name=$JOB_NAME --timeout=60s
echo "Following benchmark logs..."
kubectl logs -f job/$JOB_NAME
echo "Job completed. Checking final status..."
kubectl get job $JOB_NAME
# Clean up temporary file
rm -f "$TEMP_YAML"

View file

@ -1,133 +0,0 @@
apiVersion: v1
data:
stack_run_config.yaml: |
version: '2'
image_name: kubernetes-benchmark-demo
apis:
- agents
- inference
- safety
- telemetry
- tool_runtime
- vector_io
providers:
inference:
- provider_id: vllm-inference
provider_type: remote::vllm
config:
url: ${env.VLLM_URL:=http://localhost:8000/v1}
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: vllm-safety
provider_type: remote::vllm
config:
url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: sentence-transformers
provider_type: inline::sentence-transformers
config: {}
vector_io:
- provider_id: ${env.ENABLE_CHROMADB:+chromadb}
provider_type: remote::chromadb
config:
url: ${env.CHROMADB_URL:=}
kvstore:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config:
excluded_categories: []
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
responses_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
sinks: ${env.TELEMETRY_SINKS:=console}
tool_runtime:
- provider_id: brave-search
provider_type: remote::brave-search
config:
api_key: ${env.BRAVE_SEARCH_API_KEY:+}
max_results: 3
- provider_id: tavily-search
provider_type: remote::tavily-search
config:
api_key: ${env.TAVILY_SEARCH_API_KEY:+}
max_results: 3
- provider_id: rag-runtime
provider_type: inline::rag-runtime
config: {}
- provider_id: model-context-protocol
provider_type: remote::model-context-protocol
config: {}
metadata_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
table_name: llamastack_kvstore
inference_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
models:
- metadata:
embedding_dimension: 384
model_id: all-MiniLM-L6-v2
provider_id: sentence-transformers
model_type: embedding
- model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
model_type: llm
- model_id: ${env.SAFETY_MODEL}
provider_id: vllm-safety
model_type: llm
shields:
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
- toolgroup_id: builtin::rag
provider_id: rag-runtime
server:
port: 8323
kind: ConfigMap
metadata:
creationTimestamp: null
name: llama-stack-config

View file

@ -1,83 +0,0 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: llama-benchmark-pvc
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 1Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: llama-stack-benchmark-server
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: llama-stack-benchmark
app.kubernetes.io/component: server
template:
metadata:
labels:
app.kubernetes.io/name: llama-stack-benchmark
app.kubernetes.io/component: server
spec:
containers:
- name: llama-stack-benchmark
image: llamastack/distribution-starter:latest
imagePullPolicy: Always # since we have specified latest instead of a version
env:
- name: ENABLE_CHROMADB
value: "true"
- name: CHROMADB_URL
value: http://chromadb.default.svc.cluster.local:6000
- name: POSTGRES_HOST
value: postgres-server.default.svc.cluster.local
- name: POSTGRES_PORT
value: "5432"
- name: INFERENCE_MODEL
value: "${INFERENCE_MODEL}"
- name: SAFETY_MODEL
value: "${SAFETY_MODEL}"
- name: TAVILY_SEARCH_API_KEY
value: "${TAVILY_SEARCH_API_KEY}"
- name: VLLM_URL
value: http://vllm-server.default.svc.cluster.local:8000/v1
- name: VLLM_MAX_TOKENS
value: "3072"
- name: VLLM_SAFETY_URL
value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
- name: VLLM_TLS_VERIFY
value: "false"
command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8323"]
ports:
- containerPort: 8323
volumeMounts:
- name: llama-storage
mountPath: /root/.llama
- name: llama-config
mountPath: /etc/config
volumes:
- name: llama-storage
persistentVolumeClaim:
claimName: llama-benchmark-pvc
- name: llama-config
configMap:
name: llama-stack-config
---
apiVersion: v1
kind: Service
metadata:
name: llama-stack-benchmark-service
spec:
selector:
app.kubernetes.io/name: llama-stack-benchmark
app.kubernetes.io/component: server
ports:
- name: http
port: 8323
targetPort: 8323
type: ClusterIP

View file

@ -1,108 +0,0 @@
version: '2'
image_name: kubernetes-benchmark-demo
apis:
- agents
- inference
- telemetry
- tool_runtime
- vector_io
providers:
inference:
- provider_id: vllm-inference
provider_type: remote::vllm
config:
url: ${env.VLLM_URL:=http://localhost:8000/v1}
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: sentence-transformers
provider_type: inline::sentence-transformers
config: {}
vector_io:
- provider_id: ${env.ENABLE_CHROMADB:+chromadb}
provider_type: remote::chromadb
config:
url: ${env.CHROMADB_URL:=}
kvstore:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
responses_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
sinks: ${env.TELEMETRY_SINKS:=console}
tool_runtime:
- provider_id: brave-search
provider_type: remote::brave-search
config:
api_key: ${env.BRAVE_SEARCH_API_KEY:+}
max_results: 3
- provider_id: tavily-search
provider_type: remote::tavily-search
config:
api_key: ${env.TAVILY_SEARCH_API_KEY:+}
max_results: 3
- provider_id: rag-runtime
provider_type: inline::rag-runtime
config: {}
- provider_id: model-context-protocol
provider_type: remote::model-context-protocol
config: {}
metadata_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
table_name: llamastack_kvstore
inference_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
models:
- metadata:
embedding_dimension: 384
model_id: all-MiniLM-L6-v2
provider_id: sentence-transformers
model_type: embedding
- model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
model_type: llm
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
- toolgroup_id: builtin::rag
provider_id: rag-runtime
server:
port: 8323

View file

@ -40,19 +40,19 @@ spec:
value: "3072" value: "3072"
- name: VLLM_SAFETY_URL - name: VLLM_SAFETY_URL
value: http://vllm-server-safety.default.svc.cluster.local:8001/v1 value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
- name: VLLM_TLS_VERIFY
value: "false"
- name: POSTGRES_HOST - name: POSTGRES_HOST
value: postgres-server.default.svc.cluster.local value: postgres-server.default.svc.cluster.local
- name: POSTGRES_PORT - name: POSTGRES_PORT
value: "5432" value: "5432"
- name: VLLM_TLS_VERIFY
value: "false"
- name: INFERENCE_MODEL - name: INFERENCE_MODEL
value: "${INFERENCE_MODEL}" value: "${INFERENCE_MODEL}"
- name: SAFETY_MODEL - name: SAFETY_MODEL
value: "${SAFETY_MODEL}" value: "${SAFETY_MODEL}"
- name: TAVILY_SEARCH_API_KEY - name: TAVILY_SEARCH_API_KEY
value: "${TAVILY_SEARCH_API_KEY}" value: "${TAVILY_SEARCH_API_KEY}"
command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8321"] command: ["python", "-m", "llama_stack.core.server.server", "--config", "/etc/config/stack_run_config.yaml", "--port", "8321"]
ports: ports:
- containerPort: 8321 - containerPort: 8321
volumeMounts: volumeMounts:

View file

@ -56,12 +56,12 @@ Breaking down the demo app, this section will show the core pieces that are used
### Setup Remote Inferencing ### Setup Remote Inferencing
Start a Llama Stack server on localhost. Here is an example of how you can do this using the fireworks.ai distribution: Start a Llama Stack server on localhost. Here is an example of how you can do this using the fireworks.ai distribution:
``` ```
uv venv starter --python 3.12 python -m venv stack-fireworks
source starter/bin/activate # On Windows: starter\Scripts\activate source stack-fireworks/bin/activate # On Windows: stack-fireworks\Scripts\activate
pip install --no-cache llama-stack==0.2.2 pip install --no-cache llama-stack==0.2.2
llama stack build --distro starter --image-type venv llama stack build --distro fireworks --image-type venv
export FIREWORKS_API_KEY=<SOME_KEY> export FIREWORKS_API_KEY=<SOME_KEY>
llama stack run starter --port 5050 llama stack run fireworks --port 5050
``` ```
Ensure the Llama Stack server version is the same as the Kotlin SDK Library for maximum compatibility. Ensure the Llama Stack server version is the same as the Kotlin SDK Library for maximum compatibility.
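
Once the server from the commands above is running on port 5050, a quick way to confirm it is reachable before wiring up the mobile client is to list the registered models. This is a minimal sketch assuming the `llama-stack-client` Python package is installed; the host and port come from the `--port 5050` flag above.

```python
# Quick connectivity check against the server started above.
# Assumes `pip install llama-stack-client`; host/port are illustrative.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5050")
for model in client.models.list():
    print(model.identifier)
```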

View file

@ -157,7 +157,7 @@ docker run \
If you've set up your local development environment, you can also build the image using your local virtual environment. If you've set up your local development environment, you can also build the image using your local virtual environment.
```bash ```bash
INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
llama stack build --distro nvidia --image-type venv llama stack build --distro nvidia --image-type venv
llama stack run ./run.yaml \ llama stack run ./run.yaml \
--port 8321 \ --port 8321 \

View file

@ -52,16 +52,11 @@ agent = Agent(
prompt = "How do you do great work?" prompt = "How do you do great work?"
print("prompt>", prompt) print("prompt>", prompt)
use_stream = True
response = agent.create_turn( response = agent.create_turn(
messages=[{"role": "user", "content": prompt}], messages=[{"role": "user", "content": prompt}],
session_id=agent.create_session("rag_session"), session_id=agent.create_session("rag_session"),
stream=use_stream, stream=True,
) )
# Only call `AgentEventLogger().log(response)` for streaming responses. for log in AgentEventLogger().log(response):
if use_stream: log.print()
for log in AgentEventLogger().log(response):
log.print()
else:
print(response)

View file

@ -150,7 +150,13 @@ pip install llama-stack-client
``` ```
::: :::
:::{tab-item} Install with `venv`
```bash
python -m venv stack-client
source stack-client/bin/activate # On Windows: stack-client\Scripts\activate
pip install llama-stack-client
```
:::
:::: ::::
Now let's use the `llama-stack-client` [CLI](../references/llama_stack_client_cli_reference.md) to check the Now let's use the `llama-stack-client` [CLI](../references/llama_stack_client_cli_reference.md) to check the

View file

@ -2,15 +2,6 @@
## Overview ## Overview
Agents API for creating and interacting with agentic systems.
Main functionalities provided by this API:
- Create agents with specific instructions and ability to use tools.
- Interactions with agents are grouped into sessions ("threads"), and each interaction is called a "turn".
- Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).
- Agents can be provided with various shields (see the Safety API for more details).
- Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details.
This section contains documentation for all available providers for the **agents** API. This section contains documentation for all available providers for the **agents** API.
## Providers ## Providers

View file

@ -1,21 +0,0 @@
# Batches
## Overview
Protocol for batch processing API operations.
The Batches API enables efficient processing of multiple requests in a single operation,
particularly useful for processing large datasets, batch evaluation workflows, and
cost-effective inference at scale.
Note: This API is currently under active development and may undergo changes.
This section contains documentation for all available providers for the **batches** API.
## Providers
```{toctree}
:maxdepth: 1
inline_reference
```

View file

@ -1,23 +0,0 @@
# inline::reference
## Description
Reference implementation of batches API with KVStore persistence.
## Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Configuration for the key-value store backend. |
| `max_concurrent_batches` | `<class 'int'>` | No | 1 | Maximum number of concurrent batches to process simultaneously. |
| `max_concurrent_requests_per_batch` | `<class 'int'>` | No | 10 | Maximum number of concurrent requests to process per batch. |
## Sample Configuration
```yaml
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/batches.db
```

View file

@ -2,8 +2,6 @@
## Overview ## Overview
Llama Stack Evaluation API for running evaluations on model and agent candidates.
This section contains documentation for all available providers for the **eval** API. This section contains documentation for all available providers for the **eval** API.
## Providers ## Providers

View file

@ -226,7 +226,7 @@ uv init
name = "llama-stack-provider-ollama" name = "llama-stack-provider-ollama"
version = "0.1.0" version = "0.1.0"
description = "Ollama provider for Llama Stack" description = "Ollama provider for Llama Stack"
requires-python = ">=3.12" requires-python = ">=3.10"
dependencies = ["llama-stack", "pydantic", "ollama", "aiohttp"] dependencies = ["llama-stack", "pydantic", "ollama", "aiohttp"]
``` ```

View file

@ -8,7 +8,7 @@ Local filesystem-based file storage provider for managing files and documents lo
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `storage_dir` | `<class 'str'>` | No | | Directory to store uploaded files | | `storage_dir` | `<class 'str'>` | No | PydanticUndefined | Directory to store uploaded files |
| `metadata_store` | `utils.sqlstore.sqlstore.SqliteSqlStoreConfig \| utils.sqlstore.sqlstore.PostgresSqlStoreConfig` | No | sqlite | SQL store configuration for file metadata | | `metadata_store` | `utils.sqlstore.sqlstore.SqliteSqlStoreConfig \| utils.sqlstore.sqlstore.PostgresSqlStoreConfig` | No | sqlite | SQL store configuration for file metadata |
| `ttl_secs` | `<class 'int'>` | No | 31536000 | | | `ttl_secs` | `<class 'int'>` | No | 31536000 | |

View file

@ -2,12 +2,6 @@
## Overview ## Overview
Llama Stack Inference API for generating completions, chat completions, and embeddings.
This API provides the raw interface to the underlying models. Two kinds of models are supported:
- LLM models: these models generate "raw" and "chat" (conversational) completions.
- Embedding models: these models generate embeddings to be used for semantic search.
This section contains documentation for all available providers for the **inference** API. This section contains documentation for all available providers for the **inference** API.
## Providers ## Providers
@ -35,7 +29,6 @@ remote_runpod
remote_sambanova remote_sambanova
remote_tgi remote_tgi
remote_together remote_together
remote_vertexai
remote_vllm remote_vllm
remote_watsonx remote_watsonx
``` ```

View file

@ -8,7 +8,7 @@ HuggingFace Inference Endpoints provider for dedicated model serving.
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `endpoint_name` | `<class 'str'>` | No | | The name of the Hugging Face Inference Endpoint in the format of '{namespace}/{endpoint_name}' (e.g. 'my-cool-org/meta-llama-3-1-8b-instruct-rce'). Namespace is optional and will default to the user account if not provided. | | `endpoint_name` | `<class 'str'>` | No | PydanticUndefined | The name of the Hugging Face Inference Endpoint in the format of '{namespace}/{endpoint_name}' (e.g. 'my-cool-org/meta-llama-3-1-8b-instruct-rce'). Namespace is optional and will default to the user account if not provided. |
| `api_token` | `pydantic.types.SecretStr \| None` | No | | Your Hugging Face user access token (will default to locally saved token if not provided) | | `api_token` | `pydantic.types.SecretStr \| None` | No | | Your Hugging Face user access token (will default to locally saved token if not provided) |
## Sample Configuration ## Sample Configuration

View file

@ -8,7 +8,7 @@ HuggingFace Inference API serverless provider for on-demand model inference.
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `huggingface_repo` | `<class 'str'>` | No | | The model ID of the model on the Hugging Face Hub (e.g. 'meta-llama/Meta-Llama-3.1-70B-Instruct') | | `huggingface_repo` | `<class 'str'>` | No | PydanticUndefined | The model ID of the model on the Hugging Face Hub (e.g. 'meta-llama/Meta-Llama-3.1-70B-Instruct') |
| `api_token` | `pydantic.types.SecretStr \| None` | No | | Your Hugging Face user access token (will default to locally saved token if not provided) | | `api_token` | `pydantic.types.SecretStr \| None` | No | | Your Hugging Face user access token (will default to locally saved token if not provided) |
## Sample Configuration ## Sample Configuration

View file

@ -8,7 +8,7 @@ Text Generation Inference (TGI) provider for HuggingFace model serving.
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `url` | `<class 'str'>` | No | | The URL for the TGI serving endpoint | | `url` | `<class 'str'>` | No | PydanticUndefined | The URL for the TGI serving endpoint |
## Sample Configuration ## Sample Configuration

View file

@ -1,40 +0,0 @@
# remote::vertexai
## Description
Google Vertex AI inference provider enables you to use Google's Gemini models through Google Cloud's Vertex AI platform, providing several advantages:
• Enterprise-grade security: Uses Google Cloud's security controls and IAM
• Better integration: Seamless integration with other Google Cloud services
• Advanced features: Access to additional Vertex AI features like model tuning and monitoring
• Authentication: Uses Google Cloud Application Default Credentials (ADC) instead of API keys
Configuration:
- Set VERTEX_AI_PROJECT environment variable (required)
- Set VERTEX_AI_LOCATION environment variable (optional, defaults to us-central1)
- Use Google Cloud Application Default Credentials or service account key
Authentication Setup:
Option 1 (Recommended): gcloud auth application-default login
Option 2: Set GOOGLE_APPLICATION_CREDENTIALS to service account key path
Available Models:
- vertex_ai/gemini-2.0-flash
- vertex_ai/gemini-2.5-flash
- vertex_ai/gemini-2.5-pro
## Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `project` | `<class 'str'>` | No | | Google Cloud project ID for Vertex AI |
| `location` | `<class 'str'>` | No | us-central1 | Google Cloud location for Vertex AI |
## Sample Configuration
```yaml
project: ${env.VERTEX_AI_PROJECT:=}
location: ${env.VERTEX_AI_LOCATION:=us-central1}
```

View file

@ -27,7 +27,7 @@ HuggingFace-based post-training provider for fine-tuning models using the Huggin
| `dpo_beta` | `<class 'float'>` | No | 0.1 | | | `dpo_beta` | `<class 'float'>` | No | 0.1 | |
| `use_reference_model` | `<class 'bool'>` | No | True | | | `use_reference_model` | `<class 'bool'>` | No | True | |
| `dpo_loss_type` | `Literal['sigmoid', 'hinge', 'ipo', 'kto_pair']` | No | sigmoid | | | `dpo_loss_type` | `Literal['sigmoid', 'hinge', 'ipo', 'kto_pair']` | No | sigmoid | |
| `dpo_output_dir` | `<class 'str'>` | No | | | | `dpo_output_dir` | `<class 'str'>` | No | ./checkpoints/dpo | |
## Sample Configuration ## Sample Configuration
@ -35,7 +35,6 @@ HuggingFace-based post-training provider for fine-tuning models using the Huggin
checkpoint_format: huggingface checkpoint_format: huggingface
distributed_backend: null distributed_backend: null
device: cpu device: cpu
dpo_output_dir: ~/.llama/dummy/dpo_output
``` ```

View file

@ -41,7 +41,7 @@ See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introducti
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `db_path` | `<class 'str'>` | No | | | | `db_path` | `<class 'str'>` | No | PydanticUndefined | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend | | `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend |
## Sample Configuration ## Sample Configuration

View file

@ -12,18 +12,6 @@ That means you'll get fast and efficient vector retrieval.
- Lightweight and easy to use - Lightweight and easy to use
- Fully integrated with Llama Stack - Fully integrated with Llama Stack
- GPU support - GPU support
- **Vector search** - FAISS supports pure vector similarity search using embeddings
## Search Modes
**Supported:**
- **Vector Search** (`mode="vector"`): Performs vector similarity search using embeddings
**Not Supported:**
- **Keyword Search** (`mode="keyword"`): Not supported by FAISS
- **Hybrid Search** (`mode="hybrid"`): Not supported by FAISS
> **Note**: FAISS is designed as a pure vector similarity search library. See the [FAISS GitHub repository](https://github.com/facebookresearch/faiss) for more details about FAISS's core functionality.
## Usage ## Usage

View file

@ -21,7 +21,5 @@ kvstore:
## Deprecation Notice ## Deprecation Notice
```{warning} ⚠️ **Warning**: Please use the `inline::faiss` provider instead.
Please use the `inline::faiss` provider instead.
```

View file

@ -10,7 +10,7 @@ Please refer to the remote provider documentation.
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `db_path` | `<class 'str'>` | No | | | | `db_path` | `<class 'str'>` | No | PydanticUndefined | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) | | `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) |
| `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server | | `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server |

View file

@ -50,7 +50,7 @@ See the [Qdrant documentation](https://qdrant.tech/documentation/) for more deta
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `path` | `<class 'str'>` | No | | | | `path` | `<class 'str'>` | No | PydanticUndefined | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | | | `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | |
## Sample Configuration ## Sample Configuration

View file

@ -205,7 +205,7 @@ See [sqlite-vec's GitHub repo](https://github.com/asg017/sqlite-vec/tree/main) f
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `db_path` | `<class 'str'>` | No | | Path to the SQLite database file | | `db_path` | `<class 'str'>` | No | PydanticUndefined | Path to the SQLite database file |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) | | `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) |
## Sample Configuration ## Sample Configuration

View file

@ -10,7 +10,7 @@ Please refer to the sqlite-vec provider documentation.
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `db_path` | `<class 'str'>` | No | | Path to the SQLite database file | | `db_path` | `<class 'str'>` | No | PydanticUndefined | Path to the SQLite database file |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) | | `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) |
## Sample Configuration ## Sample Configuration
@ -25,7 +25,5 @@ kvstore:
## Deprecation Notice ## Deprecation Notice
```{warning} ⚠️ **Warning**: Please use the `inline::sqlite-vec` provider (notice the hyphen instead of underscore) instead.
Please use the `inline::sqlite-vec` provider (notice the hyphen instead of underscore) instead.
```

View file

@ -40,7 +40,7 @@ See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introducti
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `url` | `str \| None` | No | | | | `url` | `str \| None` | No | PydanticUndefined | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend | | `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend |
## Sample Configuration ## Sample Configuration

View file

@ -11,7 +11,6 @@ That means you're not limited to storing vectors in memory or in a separate serv
- Easy to use - Easy to use
- Fully integrated with Llama Stack - Fully integrated with Llama Stack
- Supports all search modes: vector, keyword, and hybrid search (both inline and remote configurations)
## Usage ## Usage
@ -102,92 +101,6 @@ vector_io:
- **`client_pem_path`**: Path to the **client certificate** file (required for mTLS). - **`client_pem_path`**: Path to the **client certificate** file (required for mTLS).
- **`client_key_path`**: Path to the **client private key** file (required for mTLS). - **`client_key_path`**: Path to the **client private key** file (required for mTLS).
## Search Modes
Milvus supports three different search modes for both inline and remote configurations:
### Vector Search
Vector search uses semantic similarity to find the most relevant chunks based on embedding vectors. This is the default search mode and works well for finding conceptually similar content.
```python
# Vector search example
search_response = client.vector_stores.search(
vector_store_id=vector_store.id,
query="What is machine learning?",
search_mode="vector",
max_num_results=5,
)
```
### Keyword Search
Keyword search uses traditional text-based matching to find chunks containing specific terms or phrases. This is useful when you need exact term matches.
```python
# Keyword search example
search_response = client.vector_stores.search(
vector_store_id=vector_store.id,
query="Python programming language",
search_mode="keyword",
max_num_results=5,
)
```
### Hybrid Search
Hybrid search combines both vector and keyword search methods to provide more comprehensive results. It leverages the strengths of both semantic similarity and exact term matching.
#### Basic Hybrid Search
```python
# Basic hybrid search example (uses RRF ranker with default impact_factor=60.0)
search_response = client.vector_stores.search(
vector_store_id=vector_store.id,
query="neural networks in Python",
search_mode="hybrid",
max_num_results=5,
)
```
**Note**: The default `impact_factor` value of 60.0 was empirically determined to be optimal in the original RRF research paper: ["Reciprocal Rank Fusion outperforms Condorcet and individual Rank Learning Methods"](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) (Cormack et al., 2009).
#### Hybrid Search with RRF (Reciprocal Rank Fusion) Ranker
RRF combines rankings from vector and keyword search by using reciprocal ranks. The impact factor controls how much weight is given to higher-ranked results.
```python
# Hybrid search with custom RRF parameters
search_response = client.vector_stores.search(
vector_store_id=vector_store.id,
query="neural networks in Python",
search_mode="hybrid",
max_num_results=5,
ranking_options={
"ranker": {
"type": "rrf",
"impact_factor": 100.0, # Higher values give more weight to top-ranked results
}
},
)
```
#### Hybrid Search with Weighted Ranker
Weighted ranker linearly combines normalized scores from vector and keyword search. The alpha parameter controls the balance between the two search methods.
```python
# Hybrid search with weighted ranker
search_response = client.vector_stores.search(
vector_store_id=vector_store.id,
query="neural networks in Python",
search_mode="hybrid",
max_num_results=5,
ranking_options={
"ranker": {
"type": "weighted",
"alpha": 0.7, # 70% vector search, 30% keyword search
}
},
)
```
For detailed documentation on RRF and Weighted rankers, please refer to the [Milvus Reranking Guide](https://milvus.io/docs/reranking.md).
## Documentation ## Documentation
See the [Milvus documentation](https://milvus.io/docs/install-overview.md) for more details about Milvus in general. See the [Milvus documentation](https://milvus.io/docs/install-overview.md) for more details about Milvus in general.
@ -198,16 +111,13 @@ For more details on TLS configuration, refer to the [TLS setup guide](https://mi
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `uri` | `<class 'str'>` | No | | The URI of the Milvus server | | `uri` | `<class 'str'>` | No | PydanticUndefined | The URI of the Milvus server |
| `token` | `str \| None` | No | | The token of the Milvus server | | `token` | `str \| None` | No | PydanticUndefined | The token of the Milvus server |
| `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server | | `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend | | `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend |
| `config` | `dict` | No | {} | This configuration allows additional fields to be passed through to the underlying Milvus client. See the [Milvus](https://milvus.io/docs/install-overview.md) documentation for more details about Milvus in general. | | `config` | `dict` | No | {} | This configuration allows additional fields to be passed through to the underlying Milvus client. See the [Milvus](https://milvus.io/docs/install-overview.md) documentation for more details about Milvus in general. |
```{note} > **Note**: This configuration class accepts additional fields beyond those listed above. You can pass any additional configuration options that will be forwarded to the underlying provider.
This configuration class accepts additional fields beyond those listed above. You can pass any additional configuration options that will be forwarded to the underlying provider.
```
## Sample Configuration ## Sample Configuration

View file

@ -19,7 +19,7 @@ You have two ways to install Llama Stack:
cd ~/local cd ~/local
git clone git@github.com:meta-llama/llama-stack.git git clone git@github.com:meta-llama/llama-stack.git
uv venv myenv --python 3.12 python -m venv myenv
source myenv/bin/activate # On Windows: myenv\Scripts\activate source myenv/bin/activate # On Windows: myenv\Scripts\activate
cd llama-stack cd llama-stack
@ -128,9 +128,7 @@ llama download --source huggingface --model-id Prompt-Guard-86M --ignore-pattern
**Important:** Set your environment variable `HF_TOKEN` or pass in `--hf-token` to the command to validate your access. You can find your token at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens). **Important:** Set your environment variable `HF_TOKEN` or pass in `--hf-token` to the command to validate your access. You can find your token at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens).
```{tip} > **Tip:** Default for `llama download` is to run with `--ignore-patterns *.safetensors` since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored.
Default for `llama download` is to run with `--ignore-patterns *.safetensors` since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored.
```
## List the downloaded models ## List the downloaded models

View file

@ -19,7 +19,7 @@ You have two ways to install Llama Stack:
cd ~/local cd ~/local
git clone git@github.com:meta-llama/llama-stack.git git clone git@github.com:meta-llama/llama-stack.git
uv venv myenv --python 3.12 python -m venv myenv
source myenv/bin/activate # On Windows: myenv\Scripts\activate source myenv/bin/activate # On Windows: myenv\Scripts\activate
cd llama-stack cd llama-stack
@ -152,9 +152,7 @@ llama download --source huggingface --model-id Prompt-Guard-86M --ignore-pattern
**Important:** Set your environment variable `HF_TOKEN` or pass in `--hf-token` to the command to validate your access. You can find your token at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens). **Important:** Set your environment variable `HF_TOKEN` or pass in `--hf-token` to the command to validate your access. You can find your token at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens).
```{tip} > **Tip:** Default for `llama download` is to run with `--ignore-patterns *.safetensors` since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored.
Default for `llama download` is to run with `--ignore-patterns *.safetensors` since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored.
```
## List the downloaded models ## List the downloaded models

View file

@ -706,7 +706,6 @@ class Agents(Protocol):
temperature: float | None = None, temperature: float | None = None,
text: OpenAIResponseText | None = None, text: OpenAIResponseText | None = None,
tools: list[OpenAIResponseInputTool] | None = None, tools: list[OpenAIResponseInputTool] | None = None,
include: list[str] | None = None,
max_infer_iters: int | None = 10, # this is an extension to the OpenAI API max_infer_iters: int | None = 10, # this is an extension to the OpenAI API
) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]: ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
"""Create a new OpenAI response. """Create a new OpenAI response.
@ -714,7 +713,6 @@ class Agents(Protocol):
:param input: Input message(s) to create the response. :param input: Input message(s) to create the response.
:param model: The underlying LLM used for completions. :param model: The underlying LLM used for completions.
:param previous_response_id: (Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses. :param previous_response_id: (Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses.
:param include: (Optional) Additional fields to include in the response.
:returns: An OpenAIResponseObject. :returns: An OpenAIResponseObject.
""" """
... ...
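
For reference, the method above backs the OpenAI-compatible Responses route. A hedged sketch of exercising it from a client follows; it assumes a running stack server exposing an OpenAI-compatible API, and the base URL prefix and model name are illustrative rather than taken from this diff.

```python
# Hedged example of calling the Responses route this protocol defines.
# Assumes an OpenAI-compatible client pointed at a running stack server.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/openai/v1", api_key="none")
response = client.responses.create(
    model="meta-llama/Llama-3.2-3B-Instruct",  # illustrative model id
    input="Summarize the Responses API in one sentence.",
)
print(response.output_text)
```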

View file

@ -170,23 +170,6 @@ class OpenAIResponseOutputMessageWebSearchToolCall(BaseModel):
type: Literal["web_search_call"] = "web_search_call" type: Literal["web_search_call"] = "web_search_call"
class OpenAIResponseOutputMessageFileSearchToolCallResults(BaseModel):
"""Search results returned by the file search operation.
:param attributes: (Optional) Key-value attributes associated with the file
:param file_id: Unique identifier of the file containing the result
:param filename: Name of the file containing the result
:param score: Relevance score for this search result (between 0 and 1)
:param text: Text content of the search result
"""
attributes: dict[str, Any]
file_id: str
filename: str
score: float
text: str
@json_schema_type @json_schema_type
class OpenAIResponseOutputMessageFileSearchToolCall(BaseModel): class OpenAIResponseOutputMessageFileSearchToolCall(BaseModel):
"""File search tool call output message for OpenAI responses. """File search tool call output message for OpenAI responses.
@ -202,7 +185,7 @@ class OpenAIResponseOutputMessageFileSearchToolCall(BaseModel):
queries: list[str] queries: list[str]
status: str status: str
type: Literal["file_search_call"] = "file_search_call" type: Literal["file_search_call"] = "file_search_call"
results: list[OpenAIResponseOutputMessageFileSearchToolCallResults] | None = None results: list[dict[str, Any]] | None = None
@json_schema_type @json_schema_type
@ -623,62 +606,6 @@ class OpenAIResponseObjectStreamResponseMcpCallCompleted(BaseModel):
type: Literal["response.mcp_call.completed"] = "response.mcp_call.completed" type: Literal["response.mcp_call.completed"] = "response.mcp_call.completed"
@json_schema_type
class OpenAIResponseContentPartOutputText(BaseModel):
type: Literal["output_text"] = "output_text"
text: str
# TODO: add annotations, logprobs, etc.
@json_schema_type
class OpenAIResponseContentPartRefusal(BaseModel):
type: Literal["refusal"] = "refusal"
refusal: str
OpenAIResponseContentPart = Annotated[
OpenAIResponseContentPartOutputText | OpenAIResponseContentPartRefusal,
Field(discriminator="type"),
]
register_schema(OpenAIResponseContentPart, name="OpenAIResponseContentPart")
@json_schema_type
class OpenAIResponseObjectStreamResponseContentPartAdded(BaseModel):
"""Streaming event for when a new content part is added to a response item.
:param response_id: Unique identifier of the response containing this content
:param item_id: Unique identifier of the output item containing this content part
:param part: The content part that was added
:param sequence_number: Sequential number for ordering streaming events
:param type: Event type identifier, always "response.content_part.added"
"""
response_id: str
item_id: str
part: OpenAIResponseContentPart
sequence_number: int
type: Literal["response.content_part.added"] = "response.content_part.added"
@json_schema_type
class OpenAIResponseObjectStreamResponseContentPartDone(BaseModel):
"""Streaming event for when a content part is completed.
:param response_id: Unique identifier of the response containing this content
:param item_id: Unique identifier of the output item containing this content part
:param part: The completed content part
:param sequence_number: Sequential number for ordering streaming events
:param type: Event type identifier, always "response.content_part.done"
"""
response_id: str
item_id: str
part: OpenAIResponseContentPart
sequence_number: int
type: Literal["response.content_part.done"] = "response.content_part.done"
OpenAIResponseObjectStream = Annotated[ OpenAIResponseObjectStream = Annotated[
OpenAIResponseObjectStreamResponseCreated OpenAIResponseObjectStreamResponseCreated
| OpenAIResponseObjectStreamResponseOutputItemAdded | OpenAIResponseObjectStreamResponseOutputItemAdded
@ -698,8 +625,6 @@ OpenAIResponseObjectStream = Annotated[
| OpenAIResponseObjectStreamResponseMcpCallInProgress | OpenAIResponseObjectStreamResponseMcpCallInProgress
| OpenAIResponseObjectStreamResponseMcpCallFailed | OpenAIResponseObjectStreamResponseMcpCallFailed
| OpenAIResponseObjectStreamResponseMcpCallCompleted | OpenAIResponseObjectStreamResponseMcpCallCompleted
| OpenAIResponseObjectStreamResponseContentPartAdded
| OpenAIResponseObjectStreamResponseContentPartDone
| OpenAIResponseObjectStreamResponseCompleted, | OpenAIResponseObjectStreamResponseCompleted,
Field(discriminator="type"), Field(discriminator="type"),
] ]

View file

@ -1,9 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .batches import Batches, BatchObject, ListBatchesResponse
__all__ = ["Batches", "BatchObject", "ListBatchesResponse"]

View file

@ -1,89 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Literal, Protocol, runtime_checkable
from pydantic import BaseModel, Field
from llama_stack.schema_utils import json_schema_type, webmethod
try:
from openai.types import Batch as BatchObject
except ImportError as e:
raise ImportError("OpenAI package is required for batches API. Please install it with: pip install openai") from e
@json_schema_type
class ListBatchesResponse(BaseModel):
"""Response containing a list of batch objects."""
object: Literal["list"] = "list"
data: list[BatchObject] = Field(..., description="List of batch objects")
first_id: str | None = Field(default=None, description="ID of the first batch in the list")
last_id: str | None = Field(default=None, description="ID of the last batch in the list")
has_more: bool = Field(default=False, description="Whether there are more batches available")
@runtime_checkable
class Batches(Protocol):
"""Protocol for batch processing API operations.
The Batches API enables efficient processing of multiple requests in a single operation,
particularly useful for processing large datasets, batch evaluation workflows, and
cost-effective inference at scale.
Note: This API is currently under active development and may undergo changes.
"""
@webmethod(route="/openai/v1/batches", method="POST")
async def create_batch(
self,
input_file_id: str,
endpoint: str,
completion_window: Literal["24h"],
metadata: dict[str, str] | None = None,
) -> BatchObject:
"""Create a new batch for processing multiple API requests.
:param input_file_id: The ID of an uploaded file containing requests for the batch.
:param endpoint: The endpoint to be used for all requests in the batch.
:param completion_window: The time window within which the batch should be processed.
:param metadata: Optional metadata for the batch.
:returns: The created batch object.
"""
...
@webmethod(route="/openai/v1/batches/{batch_id}", method="GET")
async def retrieve_batch(self, batch_id: str) -> BatchObject:
"""Retrieve information about a specific batch.
:param batch_id: The ID of the batch to retrieve.
:returns: The batch object.
"""
...
@webmethod(route="/openai/v1/batches/{batch_id}/cancel", method="POST")
async def cancel_batch(self, batch_id: str) -> BatchObject:
"""Cancel a batch that is in progress.
:param batch_id: The ID of the batch to cancel.
:returns: The updated batch object.
"""
...
@webmethod(route="/openai/v1/batches", method="GET")
async def list_batches(
self,
after: str | None = None,
limit: int = 20,
) -> ListBatchesResponse:
"""List all batches for the current user.
:param after: A cursor for pagination; returns batches after this batch ID.
:param limit: Number of batches to return (default 20, max 100).
:returns: A list of batch objects.
"""
...
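
Since the routes above mirror OpenAI's Batches API, a hedged usage sketch with an OpenAI-compatible client could look like the following. The endpoint and completion window values come from the protocol above; the base URL, file name, and upload purpose are illustrative assumptions.

```python
# Hedged sketch of driving the batch routes defined above.
# Assumes a running stack server; requests.jsonl holds one request per line.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/openai/v1", api_key="none")

batch_input = client.files.create(file=open("requests.jsonl", "rb"), purpose="batch")
batch = client.batches.create(
    input_file_id=batch_input.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
)
print(client.batches.retrieve(batch.id).status)
```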

View file

@ -10,16 +10,6 @@
# 3. All classes should propagate the inherited __init__ function otherwise via 'super().__init__(message)' # 3. All classes should propagate the inherited __init__ function otherwise via 'super().__init__(message)'
class ResourceNotFoundError(ValueError):
"""generic exception for a missing Llama Stack resource"""
def __init__(self, resource_name: str, resource_type: str, client_list: str) -> None:
message = (
f"{resource_type} '{resource_name}' not found. Use '{client_list}' to list available {resource_type}s."
)
super().__init__(message)
class UnsupportedModelError(ValueError): class UnsupportedModelError(ValueError):
"""raised when model is not present in the list of supported models""" """raised when model is not present in the list of supported models"""
@ -28,32 +18,38 @@ class UnsupportedModelError(ValueError):
super().__init__(message) super().__init__(message)
class ModelNotFoundError(ResourceNotFoundError): class ModelNotFoundError(ValueError):
"""raised when Llama Stack cannot find a referenced model""" """raised when Llama Stack cannot find a referenced model"""
def __init__(self, model_name: str) -> None: def __init__(self, model_name: str) -> None:
super().__init__(model_name, "Model", "client.models.list()") message = f"Model '{model_name}' not found. Use client.models.list() to list available models."
super().__init__(message)
class VectorStoreNotFoundError(ResourceNotFoundError): class VectorStoreNotFoundError(ValueError):
"""raised when Llama Stack cannot find a referenced vector store""" """raised when Llama Stack cannot find a referenced vector store"""
def __init__(self, vector_store_name: str) -> None: def __init__(self, vector_store_name: str) -> None:
super().__init__(vector_store_name, "Vector Store", "client.vector_dbs.list()") message = f"Vector store '{vector_store_name}' not found. Use client.vector_dbs.list() to list available vector stores."
super().__init__(message)
class DatasetNotFoundError(ResourceNotFoundError): class DatasetNotFoundError(ValueError):
"""raised when Llama Stack cannot find a referenced dataset""" """raised when Llama Stack cannot find a referenced dataset"""
def __init__(self, dataset_name: str) -> None: def __init__(self, dataset_name: str) -> None:
super().__init__(dataset_name, "Dataset", "client.datasets.list()") message = f"Dataset '{dataset_name}' not found. Use client.datasets.list() to list available datasets."
super().__init__(message)
class ToolGroupNotFoundError(ResourceNotFoundError): class ToolGroupNotFoundError(ValueError):
"""raised when Llama Stack cannot find a referenced tool group""" """raised when Llama Stack cannot find a referenced tool group"""
def __init__(self, toolgroup_name: str) -> None: def __init__(self, toolgroup_name: str) -> None:
super().__init__(toolgroup_name, "Tool Group", "client.toolgroups.list()") message = (
f"Tool group '{toolgroup_name}' not found. Use client.toolgroups.list() to list available tool groups."
)
super().__init__(message)
class SessionNotFoundError(ValueError): class SessionNotFoundError(ValueError):
@ -62,20 +58,3 @@ class SessionNotFoundError(ValueError):
def __init__(self, session_name: str) -> None: def __init__(self, session_name: str) -> None:
message = f"Session '{session_name}' not found or access denied." message = f"Session '{session_name}' not found or access denied."
super().__init__(message) super().__init__(message)
class ModelTypeError(TypeError):
"""raised when a model is present but not the correct type"""
def __init__(self, model_name: str, model_type: str, expected_model_type: str) -> None:
message = (
f"Model '{model_name}' is of type '{model_type}' rather than the expected type '{expected_model_type}'"
)
super().__init__(message)
class ConflictError(ValueError):
"""raised when an operation cannot be performed due to a conflict with the current state"""
def __init__(self, message: str) -> None:
super().__init__(message)
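
Either shape of this hierarchy is consumed the same way by callers: a provider raises the specific "not found" error and the message tells the user which list command to run. A small illustrative sketch follows; the lookup function here is hypothetical and not part of this diff.

```python
# Illustrative only: how code built on these exceptions typically raises
# and handles them. `resolve_model` is a hypothetical helper.
def resolve_model(model_id: str, registered: dict[str, object]) -> object:
    if model_id not in registered:
        raise ModelNotFoundError(model_id)
    return registered[model_id]


try:
    resolve_model("missing-model", registered={})
except ModelNotFoundError as err:
    print(err)  # e.g. "Model 'missing-model' not found. Use client.models.list() ..."
```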

View file

@ -86,7 +86,6 @@ class Api(Enum, metaclass=DynamicApiMeta):
:cvar inference: Text generation, chat completions, and embeddings :cvar inference: Text generation, chat completions, and embeddings
:cvar safety: Content moderation and safety shields :cvar safety: Content moderation and safety shields
:cvar agents: Agent orchestration and execution :cvar agents: Agent orchestration and execution
:cvar batches: Batch processing for asynchronous API requests
:cvar vector_io: Vector database operations and queries :cvar vector_io: Vector database operations and queries
:cvar datasetio: Dataset input/output operations :cvar datasetio: Dataset input/output operations
:cvar scoring: Model output evaluation and scoring :cvar scoring: Model output evaluation and scoring
@ -109,7 +108,6 @@ class Api(Enum, metaclass=DynamicApiMeta):
inference = "inference" inference = "inference"
safety = "safety" safety = "safety"
agents = "agents" agents = "agents"
batches = "batches"
vector_io = "vector_io" vector_io = "vector_io"
datasetio = "datasetio" datasetio = "datasetio"
scoring = "scoring" scoring = "scoring"

View file

@ -22,7 +22,6 @@ class OpenAIFilePurpose(StrEnum):
""" """
ASSISTANTS = "assistants" ASSISTANTS = "assistants"
BATCH = "batch"
# TODO: Add other purposes as needed # TODO: Add other purposes as needed

View file

@ -15,36 +15,6 @@ from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod from llama_stack.schema_utils import json_schema_type, webmethod
@json_schema_type
class ModerationObjectResults(BaseModel):
"""A moderation object.
:param flagged: Whether any of the below categories are flagged.
:param categories: A list of the categories, and whether they are flagged or not.
:param category_applied_input_types: A list of the categories along with the input type(s) that the score applies to.
:param category_scores: A list of the categories along with their scores as predicted by model.
"""
flagged: bool
categories: dict[str, bool] | None = None
category_applied_input_types: dict[str, list[str]] | None = None
category_scores: dict[str, float] | None = None
user_message: str | None = None
metadata: dict[str, Any] = Field(default_factory=dict)
@json_schema_type
class ModerationObject(BaseModel):
"""A moderation object.
:param id: The unique identifier for the moderation request.
:param model: The model used to generate the moderation results.
:param results: A list of moderation objects
"""
id: str
model: str
results: list[ModerationObjectResults]
@json_schema_type @json_schema_type
class ViolationLevel(Enum): class ViolationLevel(Enum):
"""Severity level of a safety violation. """Severity level of a safety violation.
@ -112,13 +82,3 @@ class Safety(Protocol):
:returns: A RunShieldResponse. :returns: A RunShieldResponse.
""" """
... ...
@webmethod(route="/openai/v1/moderations", method="POST")
async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject:
"""Classifies if text and/or image inputs are potentially harmful.
:param input: Input (or inputs) to classify.
Can be a single string, an array of strings, or an array of multi-modal input objects similar to other models.
:param model: The content moderation model you would like to use.
:returns: A moderation object.
"""
...
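
The `/openai/v1/moderations` route shown here mirrors OpenAI's moderations endpoint. For reference, a hedged sketch of calling it through an OpenAI-compatible client follows; the base URL and the safety model identifier are illustrative assumptions, not values taken from this diff.

```python
# Hedged sketch of the moderation call this route would serve.
# Assumes a running stack server with a registered safety model.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/openai/v1", api_key="none")
moderation = client.moderations.create(
    model="meta-llama/Llama-Guard-3-1B",  # illustrative safety model
    input="Tell me how to pick a lock.",
)
print(moderation.results[0].flagged)
```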

View file

@ -83,11 +83,3 @@ class Shields(Protocol):
:returns: A Shield. :returns: A Shield.
""" """
... ...
@webmethod(route="/shields/{identifier:path}", method="DELETE")
async def unregister_shield(self, identifier: str) -> None:
"""Unregister a shield.
:param identifier: The identifier of the shield to unregister.
"""
...

View file

@ -5,6 +5,7 @@
# the root directory of this source tree. # the root directory of this source tree.
import importlib.resources import importlib.resources
import logging
import sys import sys
from pydantic import BaseModel from pydantic import BaseModel
@ -16,10 +17,9 @@ from llama_stack.core.external import load_external_apis
from llama_stack.core.utils.exec import run_command from llama_stack.core.utils.exec import run_command
from llama_stack.core.utils.image_types import LlamaStackImageType from llama_stack.core.utils.image_types import LlamaStackImageType
from llama_stack.distributions.template import DistributionTemplate from llama_stack.distributions.template import DistributionTemplate
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import Api from llama_stack.providers.datatypes import Api
log = get_logger(name=__name__, category="core") log = logging.getLogger(__name__)
# These are the dependencies needed by the distribution server. # These are the dependencies needed by the distribution server.
# `llama-stack` is automatically installed by the installation script. # `llama-stack` is automatically installed by the installation script.
@ -91,7 +91,7 @@ def get_provider_dependencies(
def print_pip_install_help(config: BuildConfig): def print_pip_install_help(config: BuildConfig):
normal_deps, special_deps, _ = get_provider_dependencies(config) normal_deps, special_deps = get_provider_dependencies(config)
cprint( cprint(
f"Please install needed dependencies using the following commands:\n\nuv pip install {' '.join(normal_deps)}", f"Please install needed dependencies using the following commands:\n\nuv pip install {' '.join(normal_deps)}",

View file

@ -0,0 +1,207 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
LLAMA_STACK_CLIENT_DIR=${LLAMA_STACK_CLIENT_DIR:-}
TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}
PYPI_VERSION=${PYPI_VERSION:-}
# This timeout (in seconds) is necessary when installing PyTorch via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
UV_HTTP_TIMEOUT=${UV_HTTP_TIMEOUT:-500}
set -euo pipefail
# Define color codes
RED='\033[0;31m'
GREEN='\033[0;32m'
NC='\033[0m' # No Color
SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
source "$SCRIPT_DIR/common.sh"
# Usage function
usage() {
echo "Usage: $0 --env-name <conda_env_name> --build-file-path <build_file_path> --normal-deps <pip_dependencies> [--external-provider-deps <external_provider_deps>] [--optional-deps <special_pip_deps>]"
echo "Example: $0 --env-name my-conda-env --build-file-path ./my-stack-build.yaml --normal-deps 'numpy pandas scipy' --external-provider-deps 'foo' --optional-deps 'bar'"
exit 1
}
# Parse arguments
env_name=""
build_file_path=""
normal_deps=""
external_provider_deps=""
optional_deps=""
while [[ $# -gt 0 ]]; do
key="$1"
case "$key" in
--env-name)
if [[ -z "$2" || "$2" == --* ]]; then
echo "Error: --env-name requires a string value" >&2
usage
fi
env_name="$2"
shift 2
;;
--build-file-path)
if [[ -z "$2" || "$2" == --* ]]; then
echo "Error: --build-file-path requires a string value" >&2
usage
fi
build_file_path="$2"
shift 2
;;
--normal-deps)
if [[ -z "$2" || "$2" == --* ]]; then
echo "Error: --normal-deps requires a string value" >&2
usage
fi
normal_deps="$2"
shift 2
;;
--external-provider-deps)
if [[ -z "$2" || "$2" == --* ]]; then
echo "Error: --external-provider-deps requires a string value" >&2
usage
fi
external_provider_deps="$2"
shift 2
;;
--optional-deps)
if [[ -z "$2" || "$2" == --* ]]; then
echo "Error: --optional-deps requires a string value" >&2
usage
fi
optional_deps="$2"
shift 2
;;
*)
echo "Unknown option: $1" >&2
usage
;;
esac
done
# Check required arguments
if [[ -z "$env_name" || -z "$build_file_path" || -z "$normal_deps" ]]; then
echo "Error: --env-name, --build-file-path, and --normal-deps are required." >&2
usage
fi
if [ -n "$LLAMA_STACK_DIR" ]; then
echo "Using llama-stack-dir=$LLAMA_STACK_DIR"
fi
if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
echo "Using llama-stack-client-dir=$LLAMA_STACK_CLIENT_DIR"
fi
ensure_conda_env_python310() {
# Use only global variables set by flag parser
local python_version="3.12"
if ! is_command_available conda; then
printf "${RED}Error: conda command not found. Is Conda installed and in your PATH?${NC}" >&2
exit 1
fi
if conda env list | grep -q "^${env_name} "; then
printf "Conda environment '${env_name}' exists. Checking Python version...\n"
current_version=$(conda run -n "${env_name}" python --version 2>&1 | cut -d' ' -f2 | cut -d'.' -f1,2)
if [ "$current_version" = "$python_version" ]; then
printf "Environment '${env_name}' already has Python ${python_version}. No action needed.\n"
else
printf "Updating environment '${env_name}' to Python ${python_version}...\n"
conda install -n "${env_name}" python="${python_version}" -y
fi
else
printf "Conda environment '${env_name}' does not exist. Creating with Python ${python_version}...\n"
conda create -n "${env_name}" python="${python_version}" -y
fi
eval "$(conda shell.bash hook)"
conda deactivate && conda activate "${env_name}"
"$CONDA_PREFIX"/bin/pip install uv
if [ -n "$TEST_PYPI_VERSION" ]; then
uv pip install fastapi libcst
uv pip install --extra-index-url https://test.pypi.org/simple/ \
llama-stack=="$TEST_PYPI_VERSION" \
"$normal_deps"
if [ -n "$optional_deps" ]; then
IFS='#' read -ra parts <<<"$optional_deps"
for part in "${parts[@]}"; do
echo "$part"
uv pip install $part
done
fi
if [ -n "$external_provider_deps" ]; then
IFS='#' read -ra parts <<<"$external_provider_deps"
for part in "${parts[@]}"; do
echo "$part"
uv pip install "$part"
done
fi
else
if [ -n "$LLAMA_STACK_DIR" ]; then
if [ ! -d "$LLAMA_STACK_DIR" ]; then
printf "${RED}Warning: LLAMA_STACK_DIR is set but directory does not exist: $LLAMA_STACK_DIR${NC}\n" >&2
exit 1
fi
printf "Installing from LLAMA_STACK_DIR: $LLAMA_STACK_DIR\n"
uv pip install --no-cache-dir -e "$LLAMA_STACK_DIR"
else
PYPI_VERSION="${PYPI_VERSION:-}"
if [ -n "$PYPI_VERSION" ]; then
SPEC_VERSION="llama-stack==${PYPI_VERSION}"
else
SPEC_VERSION="llama-stack"
fi
uv pip install --no-cache-dir "$SPEC_VERSION"
fi
if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ]; then
printf "${RED}Warning: LLAMA_STACK_CLIENT_DIR is set but directory does not exist: $LLAMA_STACK_CLIENT_DIR${NC}\n" >&2
exit 1
fi
printf "Installing from LLAMA_STACK_CLIENT_DIR: $LLAMA_STACK_CLIENT_DIR\n"
uv pip install --no-cache-dir -e "$LLAMA_STACK_CLIENT_DIR"
fi
printf "Installing pip dependencies\n"
uv pip install $normal_deps
if [ -n "$optional_deps" ]; then
IFS='#' read -ra parts <<<"$optional_deps"
for part in "${parts[@]}"; do
echo "$part"
uv pip install $part
done
fi
if [ -n "$external_provider_deps" ]; then
IFS='#' read -ra parts <<<"$external_provider_deps"
for part in "${parts[@]}"; do
echo "Getting provider spec for module: $part and installing dependencies"
package_name=$(echo "$part" | sed 's/[<>=!].*//')
python3 -c "
import importlib
import sys
try:
    module = importlib.import_module(f'$package_name.provider')
    spec = module.get_provider_spec()
    if hasattr(spec, 'pip_packages') and spec.pip_packages:
        print('\\n'.join(spec.pip_packages))
except Exception as e:
    print(f'Error getting provider spec for $package_name: {e}', file=sys.stderr)
" | uv pip install -r -
done
fi
fi
mv "$build_file_path" "$CONDA_PREFIX"/llamastack-build.yaml
echo "Build spec configuration saved at $CONDA_PREFIX/llamastack-build.yaml"
}
ensure_conda_env_python310 "$env_name" "$build_file_path" "$normal_deps" "$optional_deps" "$external_provider_deps"

@ -151,37 +151,23 @@ run() {
fi fi
else else
if [ -n "$LLAMA_STACK_DIR" ]; then if [ -n "$LLAMA_STACK_DIR" ]; then
# only warn if DIR does not start with "git+" if [ ! -d "$LLAMA_STACK_DIR" ]; then
if [ ! -d "$LLAMA_STACK_DIR" ] && [[ "$LLAMA_STACK_DIR" != git+* ]]; then
printf "${RED}Warning: LLAMA_STACK_DIR is set but directory does not exist: %s${NC}\n" "$LLAMA_STACK_DIR" >&2 printf "${RED}Warning: LLAMA_STACK_DIR is set but directory does not exist: %s${NC}\n" "$LLAMA_STACK_DIR" >&2
exit 1 exit 1
fi fi
printf "Installing from LLAMA_STACK_DIR: %s\n" "$LLAMA_STACK_DIR" printf "Installing from LLAMA_STACK_DIR: %s\n" "$LLAMA_STACK_DIR"
# editable only if LLAMA_STACK_DIR does not start with "git+" uv pip install --no-cache-dir -e "$LLAMA_STACK_DIR"
if [[ "$LLAMA_STACK_DIR" != git+* ]]; then
EDITABLE="-e"
else
EDITABLE=""
fi
uv pip install --no-cache-dir $EDITABLE "$LLAMA_STACK_DIR"
else else
uv pip install --no-cache-dir llama-stack uv pip install --no-cache-dir llama-stack
fi fi
if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
# only warn if DIR does not start with "git+" if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ]; then
if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ] && [[ "$LLAMA_STACK_CLIENT_DIR" != git+* ]]; then
printf "${RED}Warning: LLAMA_STACK_CLIENT_DIR is set but directory does not exist: %s${NC}\n" "$LLAMA_STACK_CLIENT_DIR" >&2 printf "${RED}Warning: LLAMA_STACK_CLIENT_DIR is set but directory does not exist: %s${NC}\n" "$LLAMA_STACK_CLIENT_DIR" >&2
exit 1 exit 1
fi fi
printf "Installing from LLAMA_STACK_CLIENT_DIR: %s\n" "$LLAMA_STACK_CLIENT_DIR" printf "Installing from LLAMA_STACK_CLIENT_DIR: %s\n" "$LLAMA_STACK_CLIENT_DIR"
# editable only if LLAMA_STACK_CLIENT_DIR does not start with "git+" uv pip install --no-cache-dir -e "$LLAMA_STACK_CLIENT_DIR"
if [[ "$LLAMA_STACK_CLIENT_DIR" != git+* ]]; then
EDITABLE="-e"
else
EDITABLE=""
fi
uv pip install --no-cache-dir $EDITABLE "$LLAMA_STACK_CLIENT_DIR"
fi fi
printf "Installing pip dependencies\n" printf "Installing pip dependencies\n"

@ -3,6 +3,7 @@
# #
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
import logging
import textwrap import textwrap
from typing import Any from typing import Any
@ -20,10 +21,9 @@ from llama_stack.core.stack import cast_image_name_to_string, replace_env_vars
from llama_stack.core.utils.config_dirs import EXTERNAL_PROVIDERS_DIR from llama_stack.core.utils.config_dirs import EXTERNAL_PROVIDERS_DIR
from llama_stack.core.utils.dynamic import instantiate_class_type from llama_stack.core.utils.dynamic import instantiate_class_type
from llama_stack.core.utils.prompt_for_config import prompt_for_config from llama_stack.core.utils.prompt_for_config import prompt_for_config
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import Api, ProviderSpec from llama_stack.providers.datatypes import Api, ProviderSpec
logger = get_logger(name=__name__, category="core") logger = logging.getLogger(__name__)
def configure_single_provider(registry: dict[str, ProviderSpec], provider: Provider) -> Provider: def configure_single_provider(registry: dict[str, ProviderSpec], provider: Provider) -> Provider:

@ -7,7 +7,7 @@
import asyncio import asyncio
import inspect import inspect
import json import json
import logging # allow-direct-logging import logging
import os import os
import sys import sys
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
@ -48,7 +48,6 @@ from llama_stack.core.stack import (
from llama_stack.core.utils.config import redact_sensitive_fields from llama_stack.core.utils.config import redact_sensitive_fields
from llama_stack.core.utils.context import preserve_contexts_async_generator from llama_stack.core.utils.context import preserve_contexts_async_generator
from llama_stack.core.utils.exec import in_notebook from llama_stack.core.utils.exec import in_notebook
from llama_stack.log import get_logger
from llama_stack.providers.utils.telemetry.tracing import ( from llama_stack.providers.utils.telemetry.tracing import (
CURRENT_TRACE_CONTEXT, CURRENT_TRACE_CONTEXT,
end_trace, end_trace,
@ -56,7 +55,7 @@ from llama_stack.providers.utils.telemetry.tracing import (
start_trace, start_trace,
) )
logger = get_logger(name=__name__, category="core") logger = logging.getLogger(__name__)
T = TypeVar("T") T = TypeVar("T")
@ -381,17 +380,8 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
json_content = json.dumps(convert_pydantic_to_json_value(result)) json_content = json.dumps(convert_pydantic_to_json_value(result))
filtered_body = {k: v for k, v in body.items() if not isinstance(v, LibraryClientUploadFile)} filtered_body = {k: v for k, v in body.items() if not isinstance(v, LibraryClientUploadFile)}
status_code = httpx.codes.OK
if options.method.upper() == "DELETE" and result is None:
status_code = httpx.codes.NO_CONTENT
if status_code == httpx.codes.NO_CONTENT:
json_content = ""
mock_response = httpx.Response( mock_response = httpx.Response(
status_code=status_code, status_code=httpx.codes.OK,
content=json_content.encode("utf-8"), content=json_content.encode("utf-8"),
headers={ headers={
"Content-Type": "application/json", "Content-Type": "application/json",

@ -6,15 +6,15 @@
import contextvars import contextvars
import json import json
import logging
from contextlib import AbstractContextManager from contextlib import AbstractContextManager
from typing import Any from typing import Any
from llama_stack.core.datatypes import User from llama_stack.core.datatypes import User
from llama_stack.log import get_logger
from .utils.dynamic import instantiate_class_type from .utils.dynamic import instantiate_class_type
log = get_logger(name=__name__, category="core") log = logging.getLogger(__name__)
# Context variable for request provider data and auth attributes # Context variable for request provider data and auth attributes
PROVIDER_DATA_VAR = contextvars.ContextVar("provider_data", default=None) PROVIDER_DATA_VAR = contextvars.ContextVar("provider_data", default=None)

@ -8,7 +8,6 @@ import inspect
from typing import Any from typing import Any
from llama_stack.apis.agents import Agents from llama_stack.apis.agents import Agents
from llama_stack.apis.batches import Batches
from llama_stack.apis.benchmarks import Benchmarks from llama_stack.apis.benchmarks import Benchmarks
from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets from llama_stack.apis.datasets import Datasets
@ -76,7 +75,6 @@ def api_protocol_map(external_apis: dict[Api, ExternalApiSpec] | None = None) ->
Api.agents: Agents, Api.agents: Agents,
Api.inference: Inference, Api.inference: Inference,
Api.inspect: Inspect, Api.inspect: Inspect,
Api.batches: Batches,
Api.vector_io: VectorIO, Api.vector_io: VectorIO,
Api.vector_dbs: VectorDBs, Api.vector_dbs: VectorDBs,
Api.models: Models, Api.models: Models,

@ -7,7 +7,6 @@
import asyncio import asyncio
import time import time
from collections.abc import AsyncGenerator, AsyncIterator from collections.abc import AsyncGenerator, AsyncIterator
from datetime import UTC, datetime
from typing import Annotated, Any from typing import Annotated, Any
from openai.types.chat import ChatCompletionToolChoiceOptionParam as OpenAIChatCompletionToolChoiceOptionParam from openai.types.chat import ChatCompletionToolChoiceOptionParam as OpenAIChatCompletionToolChoiceOptionParam
@ -18,7 +17,7 @@ from llama_stack.apis.common.content_types import (
InterleavedContent, InterleavedContent,
InterleavedContentItem, InterleavedContentItem,
) )
from llama_stack.apis.common.errors import ModelNotFoundError, ModelTypeError from llama_stack.apis.common.errors import ModelNotFoundError
from llama_stack.apis.inference import ( from llama_stack.apis.inference import (
BatchChatCompletionResponse, BatchChatCompletionResponse,
BatchCompletionResponse, BatchCompletionResponse,
@ -26,21 +25,14 @@ from llama_stack.apis.inference import (
ChatCompletionResponseEventType, ChatCompletionResponseEventType,
ChatCompletionResponseStreamChunk, ChatCompletionResponseStreamChunk,
CompletionMessage, CompletionMessage,
CompletionResponse,
CompletionResponseStreamChunk,
EmbeddingsResponse, EmbeddingsResponse,
EmbeddingTaskType, EmbeddingTaskType,
Inference, Inference,
ListOpenAIChatCompletionResponse, ListOpenAIChatCompletionResponse,
LogProbConfig, LogProbConfig,
Message, Message,
OpenAIAssistantMessageParam,
OpenAIChatCompletion, OpenAIChatCompletion,
OpenAIChatCompletionChunk, OpenAIChatCompletionChunk,
OpenAIChatCompletionToolCall,
OpenAIChatCompletionToolCallFunction,
OpenAIChoice,
OpenAIChoiceLogprobs,
OpenAICompletion, OpenAICompletion,
OpenAICompletionWithInputMessages, OpenAICompletionWithInputMessages,
OpenAIEmbeddingsResponse, OpenAIEmbeddingsResponse,
@ -63,9 +55,10 @@ from llama_stack.models.llama.llama3.chat_format import ChatFormat
from llama_stack.models.llama.llama3.tokenizer import Tokenizer from llama_stack.models.llama.llama3.tokenizer import Tokenizer
from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable
from llama_stack.providers.utils.inference.inference_store import InferenceStore from llama_stack.providers.utils.inference.inference_store import InferenceStore
from llama_stack.providers.utils.inference.stream_utils import stream_and_store_openai_completion
from llama_stack.providers.utils.telemetry.tracing import get_current_span from llama_stack.providers.utils.telemetry.tracing import get_current_span
logger = get_logger(name=__name__, category="inference") logger = get_logger(name=__name__, category="core")
class InferenceRouter(Inference): class InferenceRouter(Inference):
@ -126,7 +119,6 @@ class InferenceRouter(Inference):
if span is None: if span is None:
logger.warning("No span found for token usage metrics") logger.warning("No span found for token usage metrics")
return [] return []
metrics = [ metrics = [
("prompt_tokens", prompt_tokens), ("prompt_tokens", prompt_tokens),
("completion_tokens", completion_tokens), ("completion_tokens", completion_tokens),
@ -140,7 +132,7 @@ class InferenceRouter(Inference):
span_id=span.span_id, span_id=span.span_id,
metric=metric_name, metric=metric_name,
value=value, value=value,
timestamp=datetime.now(UTC), timestamp=time.time(),
unit="tokens", unit="tokens",
attributes={ attributes={
"model_id": model.model_id, "model_id": model.model_id,
@ -177,15 +169,6 @@ class InferenceRouter(Inference):
encoded = self.formatter.encode_content(messages) encoded = self.formatter.encode_content(messages)
return len(encoded.tokens) if encoded and encoded.tokens else 0 return len(encoded.tokens) if encoded and encoded.tokens else 0
async def _get_model(self, model_id: str, expected_model_type: str) -> Model:
"""takes a model id and gets model after ensuring that it is accessible and of the correct type"""
model = await self.routing_table.get_model(model_id)
if model is None:
raise ModelNotFoundError(model_id)
if model.model_type != expected_model_type:
raise ModelTypeError(model_id, model.model_type, expected_model_type)
return model
async def chat_completion( async def chat_completion(
self, self,
model_id: str, model_id: str,
@ -204,7 +187,11 @@ class InferenceRouter(Inference):
) )
if sampling_params is None: if sampling_params is None:
sampling_params = SamplingParams() sampling_params = SamplingParams()
model = await self._get_model(model_id, ModelType.llm) model = await self.routing_table.get_model(model_id)
if model is None:
raise ModelNotFoundError(model_id)
if model.model_type == ModelType.embedding:
raise ValueError(f"Model '{model_id}' is an embedding model and does not support chat completions")
if tool_config: if tool_config:
if tool_choice and tool_choice != tool_config.tool_choice: if tool_choice and tool_choice != tool_config.tool_choice:
raise ValueError("tool_choice and tool_config.tool_choice must match") raise ValueError("tool_choice and tool_config.tool_choice must match")
@ -247,26 +234,49 @@ class InferenceRouter(Inference):
prompt_tokens = await self._count_tokens(messages, tool_config.tool_prompt_format) prompt_tokens = await self._count_tokens(messages, tool_config.tool_prompt_format)
if stream: if stream:
response_stream = await provider.chat_completion(**params)
return self.stream_tokens_and_compute_metrics(
response=response_stream,
prompt_tokens=prompt_tokens,
model=model,
tool_prompt_format=tool_config.tool_prompt_format,
)
response = await provider.chat_completion(**params) async def stream_generator():
metrics = await self.count_tokens_and_compute_metrics( completion_text = ""
response=response, async for chunk in await provider.chat_completion(**params):
prompt_tokens=prompt_tokens, if chunk.event.event_type == ChatCompletionResponseEventType.progress:
model=model, if chunk.event.delta.type == "text":
tool_prompt_format=tool_config.tool_prompt_format, completion_text += chunk.event.delta.text
) if chunk.event.event_type == ChatCompletionResponseEventType.complete:
# these metrics will show up in the client response. completion_tokens = await self._count_tokens(
response.metrics = ( [
metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics CompletionMessage(
) content=completion_text,
return response stop_reason=StopReason.end_of_turn,
)
],
tool_config.tool_prompt_format,
)
total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
metrics = await self._compute_and_log_token_usage(
prompt_tokens or 0,
completion_tokens or 0,
total_tokens,
model,
)
chunk.metrics = metrics if chunk.metrics is None else chunk.metrics + metrics
yield chunk
return stream_generator()
else:
response = await provider.chat_completion(**params)
completion_tokens = await self._count_tokens(
[response.completion_message],
tool_config.tool_prompt_format,
)
total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
metrics = await self._compute_and_log_token_usage(
prompt_tokens or 0,
completion_tokens or 0,
total_tokens,
model,
)
response.metrics = metrics if response.metrics is None else response.metrics + metrics
return response
async def batch_chat_completion( async def batch_chat_completion(
self, self,
@ -306,7 +316,11 @@ class InferenceRouter(Inference):
logger.debug( logger.debug(
f"InferenceRouter.completion: {model_id=}, {stream=}, {content=}, {sampling_params=}, {response_format=}", f"InferenceRouter.completion: {model_id=}, {stream=}, {content=}, {sampling_params=}, {response_format=}",
) )
model = await self._get_model(model_id, ModelType.llm) model = await self.routing_table.get_model(model_id)
if model is None:
raise ModelNotFoundError(model_id)
if model.model_type == ModelType.embedding:
raise ValueError(f"Model '{model_id}' is an embedding model and does not support chat completions")
provider = await self.routing_table.get_provider_impl(model_id) provider = await self.routing_table.get_provider_impl(model_id)
params = dict( params = dict(
model_id=model_id, model_id=model_id,
@ -318,20 +332,39 @@ class InferenceRouter(Inference):
) )
prompt_tokens = await self._count_tokens(content) prompt_tokens = await self._count_tokens(content)
response = await provider.completion(**params)
if stream: if stream:
return self.stream_tokens_and_compute_metrics(
response=response, async def stream_generator():
prompt_tokens=prompt_tokens, completion_text = ""
model=model, async for chunk in await provider.completion(**params):
if hasattr(chunk, "delta"):
completion_text += chunk.delta
if hasattr(chunk, "stop_reason") and chunk.stop_reason and self.telemetry:
completion_tokens = await self._count_tokens(completion_text)
total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
metrics = await self._compute_and_log_token_usage(
prompt_tokens or 0,
completion_tokens or 0,
total_tokens,
model,
)
chunk.metrics = metrics if chunk.metrics is None else chunk.metrics + metrics
yield chunk
return stream_generator()
else:
response = await provider.completion(**params)
completion_tokens = await self._count_tokens(response.content)
total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
metrics = await self._compute_and_log_token_usage(
prompt_tokens or 0,
completion_tokens or 0,
total_tokens,
model,
) )
response.metrics = metrics if response.metrics is None else response.metrics + metrics
metrics = await self.count_tokens_and_compute_metrics( return response
response=response, prompt_tokens=prompt_tokens, model=model
)
response.metrics = metrics if response.metrics is None else response.metrics + metrics
return response
async def batch_completion( async def batch_completion(
self, self,
@ -356,7 +389,11 @@ class InferenceRouter(Inference):
task_type: EmbeddingTaskType | None = None, task_type: EmbeddingTaskType | None = None,
) -> EmbeddingsResponse: ) -> EmbeddingsResponse:
logger.debug(f"InferenceRouter.embeddings: {model_id}") logger.debug(f"InferenceRouter.embeddings: {model_id}")
await self._get_model(model_id, ModelType.embedding) model = await self.routing_table.get_model(model_id)
if model is None:
raise ModelNotFoundError(model_id)
if model.model_type == ModelType.llm:
raise ValueError(f"Model '{model_id}' is an LLM model and does not support embeddings")
provider = await self.routing_table.get_provider_impl(model_id) provider = await self.routing_table.get_provider_impl(model_id)
return await provider.embeddings( return await provider.embeddings(
model_id=model_id, model_id=model_id,
@ -392,7 +429,12 @@ class InferenceRouter(Inference):
logger.debug( logger.debug(
f"InferenceRouter.openai_completion: {model=}, {stream=}, {prompt=}", f"InferenceRouter.openai_completion: {model=}, {stream=}, {prompt=}",
) )
model_obj = await self._get_model(model, ModelType.llm) model_obj = await self.routing_table.get_model(model)
if model_obj is None:
raise ModelNotFoundError(model)
if model_obj.model_type == ModelType.embedding:
raise ValueError(f"Model '{model}' is an embedding model and does not support completions")
params = dict( params = dict(
model=model_obj.identifier, model=model_obj.identifier,
prompt=prompt, prompt=prompt,
@ -415,29 +457,9 @@ class InferenceRouter(Inference):
prompt_logprobs=prompt_logprobs, prompt_logprobs=prompt_logprobs,
suffix=suffix, suffix=suffix,
) )
provider = await self.routing_table.get_provider_impl(model_obj.identifier) provider = await self.routing_table.get_provider_impl(model_obj.identifier)
if stream: return await provider.openai_completion(**params)
return await provider.openai_completion(**params)
# TODO: Metrics do NOT work with openai_completion stream=True because we do not return an
# AsyncIterator; our tests expect a stream of chunks that we cannot currently intercept.
# response_stream = await provider.openai_completion(**params)
response = await provider.openai_completion(**params)
if self.telemetry:
metrics = self._construct_metrics(
prompt_tokens=response.usage.prompt_tokens,
completion_tokens=response.usage.completion_tokens,
total_tokens=response.usage.total_tokens,
model=model_obj,
)
for metric in metrics:
await self.telemetry.log_event(metric)
# these metrics will show up in the client response.
response.metrics = (
metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics
)
return response
async def openai_chat_completion( async def openai_chat_completion(
self, self,
@ -468,7 +490,11 @@ class InferenceRouter(Inference):
logger.debug( logger.debug(
f"InferenceRouter.openai_chat_completion: {model=}, {stream=}, {messages=}", f"InferenceRouter.openai_chat_completion: {model=}, {stream=}, {messages=}",
) )
model_obj = await self._get_model(model, ModelType.llm) model_obj = await self.routing_table.get_model(model)
if model_obj is None:
raise ModelNotFoundError(model)
if model_obj.model_type == ModelType.embedding:
raise ValueError(f"Model '{model}' is an embedding model and does not support chat completions")
# Use the OpenAI client for a bit of extra input validation without # Use the OpenAI client for a bit of extra input validation without
# exposing the OpenAI client itself as part of our API surface # exposing the OpenAI client itself as part of our API surface
@ -511,38 +537,18 @@ class InferenceRouter(Inference):
top_p=top_p, top_p=top_p,
user=user, user=user,
) )
provider = await self.routing_table.get_provider_impl(model_obj.identifier) provider = await self.routing_table.get_provider_impl(model_obj.identifier)
if stream: if stream:
response_stream = await provider.openai_chat_completion(**params) response_stream = await provider.openai_chat_completion(**params)
if self.store:
# For streaming, the provider returns AsyncIterator[OpenAIChatCompletionChunk] return stream_and_store_openai_completion(response_stream, model, self.store, messages)
# We need to add metrics to each chunk and store the final completion return response_stream
return self.stream_tokens_and_compute_metrics_openai_chat( else:
response=response_stream, response = await self._nonstream_openai_chat_completion(provider, params)
model=model_obj, if self.store:
messages=messages, await self.store.store_chat_completion(response, messages)
) return response
response = await self._nonstream_openai_chat_completion(provider, params)
# Store the response with the ID that will be returned to the client
if self.store:
await self.store.store_chat_completion(response, messages)
if self.telemetry:
metrics = self._construct_metrics(
prompt_tokens=response.usage.prompt_tokens,
completion_tokens=response.usage.completion_tokens,
total_tokens=response.usage.total_tokens,
model=model_obj,
)
for metric in metrics:
await self.telemetry.log_event(metric)
# these metrics will show up in the client response.
response.metrics = (
metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics
)
return response
async def openai_embeddings( async def openai_embeddings(
self, self,
@ -555,7 +561,12 @@ class InferenceRouter(Inference):
logger.debug( logger.debug(
f"InferenceRouter.openai_embeddings: {model=}, input_type={type(input)}, {encoding_format=}, {dimensions=}", f"InferenceRouter.openai_embeddings: {model=}, input_type={type(input)}, {encoding_format=}, {dimensions=}",
) )
model_obj = await self._get_model(model, ModelType.embedding) model_obj = await self.routing_table.get_model(model)
if model_obj is None:
raise ModelNotFoundError(model)
if model_obj.model_type != ModelType.embedding:
raise ValueError(f"Model '{model}' is not an embedding model")
params = dict( params = dict(
model=model_obj.identifier, model=model_obj.identifier,
input=input, input=input,
@ -614,245 +625,3 @@ class InferenceRouter(Inference):
status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}" status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}"
) )
return health_statuses return health_statuses
async def stream_tokens_and_compute_metrics(
self,
response,
prompt_tokens,
model,
tool_prompt_format: ToolPromptFormat | None = None,
) -> AsyncGenerator[ChatCompletionResponseStreamChunk, None] | AsyncGenerator[CompletionResponseStreamChunk, None]:
completion_text = ""
async for chunk in response:
complete = False
if hasattr(chunk, "event"): # only ChatCompletions have .event
if chunk.event.event_type == ChatCompletionResponseEventType.progress:
if chunk.event.delta.type == "text":
completion_text += chunk.event.delta.text
if chunk.event.event_type == ChatCompletionResponseEventType.complete:
complete = True
completion_tokens = await self._count_tokens(
[
CompletionMessage(
content=completion_text,
stop_reason=StopReason.end_of_turn,
)
],
tool_prompt_format=tool_prompt_format,
)
else:
if hasattr(chunk, "delta"):
completion_text += chunk.delta
if hasattr(chunk, "stop_reason") and chunk.stop_reason and self.telemetry:
complete = True
completion_tokens = await self._count_tokens(completion_text)
# if we are done receiving tokens
if complete:
total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
# Create a separate span for streaming completion metrics
if self.telemetry:
# Log metrics in the new span context
completion_metrics = self._construct_metrics(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=total_tokens,
model=model,
)
for metric in completion_metrics:
if metric.metric in [
"completion_tokens",
"total_tokens",
]: # Only log completion and total tokens
await self.telemetry.log_event(metric)
# Return metrics in response
async_metrics = [
MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics
]
chunk.metrics = async_metrics if chunk.metrics is None else chunk.metrics + async_metrics
else:
# Fallback if no telemetry
completion_metrics = self._construct_metrics(
prompt_tokens or 0,
completion_tokens or 0,
total_tokens,
model,
)
async_metrics = [
MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics
]
chunk.metrics = async_metrics if chunk.metrics is None else chunk.metrics + async_metrics
yield chunk
async def count_tokens_and_compute_metrics(
self,
response: ChatCompletionResponse | CompletionResponse,
prompt_tokens,
model,
tool_prompt_format: ToolPromptFormat | None = None,
):
if isinstance(response, ChatCompletionResponse):
content = [response.completion_message]
else:
content = response.content
completion_tokens = await self._count_tokens(messages=content, tool_prompt_format=tool_prompt_format)
total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
# Create a separate span for completion metrics
if self.telemetry:
# Log metrics in the new span context
completion_metrics = self._construct_metrics(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=total_tokens,
model=model,
)
for metric in completion_metrics:
if metric.metric in ["completion_tokens", "total_tokens"]: # Only log completion and total tokens
await self.telemetry.log_event(metric)
# Return metrics in response
return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics]
# Fallback if no telemetry
metrics = self._construct_metrics(
prompt_tokens or 0,
completion_tokens or 0,
total_tokens,
model,
)
return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in metrics]
async def stream_tokens_and_compute_metrics_openai_chat(
self,
response: AsyncIterator[OpenAIChatCompletionChunk],
model: Model,
messages: list[OpenAIMessageParam] | None = None,
) -> AsyncIterator[OpenAIChatCompletionChunk]:
"""Stream OpenAI chat completion chunks, compute metrics, and store the final completion."""
id = None
created = None
choices_data: dict[int, dict[str, Any]] = {}
try:
async for chunk in response:
# Skip None chunks
if chunk is None:
continue
# Capture ID and created timestamp from first chunk
if id is None and chunk.id:
id = chunk.id
if created is None and chunk.created:
created = chunk.created
# Accumulate choice data for final assembly
if chunk.choices:
for choice_delta in chunk.choices:
idx = choice_delta.index
if idx not in choices_data:
choices_data[idx] = {
"content_parts": [],
"tool_calls_builder": {},
"finish_reason": None,
"logprobs_content_parts": [],
}
current_choice_data = choices_data[idx]
if choice_delta.delta:
delta = choice_delta.delta
if delta.content:
current_choice_data["content_parts"].append(delta.content)
if delta.tool_calls:
for tool_call_delta in delta.tool_calls:
tc_idx = tool_call_delta.index
if tc_idx not in current_choice_data["tool_calls_builder"]:
current_choice_data["tool_calls_builder"][tc_idx] = {
"id": None,
"type": "function",
"function_name_parts": [],
"function_arguments_parts": [],
}
builder = current_choice_data["tool_calls_builder"][tc_idx]
if tool_call_delta.id:
builder["id"] = tool_call_delta.id
if tool_call_delta.type:
builder["type"] = tool_call_delta.type
if tool_call_delta.function:
if tool_call_delta.function.name:
builder["function_name_parts"].append(tool_call_delta.function.name)
if tool_call_delta.function.arguments:
builder["function_arguments_parts"].append(
tool_call_delta.function.arguments
)
if choice_delta.finish_reason:
current_choice_data["finish_reason"] = choice_delta.finish_reason
if choice_delta.logprobs and choice_delta.logprobs.content:
current_choice_data["logprobs_content_parts"].extend(choice_delta.logprobs.content)
# Compute metrics on final chunk
if chunk.choices and chunk.choices[0].finish_reason:
completion_text = ""
for choice_data in choices_data.values():
completion_text += "".join(choice_data["content_parts"])
# Add metrics to the chunk
if self.telemetry and chunk.usage:
metrics = self._construct_metrics(
prompt_tokens=chunk.usage.prompt_tokens,
completion_tokens=chunk.usage.completion_tokens,
total_tokens=chunk.usage.total_tokens,
model=model,
)
for metric in metrics:
await self.telemetry.log_event(metric)
yield chunk
finally:
# Store the final assembled completion
if id and self.store and messages:
assembled_choices: list[OpenAIChoice] = []
for choice_idx, choice_data in choices_data.items():
content_str = "".join(choice_data["content_parts"])
assembled_tool_calls: list[OpenAIChatCompletionToolCall] = []
if choice_data["tool_calls_builder"]:
for tc_build_data in choice_data["tool_calls_builder"].values():
if tc_build_data["id"]:
func_name = "".join(tc_build_data["function_name_parts"])
func_args = "".join(tc_build_data["function_arguments_parts"])
assembled_tool_calls.append(
OpenAIChatCompletionToolCall(
id=tc_build_data["id"],
type=tc_build_data["type"],
function=OpenAIChatCompletionToolCallFunction(
name=func_name, arguments=func_args
),
)
)
message = OpenAIAssistantMessageParam(
role="assistant",
content=content_str if content_str else None,
tool_calls=assembled_tool_calls if assembled_tool_calls else None,
)
logprobs_content = choice_data["logprobs_content_parts"]
final_logprobs = OpenAIChoiceLogprobs(content=logprobs_content) if logprobs_content else None
assembled_choices.append(
OpenAIChoice(
finish_reason=choice_data["finish_reason"],
index=choice_idx,
message=message,
logprobs=final_logprobs,
)
)
final_response = OpenAIChatCompletion(
id=id,
choices=assembled_choices,
created=created or int(time.time()),
model=model.identifier,
object="chat.completion",
)
logger.debug(f"InferenceRouter.completion_response: {final_response}")
await self.store.store_chat_completion(final_response, messages)

@ -6,9 +6,10 @@
from typing import Any from typing import Any
from llama_stack.apis.inference import Message from llama_stack.apis.inference import (
Message,
)
from llama_stack.apis.safety import RunShieldResponse, Safety from llama_stack.apis.safety import RunShieldResponse, Safety
from llama_stack.apis.safety.safety import ModerationObject
from llama_stack.apis.shields import Shield from llama_stack.apis.shields import Shield
from llama_stack.log import get_logger from llama_stack.log import get_logger
from llama_stack.providers.datatypes import RoutingTable from llama_stack.providers.datatypes import RoutingTable
@ -42,10 +43,6 @@ class SafetyRouter(Safety):
logger.debug(f"SafetyRouter.register_shield: {shield_id}") logger.debug(f"SafetyRouter.register_shield: {shield_id}")
return await self.routing_table.register_shield(shield_id, provider_shield_id, provider_id, params) return await self.routing_table.register_shield(shield_id, provider_shield_id, provider_id, params)
async def unregister_shield(self, identifier: str) -> None:
logger.debug(f"SafetyRouter.unregister_shield: {identifier}")
return await self.routing_table.unregister_shield(identifier)
async def run_shield( async def run_shield(
self, self,
shield_id: str, shield_id: str,
@ -59,27 +56,3 @@ class SafetyRouter(Safety):
messages=messages, messages=messages,
params=params, params=params,
) )
async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject:
async def get_shield_id(self, model: str) -> str:
"""Get Shield id from model (provider_resource_id) of shield."""
list_shields_response = await self.routing_table.list_shields()
matches = [s.identifier for s in list_shields_response.data if model == s.provider_resource_id]
if not matches:
raise ValueError(f"No shield associated with provider_resource id {model}")
if len(matches) > 1:
raise ValueError(f"Multiple shields associated with provider_resource id {model}")
return matches[0]
shield_id = await get_shield_id(self, model)
logger.debug(f"SafetyRouter.run_moderation: {shield_id}")
provider = await self.routing_table.get_provider_impl(shield_id)
response = await provider.run_moderation(
input=input,
model=model,
)
return response

@ -60,8 +60,6 @@ async def unregister_object_from_provider(obj: RoutableObject, p: Any) -> None:
return await p.unregister_vector_db(obj.identifier) return await p.unregister_vector_db(obj.identifier)
elif api == Api.inference: elif api == Api.inference:
return await p.unregister_model(obj.identifier) return await p.unregister_model(obj.identifier)
elif api == Api.safety:
return await p.unregister_shield(obj.identifier)
elif api == Api.datasetio: elif api == Api.datasetio:
return await p.unregister_dataset(obj.identifier) return await p.unregister_dataset(obj.identifier)
elif api == Api.tool_runtime: elif api == Api.tool_runtime:

@ -63,8 +63,6 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):
async def get_provider_impl(self, model_id: str) -> Any: async def get_provider_impl(self, model_id: str) -> Any:
model = await lookup_model(self, model_id) model = await lookup_model(self, model_id)
if model.provider_id not in self.impls_by_provider_id:
raise ValueError(f"Provider {model.provider_id} not found in the routing table")
return self.impls_by_provider_id[model.provider_id] return self.impls_by_provider_id[model.provider_id]
async def register_model( async def register_model(

@ -55,7 +55,3 @@ class ShieldsRoutingTable(CommonRoutingTableImpl, Shields):
) )
await self.register_object(shield) await self.register_object(shield)
return shield return shield
async def unregister_shield(self, identifier: str) -> None:
existing_shield = await self.get_shield(identifier)
await self.unregister_object(existing_shield)

@ -124,7 +124,10 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
return toolgroup return toolgroup
async def unregister_toolgroup(self, toolgroup_id: str) -> None: async def unregister_toolgroup(self, toolgroup_id: str) -> None:
await self.unregister_object(await self.get_tool_group(toolgroup_id)) tool_group = await self.get_tool_group(toolgroup_id)
if tool_group is None:
raise ToolGroupNotFoundError(toolgroup_id)
await self.unregister_object(tool_group)
async def shutdown(self) -> None: async def shutdown(self) -> None:
pass pass

@ -8,7 +8,7 @@ from typing import Any
from pydantic import TypeAdapter from pydantic import TypeAdapter
from llama_stack.apis.common.errors import ModelNotFoundError, ModelTypeError, VectorStoreNotFoundError from llama_stack.apis.common.errors import ModelNotFoundError, VectorStoreNotFoundError
from llama_stack.apis.models import ModelType from llama_stack.apis.models import ModelType
from llama_stack.apis.resource import ResourceType from llama_stack.apis.resource import ResourceType
from llama_stack.apis.vector_dbs import ListVectorDBsResponse, VectorDB, VectorDBs from llama_stack.apis.vector_dbs import ListVectorDBsResponse, VectorDB, VectorDBs
@ -66,7 +66,7 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
if model is None: if model is None:
raise ModelNotFoundError(embedding_model) raise ModelNotFoundError(embedding_model)
if model.model_type != ModelType.embedding: if model.model_type != ModelType.embedding:
raise ModelTypeError(embedding_model, model.model_type, ModelType.embedding) raise ValueError(f"Model {embedding_model} is not an embedding model")
if "embedding_dimension" not in model.metadata: if "embedding_dimension" not in model.metadata:
raise ValueError(f"Model {embedding_model} does not have an embedding dimension") raise ValueError(f"Model {embedding_model} does not have an embedding dimension")
vector_db_data = { vector_db_data = {

@ -9,7 +9,7 @@ import asyncio
import functools import functools
import inspect import inspect
import json import json
import logging # allow-direct-logging import logging
import os import os
import ssl import ssl
import sys import sys
@ -21,18 +21,16 @@ from importlib.metadata import version as parse_version
from pathlib import Path from pathlib import Path
from typing import Annotated, Any, get_origin from typing import Annotated, Any, get_origin
import httpx
import rich.pretty import rich.pretty
import yaml import yaml
from aiohttp import hdrs from aiohttp import hdrs
from fastapi import Body, FastAPI, HTTPException, Request, Response from fastapi import Body, FastAPI, HTTPException, Request
from fastapi import Path as FastapiPath from fastapi import Path as FastapiPath
from fastapi.exceptions import RequestValidationError from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse, StreamingResponse from fastapi.responses import JSONResponse, StreamingResponse
from openai import BadRequestError from openai import BadRequestError
from pydantic import BaseModel, ValidationError from pydantic import BaseModel, ValidationError
from llama_stack.apis.common.errors import ConflictError, ResourceNotFoundError
from llama_stack.apis.common.responses import PaginatedResponse from llama_stack.apis.common.responses import PaginatedResponse
from llama_stack.cli.utils import add_config_distro_args, get_config_from_args from llama_stack.cli.utils import add_config_distro_args, get_config_from_args
from llama_stack.core.access_control.access_control import AccessDeniedError from llama_stack.core.access_control.access_control import AccessDeniedError
@ -117,7 +115,7 @@ def translate_exception(exc: Exception) -> HTTPException | RequestValidationErro
if isinstance(exc, RequestValidationError): if isinstance(exc, RequestValidationError):
return HTTPException( return HTTPException(
status_code=httpx.codes.BAD_REQUEST, status_code=400,
detail={ detail={
"errors": [ "errors": [
{ {
@ -129,25 +127,21 @@ def translate_exception(exc: Exception) -> HTTPException | RequestValidationErro
] ]
}, },
) )
elif isinstance(exc, ConflictError):
return HTTPException(status_code=409, detail=str(exc))
elif isinstance(exc, ResourceNotFoundError):
return HTTPException(status_code=404, detail=str(exc))
elif isinstance(exc, ValueError): elif isinstance(exc, ValueError):
return HTTPException(status_code=httpx.codes.BAD_REQUEST, detail=f"Invalid value: {str(exc)}") return HTTPException(status_code=400, detail=f"Invalid value: {str(exc)}")
elif isinstance(exc, BadRequestError): elif isinstance(exc, BadRequestError):
return HTTPException(status_code=httpx.codes.BAD_REQUEST, detail=str(exc)) return HTTPException(status_code=400, detail=str(exc))
elif isinstance(exc, PermissionError | AccessDeniedError): elif isinstance(exc, PermissionError | AccessDeniedError):
return HTTPException(status_code=httpx.codes.FORBIDDEN, detail=f"Permission denied: {str(exc)}") return HTTPException(status_code=403, detail=f"Permission denied: {str(exc)}")
elif isinstance(exc, asyncio.TimeoutError | TimeoutError): elif isinstance(exc, asyncio.TimeoutError | TimeoutError):
return HTTPException(status_code=httpx.codes.GATEWAY_TIMEOUT, detail=f"Operation timed out: {str(exc)}") return HTTPException(status_code=504, detail=f"Operation timed out: {str(exc)}")
elif isinstance(exc, NotImplementedError): elif isinstance(exc, NotImplementedError):
return HTTPException(status_code=httpx.codes.NOT_IMPLEMENTED, detail=f"Not implemented: {str(exc)}") return HTTPException(status_code=501, detail=f"Not implemented: {str(exc)}")
elif isinstance(exc, AuthenticationRequiredError): elif isinstance(exc, AuthenticationRequiredError):
return HTTPException(status_code=httpx.codes.UNAUTHORIZED, detail=f"Authentication required: {str(exc)}") return HTTPException(status_code=401, detail=f"Authentication required: {str(exc)}")
else: else:
return HTTPException( return HTTPException(
status_code=httpx.codes.INTERNAL_SERVER_ERROR, status_code=500,
detail="Internal server error: An unexpected error occurred.", detail="Internal server error: An unexpected error occurred.",
) )
@ -186,6 +180,7 @@ async def sse_generator(event_gen_coroutine):
event_gen = await event_gen_coroutine event_gen = await event_gen_coroutine
async for item in event_gen: async for item in event_gen:
yield create_sse_event(item) yield create_sse_event(item)
await asyncio.sleep(0.01)
except asyncio.CancelledError: except asyncio.CancelledError:
logger.info("Generator cancelled") logger.info("Generator cancelled")
if event_gen: if event_gen:
@ -241,10 +236,6 @@ def create_dynamic_typed_route(func: Any, method: str, route: str) -> Callable:
result = await maybe_await(value) result = await maybe_await(value)
if isinstance(result, PaginatedResponse) and result.url is None: if isinstance(result, PaginatedResponse) and result.url is None:
result.url = route result.url = route
if method.upper() == "DELETE" and result is None:
return Response(status_code=httpx.codes.NO_CONTENT)
return result return result
except Exception as e: except Exception as e:
if logger.isEnabledFor(logging.DEBUG): if logger.isEnabledFor(logging.DEBUG):
@ -361,7 +352,7 @@ class ClientVersionMiddleware:
await send( await send(
{ {
"type": "http.response.start", "type": "http.response.start",
"status": httpx.codes.UPGRADE_REQUIRED, "status": 426,
"headers": [[b"content-type", b"application/json"]], "headers": [[b"content-type", b"application/json"]],
} }
) )

@ -4,7 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
import importlib import logging
import os import os
import signal import signal
import subprocess import subprocess
@ -12,9 +12,9 @@ import sys
from termcolor import cprint from termcolor import cprint
from llama_stack.log import get_logger log = logging.getLogger(__name__)
log = get_logger(name=__name__, category="core") import importlib
def formulate_run_args(image_type: str, image_name: str) -> list: def formulate_run_args(image_type: str, image_name: str) -> list:

Some files were not shown because too many files have changed in this diff.