Release candidate 0.2.17rc1

2025-08-21 09:23:13 +00:00 · 2025-08-05 01:31:58 +00:00
386 changed files with 22764 additions and 24955 deletions
--- a/.github/TRIAGERS.md
+++ b/.github/TRIAGERS.md
@ -1,2 +1,2 @@
 # This file documents Triage members in the Llama Stack community
- @franciscojavierarceo
+ @bbrowning @franciscojavierarceo @leseb
--- a/.github/actions/run-and-record-tests/action.yml
+++ b/.github/actions/run-and-record-tests/action.yml
@ -2,13 +2,9 @@ name: 'Run and Record Tests'
 description: 'Run integration tests and handle recording/artifact upload'

 inputs:
-  test-subdirs:
-    description: 'Comma-separated list of test subdirectories to run'
+  test-types:
+    description: 'JSON array of test types to run'
    required: true
-  test-pattern:
-    description: 'Regex pattern to pass to pytest -k'
-    required: false
-    default: ''
  stack-config:
    description: 'Stack configuration to use'
    required: true
@ -36,14 +32,12 @@ runs:
    - name: Run Integration Tests
      shell: bash
      run: |
-        uv run --no-sync ./scripts/integration-tests.sh \
+        ./scripts/integration-tests.sh \
          --stack-config '${{ inputs.stack-config }}' \
          --provider '${{ inputs.provider }}' \
-          --test-subdirs '${{ inputs.test-subdirs }}' \
-          --test-pattern '${{ inputs.test-pattern }}' \
+          --test-types '${{ inputs.test-types }}' \
          --inference-mode '${{ inputs.inference-mode }}' \
-          ${{ inputs.run-vision-tests == 'true' && '--run-vision-tests' || '' }} \
-          | tee pytest-${{ inputs.inference-mode }}.log
+          ${{ inputs.run-vision-tests == 'true' && '--run-vision-tests' || '' }}


    - name: Commit and push recordings
@ -63,10 +57,10 @@ runs:
            git commit -m "Recordings update from CI"
          fi

-          git fetch origin ${{ github.ref_name }}
-          git rebase origin/${{ github.ref_name }}
+          git fetch origin ${{ github.event.pull_request.head.ref }}
+          git rebase origin/${{ github.event.pull_request.head.ref }}
          echo "Rebased successfully"
-          git push origin HEAD:${{ github.ref_name }}
+          git push origin HEAD:${{ github.event.pull_request.head.ref }}
          echo "Pushed successfully"
        else
          echo "No recording changes"
--- a/.github/actions/setup-runner/action.yml
+++ b/.github/actions/setup-runner/action.yml
@ -16,21 +16,19 @@ runs:
      uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
      with:
        python-version: ${{ inputs.python-version }}
+        activate-environment: true
        version: 0.7.6

    - name: Install dependencies
      shell: bash
      run: |
-        echo "Updating project dependencies via uv sync"
        uv sync --all-groups
-
-        echo "Installing ad-hoc dependencies"
-        uv pip install faiss-cpu
+        uv pip install ollama faiss-cpu

        # Install llama-stack-client-python based on the client-version input
        if [ "${{ inputs.client-version }}" = "latest" ]; then
          echo "Installing latest llama-stack-client-python from main branch"
-          uv pip install git+https://github.com/llamastack/llama-stack-client-python.git@main
+          uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
        elif [ "${{ inputs.client-version }}" = "published" ]; then
          echo "Installing published llama-stack-client-python from PyPI"
          uv pip install llama-stack-client
@ -39,5 +37,4 @@ runs:
          exit 1
        fi

-        echo "Installed llama packages"
-        uv pip list | grep llama
+        uv pip install -e .
--- a/.github/actions/setup-test-environment/action.yml
+++ b/.github/actions/setup-test-environment/action.yml
@ -42,22 +42,7 @@ runs:
    - name: Build Llama Stack
      shell: bash
      run: |
-        # Install llama-stack-client-python based on the client-version input
-        if [ "${{ inputs.client-version }}" = "latest" ]; then
-          echo "Installing latest llama-stack-client-python from main branch"
-          export LLAMA_STACK_CLIENT_DIR=git+https://github.com/llamastack/llama-stack-client-python.git@main
-        elif [ "${{ inputs.client-version }}" = "published" ]; then
-          echo "Installing published llama-stack-client-python from PyPI"
-          unset LLAMA_STACK_CLIENT_DIR
-        else
-          echo "Invalid client-version: ${{ inputs.client-version }}"
-          exit 1
-        fi
-
-        echo "Building Llama Stack"
-
-        LLAMA_STACK_DIR=. \
-          uv run --no-sync llama stack build --template ci-tests --image-type venv
+        uv run llama stack build --template ci-tests --image-type venv

    - name: Configure git for commits
      shell: bash
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@ -9,7 +9,6 @@ updates:
      day: "saturday"
    commit-message:
      prefix: chore(github-deps)
-
  - package-ecosystem: "uv"
    directory: "/"
    schedule:
@ -20,14 +19,3 @@ updates:
      - python
    commit-message:
      prefix: chore(python-deps)
-
-  - package-ecosystem: npm
-    directory: "/llama_stack/ui"
-    schedule:
-      interval: "weekly"
-      day: "saturday"
-    labels:
-      - type/dependencies
-      - javascript
-    commit-message:
-      prefix: chore(ui-deps)
--- a/.github/workflows/README.md
+++ b/.github/workflows/README.md
@ -18,6 +18,5 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a tabl
 | Close stale issues and PRs | [stale_bot.yml](stale_bot.yml) | Run the Stale Bot action |
 | Test External Providers Installed via Module | [test-external-provider-module.yml](test-external-provider-module.yml) | Test External Provider installation via Python module |
 | Test External API and Providers | [test-external.yml](test-external.yml) | Test the External API and Provider mechanisms |
-| UI Tests | [ui-unit-tests.yml](ui-unit-tests.yml) | Run the UI test suite |
 | Unit Tests | [unit-tests.yml](unit-tests.yml) | Run the unit test suite |
 | Update ReadTheDocs | [update-readthedocs.yml](update-readthedocs.yml) | Update the Llama Stack ReadTheDocs site |
--- a/.github/workflows/changelog.yml
+++ b/.github/workflows/changelog.yml
@ -17,7 +17,7 @@ jobs:
      pull-requests: write  # for peter-evans/create-pull-request to create a PR
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          ref: main
          fetch-depth: 0
--- a/.github/workflows/install-script-ci.yml
+++ b/.github/workflows/install-script-ci.yml
@ -16,22 +16,21 @@ jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # 5.0.0
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
      - name: Run ShellCheck on install.sh
        run: shellcheck scripts/install.sh
  smoke-test-on-dev:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner

      - name: Build a single provider
        run: |
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync \
-            llama stack build --template starter --image-type container --image-name test
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --template starter --image-type container --image-name test

      - name: Run installer end-to-end
        run: |
--- a/.github/workflows/integration-auth-tests.yml
+++ b/.github/workflows/integration-auth-tests.yml
@ -10,7 +10,6 @@ on:
    paths:
      - 'distributions/**'
      - 'llama_stack/**'
-      - '!llama_stack/ui/**'
      - 'tests/integration/**'
      - 'uv.lock'
      - 'pyproject.toml'
@ -31,7 +30,7 @@ jobs:

    steps:
      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner
--- a/.github/workflows/integration-sql-store-tests.yml
+++ b/.github/workflows/integration-sql-store-tests.yml
@ -44,7 +44,7 @@ jobs:

    steps:
      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner
--- a/.github/workflows/integration-tests.yml
+++ b/.github/workflows/integration-tests.yml
@ -10,7 +10,6 @@ on:
    types: [opened, synchronize, reopened]
    paths:
      - 'llama_stack/**'
-      - '!llama_stack/ui/**'
      - 'tests/**'
      - 'uv.lock'
      - 'pyproject.toml'
@ -32,14 +31,6 @@ on:
        description: 'Test against a specific provider'
        type: string
        default: 'ollama'
-      test-subdirs:
-        description: 'Comma-separated list of test subdirectories to run'
-        type: string
-        default: ''
-      test-pattern:
-        description: 'Regex pattern to pass to pytest -k'
-        type: string
-        default: ''

 concurrency:
  # Skip concurrency for pushes to main - each commit should be tested independently
@ -47,8 +38,27 @@ concurrency:
  cancel-in-progress: true

 jobs:
+  discover-tests:
+    runs-on: ubuntu-latest
+    outputs:
+      test-types: ${{ steps.generate-test-types.outputs.test-types }}
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Generate test types
+        id: generate-test-types
+        run: |
+          # Get test directories dynamically, excluding non-test directories
+          # NOTE: we are excluding post_training since the tests take too long
+          TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" |
+            grep -Ev "^(__pycache__|fixtures|test_cases|recordings|post_training)$" |
+            sort | jq -R -s -c 'split("\n")[:-1]')
+          echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT

  run-replay-mode-tests:
+    needs: discover-tests
    runs-on: ubuntu-latest
    name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, vision={4})', matrix.client-type, matrix.provider, matrix.python-version, matrix.client-version, matrix.run-vision-tests) }}

@ -65,7 +75,7 @@ jobs:

    steps:
      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Setup test environment
        uses: ./.github/actions/setup-test-environment
@ -79,8 +89,7 @@ jobs:
      - name: Run tests
        uses: ./.github/actions/run-and-record-tests
        with:
-          test-subdirs: ${{ inputs.test-subdirs }}
-          test-pattern: ${{ inputs.test-pattern }}
+          test-types: ${{ needs.discover-tests.outputs.test-types }}
          stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }}
          provider: ${{ matrix.provider }}
          inference-mode: 'replay'
--- a/.github/workflows/integration-vector-io-tests.yml
+++ b/.github/workflows/integration-vector-io-tests.yml
@ -9,17 +9,14 @@ on:
    branches: [ main ]
    paths:
      - 'llama_stack/**'
-      - '!llama_stack/ui/**'
      - 'tests/integration/vector_io/**'
      - 'uv.lock'
      - 'pyproject.toml'
      - 'requirements.txt'
      - '.github/workflows/integration-vector-io-tests.yml' # This workflow
-  schedule:
-    - cron: '0 0 * * *'  # (test on python 3.13) Daily at 12 AM UTC

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
+  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

 jobs:
@ -28,12 +25,12 @@ jobs:
    strategy:
      matrix:
        vector-io-provider: ["inline::faiss", "inline::sqlite-vec", "inline::milvus", "remote::chromadb", "remote::pgvector", "remote::weaviate", "remote::qdrant"]
-        python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
+        python-version: ["3.12", "3.13"]
      fail-fast: false # we want to run all tests regardless of failure

    steps:
      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner
@ -144,7 +141,7 @@ jobs:

      - name: Build Llama Stack
        run: |
-          uv run --no-sync llama stack build --template ci-tests --image-type venv
+          uv run llama stack build --template ci-tests --image-type venv

      - name: Check Storage and Memory Available Before Tests
        if: ${{ always() }}
@ -167,10 +164,9 @@ jobs:
          ENABLE_WEAVIATE: ${{ matrix.vector-io-provider == 'remote::weaviate' && 'true' || '' }}
          WEAVIATE_CLUSTER_URL: ${{ matrix.vector-io-provider == 'remote::weaviate' && 'localhost:8080' || '' }}
        run: |
-          uv run --no-sync \
-            pytest -sv --stack-config="files=inline::localfs,inference=inline::sentence-transformers,vector_io=${{ matrix.vector-io-provider }}" \
+          uv run pytest -sv --stack-config="inference=inline::sentence-transformers,vector_io=${{ matrix.vector-io-provider }}" \
            tests/integration/vector_io \
-            --embedding-model inline::sentence-transformers/all-MiniLM-L6-v2
+            --embedding-model sentence-transformers/all-MiniLM-L6-v2

      - name: Check Storage and Memory Available After Tests
        if: ${{ always() }}
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@ -20,7 +20,7 @@ jobs:

    steps:
      - name: Checkout code
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          # For dependabot PRs, we need to checkout with a token that can push changes
          token: ${{ github.actor == 'dependabot[bot]' && secrets.GITHUB_TOKEN || github.token }}
@ -36,21 +36,6 @@ jobs:
            **/requirements*.txt
            .pre-commit-config.yaml

-      # npm ci may fail -
-      #   npm error `npm ci` can only install packages when your package.json and package-lock.json or npm-shrinkwrap.json are in sync. Please update your lock file with `npm install` before continuing.
-      #   npm error Invalid: lock file's llama-stack-client@0.2.17 does not satisfy llama-stack-client@0.2.18
-
-      # - name: Set up Node.js
-      #   uses: actions/setup-node@39370e3970a6d050c480ffad4ff0ed4d3fdee5af # v4.1.0
-      #   with:
-      #     node-version: '20'
-      #     cache: 'npm'
-      #     cache-dependency-path: 'llama_stack/ui/'
-
-      # - name: Install npm dependencies
-      #   run: npm ci
-      #   working-directory: llama_stack/ui
-
      - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
        continue-on-error: true
        env:
--- a/.github/workflows/providers-build.yml
+++ b/.github/workflows/providers-build.yml
@ -36,7 +36,7 @@ jobs:
      distros: ${{ steps.set-matrix.outputs.distros }}
    steps:
      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Generate Distribution List
        id: set-matrix
@ -55,7 +55,7 @@ jobs:

    steps:
      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner
@ -79,7 +79,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner
@ -92,7 +92,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner
@ -117,7 +117,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner
--- a/.github/workflows/python-build-test.yml
+++ b/.github/workflows/python-build-test.yml
@ -9,8 +9,6 @@ on:
  pull_request:
    branches:
      - main
-    paths-ignore:
-        - 'llama_stack/ui/**'

 jobs:
  build:
@ -21,10 +19,10 @@ jobs:

    steps:
    - name: Checkout repository
-      uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

    - name: Install uv
-      uses: astral-sh/setup-uv@d9e0f98d3fc6adb07d1e3d37f3043649ddad06a1 # v6.5.0
+      uses: astral-sh/setup-uv@e92bafb6253dcd438e0484186d7669ea7a8ca1cc # v6.4.3
      with:
        python-version: ${{ matrix.python-version }}
        activate-environment: true
--- a/.github/workflows/record-integration-tests.yml
+++ b/.github/workflows/record-integration-tests.yml
@ -1,53 +1,93 @@
-# This workflow should be run manually when needing to re-record tests. This happens when you have
-#  - added a new test
-#  - or changed an existing test such that a new inference call is made
-# You should make a PR and then run this workflow on that PR branch. The workflow will re-record the
-# tests and commit the recordings to the PR branch.
 name: Integration Tests (Record)

 run-name: Run the integration test suite from tests/integration

 on:
+  pull_request:
+    branches: [ main ]
+    types: [opened, synchronize, labeled]
+    paths:
+      - 'llama_stack/**'
+      - 'tests/**'
+      - 'uv.lock'
+      - 'pyproject.toml'
+      - '.github/workflows/record-integration-tests.yml' # This workflow
+      - '.github/actions/setup-ollama/action.yml'
+      - '.github/actions/setup-test-environment/action.yml'
+      - '.github/actions/run-and-record-tests/action.yml'
  workflow_dispatch:
    inputs:
-      test-subdirs:
-        description: 'Comma-separated list of test subdirectories to run'
-        type: string
-        default: ''
      test-provider:
        description: 'Test against a specific provider'
        type: string
        default: 'ollama'
-      run-vision-tests:
-        description: 'Whether to run vision tests'
-        type: boolean
-        default: false
-      test-pattern:
-        description: 'Regex pattern to pass to pytest -k'
-        type: string
-        default: ''
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true

 jobs:
+  discover-tests:
+    if: contains(github.event.pull_request.labels.*.name, 're-record-tests') ||
+      contains(github.event.pull_request.labels.*.name, 're-record-vision-tests')
+    runs-on: ubuntu-latest
+    outputs:
+      test-types: ${{ steps.generate-test-types.outputs.test-types }}
+      matrix-modes: ${{ steps.generate-test-types.outputs.matrix-modes }}
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Generate test types
+        id: generate-test-types
+        run: |
+          # Get test directories dynamically, excluding non-test directories
+          TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" |
+            grep -Ev "^(__pycache__|fixtures|test_cases|recordings|post_training)$" |
+            sort | jq -R -s -c 'split("\n")[:-1]')
+          echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT
+
+          labels=$(gh pr view ${{ github.event.pull_request.number }} --json labels --jq '.labels[].name')
+          echo "labels=$labels"
+
+          modes_array=()
+          if [[ $labels == *"re-record-vision-tests"* ]]; then
+            modes_array+=("vision")
+          fi
+          if [[ $labels == *"re-record-tests"* ]]; then
+            modes_array+=("non-vision")
+          fi
+
+          # Convert to JSON array
+          if [ ${#modes_array[@]} -eq 0 ]; then
+            matrix_modes="[]"
+          else
+            matrix_modes=$(printf '%s\n' "${modes_array[@]}" | jq -R -s -c 'split("\n")[:-1]')
+          fi
+          echo "matrix_modes=$matrix_modes"
+          echo "matrix-modes=$matrix_modes" >> $GITHUB_OUTPUT
+
+        env:
+          GH_TOKEN: ${{ github.token }}
+
  record-tests:
+    needs: discover-tests
    runs-on: ubuntu-latest

    permissions:
      contents: write

-    steps:
-      - name: Echo workflow inputs
-        run: |
-          echo "::group::Workflow Inputs"
-          echo "test-subdirs: ${{ inputs.test-subdirs }}"
-          echo "test-provider: ${{ inputs.test-provider }}"
-          echo "run-vision-tests: ${{ inputs.run-vision-tests }}"
-          echo "test-pattern: ${{ inputs.test-pattern }}"
-          echo "branch: ${{ github.ref_name }}"
-          echo "::endgroup::"
+    strategy:
+      fail-fast: false
+      matrix:
+        mode: ${{ fromJSON(needs.discover-tests.outputs.matrix-modes) }}

+    steps:
      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
+          ref: ${{ github.event.pull_request.head.ref }}
          fetch-depth: 0

      - name: Setup test environment
@ -56,15 +96,14 @@ jobs:
          python-version: "3.12"  # Use single Python version for recording
          client-version: "latest"
          provider: ${{ inputs.test-provider || 'ollama' }}
-          run-vision-tests: ${{ inputs.run-vision-tests }}
+          run-vision-tests: ${{ matrix.mode == 'vision' && 'true' || 'false' }}
          inference-mode: 'record'

      - name: Run and record tests
        uses: ./.github/actions/run-and-record-tests
        with:
-          test-pattern: ${{ inputs.test-pattern }}
-          test-subdirs: ${{ inputs.test-subdirs }}
+          test-types: ${{ needs.discover-tests.outputs.test-types }}
          stack-config: 'server:ci-tests'  # recording must be done with server since more tests are run
          provider: ${{ inputs.test-provider || 'ollama' }}
          inference-mode: 'record'
-          run-vision-tests: ${{ inputs.run-vision-tests }}
+          run-vision-tests: ${{ matrix.mode == 'vision' && 'true' || 'false' }}
--- a/.github/workflows/semantic-pr.yml
+++ b/.github/workflows/semantic-pr.yml
@ -11,7 +11,7 @@ on:
      - synchronize

 concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
+  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

 permissions:
@ -22,6 +22,6 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Check PR Title's semantic conformance
-        uses: amannn/action-semantic-pull-request@7f33ba792281b034f64e96f4c0b5496782dd3b37 # v6.1.0
+        uses: amannn/action-semantic-pull-request@0723387faaf9b38adef4775cd42cfd5155ed6017 # v5.5.3
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/test-external-provider-module.yml
+++ b/.github/workflows/test-external-provider-module.yml
@ -27,7 +27,7 @@ jobs:
        # container and point 'uv pip install' to the correct path...
    steps:
      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner
--- a/.github/workflows/test-external.yml
+++ b/.github/workflows/test-external.yml
@ -9,7 +9,6 @@ on:
    branches: [ main ]
    paths:
      - 'llama_stack/**'
-      - '!llama_stack/ui/**'
      - 'tests/integration/**'
      - 'uv.lock'
      - 'pyproject.toml'
@ -27,7 +26,7 @@ jobs:
        # container and point 'uv pip install' to the correct path...
    steps:
      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner
@ -44,11 +43,11 @@ jobs:

      - name: Print distro dependencies
        run: |
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml --print-deps-only
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external/build.yaml --print-deps-only

      - name: Build distro from config file
        run: |
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external/build.yaml

      - name: Start Llama Stack server in background
        if: ${{ matrix.image-type }} == 'venv'
--- a/.github/workflows/ui-unit-tests.yml
+++ b/.github/workflows/ui-unit-tests.yml
@ -1,55 +0,0 @@
-name: UI Tests
-
-run-name: Run the UI test suite
-
-on:
-  push:
-    branches: [ main ]
-  pull_request:
-    branches: [ main ]
-    paths:
-      - 'llama_stack/ui/**'
-      - '.github/workflows/ui-unit-tests.yml' # This workflow
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  ui-tests:
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        node-version: [22]
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
-
-      - name: Setup Node.js
-        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
-        with:
-          node-version: ${{ matrix.node-version }}
-          cache: 'npm'
-          cache-dependency-path: 'llama_stack/ui/package-lock.json'
-
-      - name: Install dependencies
-        working-directory: llama_stack/ui
-        run: npm ci
-
-      - name: Run linting
-        working-directory: llama_stack/ui
-        run: npm run lint
-
-      - name: Run format check
-        working-directory: llama_stack/ui
-        run: npm run format:check
-
-      - name: Run unit tests
-        working-directory: llama_stack/ui
-        env:
-          CI: true
-
-        run: npm test -- --coverage --watchAll=false --passWithNoTests
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@ -9,7 +9,6 @@ on:
    branches: [ main ]
    paths:
      - 'llama_stack/**'
-      - '!llama_stack/ui/**'
      - 'tests/unit/**'
      - 'uv.lock'
      - 'pyproject.toml'
@ -32,7 +31,7 @@ jobs:
          - "3.13"
    steps:
      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner
--- a/.github/workflows/update-readthedocs.yml
+++ b/.github/workflows/update-readthedocs.yml
@ -37,7 +37,7 @@ jobs:
      TOKEN: ${{ secrets.READTHEDOCS_TOKEN }}
    steps:
      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -2,7 +2,6 @@ exclude: 'build/'

 default_language_version:
    python: python3.12
-    node: "22"

 repos:
 -   repo: https://github.com/pre-commit/pre-commit-hooks
@ -146,50 +145,6 @@ repos:
        pass_filenames: false
        require_serial: true
        files: ^.github/workflows/.*$
-      # ui-prettier and ui-eslint are disabled until we can avoid `npm ci`, which is slow and may fail -
-      #   npm error `npm ci` can only install packages when your package.json and package-lock.json or npm-shrinkwrap.json are in sync. Please update your lock file with `npm install` before continuing.
-      #   npm error Invalid: lock file's llama-stack-client@0.2.17 does not satisfy llama-stack-client@0.2.18
-      # and until we have infra for installing prettier and next via npm -
-      #   Lint UI code with ESLint.....................................................Failed
-      #   - hook id: ui-eslint
-      #   - exit code: 127
-      #   > ui@0.1.0 lint
-      #   > next lint --fix --quiet
-      #   sh: line 1: next: command not found
-      #
-      # - id: ui-prettier
-      #   name: Format UI code with Prettier
-      #   entry: bash -c 'cd llama_stack/ui && npm ci && npm run format'
-      #   language: system
-      #   files: ^llama_stack/ui/.*\.(ts|tsx)$
-      #   pass_filenames: false
-      #   require_serial: true
-      # - id: ui-eslint
-      #   name: Lint UI code with ESLint
-      #   entry: bash -c 'cd llama_stack/ui && npm run lint -- --fix --quiet'
-      #   language: system
-      #   files: ^llama_stack/ui/.*\.(ts|tsx)$
-      #   pass_filenames: false
-      #   require_serial: true
-
-      - id: check-log-usage
-        name: Ensure 'llama_stack.log' usage for logging
-        entry: bash
-        language: system
-        types: [python]
-        pass_filenames: true
-        args:
-          - -c
-          - |
-            matches=$(grep -EnH '^[^#]*\b(import\s+logging|from\s+logging\b)' "$@" | grep -v -e '#\s*allow-direct-logging' || true)
-            if [ -n "$matches" ]; then
-              # GitHub Actions annotation format
-              while IFS=: read -r file line_num rest; do
-                echo "::error file=$file,line=$line_num::Do not use 'import logging' or 'from logging import' in $file. Use the custom log instead: from llama_stack.log import get_logger; logger = get_logger(). If direct logging is truly needed, add: # allow-direct-logging"
-              done <<< "$matches"
-              exit 1
-            fi
-            exit 0

 ci:
    autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -1,82 +1,13 @@
-# Contributing to Llama Stack
+# Contributing to Llama-Stack
 We want to make contributing to this project as easy and transparent as
 possible.

-## Set up your development environment
-
-We use [uv](https://github.com/astral-sh/uv) to manage python dependencies and virtual environments.
-You can install `uv` by following this [guide](https://docs.astral.sh/uv/getting-started/installation/).
-
-You can install the dependencies by running:
-
-```bash
-cd llama-stack
-uv sync --group dev
-uv pip install -e .
-source .venv/bin/activate
-```
-
-```{note}
-You can use a specific version of Python with `uv` by adding the `--python <version>` flag (e.g. `--python 3.12`).
-Otherwise, `uv` will automatically select a Python version according to the `requires-python` section of the `pyproject.toml`.
-For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/).
-```
-
-Note that you can create a dotenv file `.env` that includes necessary environment variables:
-```
-LLAMA_STACK_BASE_URL=http://localhost:8321
-LLAMA_STACK_CLIENT_LOG=debug
-LLAMA_STACK_PORT=8321
-LLAMA_STACK_CONFIG=<provider-name>
-TAVILY_SEARCH_API_KEY=
-BRAVE_SEARCH_API_KEY=
-```
-
-And then use this dotenv file when running client SDK tests via the following:
-```bash
-uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py --text-model=meta-llama/Llama-3.1-8B-Instruct
-```
-
-### Pre-commit Hooks
-
-We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running:
-
-```bash
-uv run pre-commit install
-```
-
-After that, pre-commit hooks will run automatically before each commit.
-
-Alternatively, if you don't want to install the pre-commit hooks, you can run the checks manually by running:
-
-```bash
-uv run pre-commit run --all-files
-```
-
-```{caution}
-Before pushing your changes, make sure that the pre-commit hooks have passed successfully.
-```
-
 ## Discussions -> Issues -> Pull Requests

 We actively welcome your pull requests. However, please read the following. This is heavily inspired by [Ghostty](https://github.com/ghostty-org/ghostty/blob/main/CONTRIBUTING.md).

 If in doubt, please open a [discussion](https://github.com/meta-llama/llama-stack/discussions); we can always convert that to an issue later.

-### Issues
-We use GitHub issues to track public bugs. Please ensure your description is
-clear and has sufficient instructions to be able to reproduce the issue.
-
-Meta has a [bounty program](http://facebook.com/whitehat/info) for the safe
-disclosure of security bugs. In those cases, please go through the process
-outlined on that page and do not file a public issue.
-
-### Contributor License Agreement ("CLA")
-In order to accept your pull request, we need you to submit a CLA. You only need
-to do this once to work on any of Meta's open source projects.
-
-Complete your CLA here: <https://code.facebook.com/cla>
-
 **I'd like to contribute!**

 If you are new to the project, start by looking at the issues tagged with "good first issue". If you're interested
@ -120,15 +51,93 @@ Please avoid picking up too many issues at once. This helps you stay focused and

 Please keep pull requests (PRs) small and focused. If you have a large set of changes, consider splitting them into logically grouped, smaller PRs to facilitate review and testing.

-```{tip}
-As a general guideline:
- Experienced contributors should try to keep no more than 5 open PRs at a time.
- New contributors are encouraged to have only one open PR at a time until they’re familiar with the codebase and process.
+> [!TIP]
+> As a general guideline:
+> - Experienced contributors should try to keep no more than 5 open PRs at a time.
+> - New contributors are encouraged to have only one open PR at a time until they’re familiar with the codebase and process.
+
+## Contributor License Agreement ("CLA")
+In order to accept your pull request, we need you to submit a CLA. You only need
+to do this once to work on any of Meta's open source projects.
+
+Complete your CLA here: <https://code.facebook.com/cla>
+
+## Issues
+We use GitHub issues to track public bugs. Please ensure your description is
+clear and has sufficient instructions to be able to reproduce the issue.
+
+Meta has a [bounty program](http://facebook.com/whitehat/info) for the safe
+disclosure of security bugs. In those cases, please go through the process
+outlined on that page and do not file a public issue.
+
+
+## Set up your development environment
+
+We use [uv](https://github.com/astral-sh/uv) to manage Python dependencies and virtual environments.
+You can install `uv` by following this [guide](https://docs.astral.sh/uv/getting-started/installation/).
+
+You can install the dependencies by running:
+
+```bash
+cd llama-stack
+uv sync --group dev
+uv pip install -e .
+source .venv/bin/activate
 ```

-## Repository guidelines
+> [!NOTE]
+> You can use a specific version of Python with `uv` by adding the `--python <version>` flag (e.g. `--python 3.12`).
+> Otherwise, `uv` will automatically select a Python version according to the `requires-python` section of the `pyproject.toml`.
+> For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/).

-### Coding Style
+Note that you can create a dotenv file `.env` that includes necessary environment variables:
+```
+LLAMA_STACK_BASE_URL=http://localhost:8321
+LLAMA_STACK_CLIENT_LOG=debug
+LLAMA_STACK_PORT=8321
+LLAMA_STACK_CONFIG=<provider-name>
+TAVILY_SEARCH_API_KEY=
+BRAVE_SEARCH_API_KEY=
+```
+
+And then use this dotenv file when running client SDK tests via the following:
+```bash
+uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py --text-model=meta-llama/Llama-3.1-8B-Instruct
+```
+
+## Pre-commit Hooks
+
+We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running:
+
+```bash
+uv run pre-commit install
+```
+
+After that, pre-commit hooks will run automatically before each commit.
+
+Alternatively, if you don't want to install the pre-commit hooks, you can run the checks manually:
+
+```bash
+uv run pre-commit run --all-files
+```
+
+> [!CAUTION]
+> Before pushing your changes, make sure that the pre-commit hooks have passed successfully.
+
+## Running tests
+
+You can find the Llama Stack testing documentation [here](https://github.com/meta-llama/llama-stack/blob/main/tests/README.md).
+
+## Adding a new dependency to the project
+
+To add a new dependency to the project, you can use the `uv` command. For example, to add `foo` to the project, you can run:
+
+```bash
+uv add foo
+uv sync
+```
+
+## Coding Style

 * Comments should provide meaningful insights into the code. Avoid filler comments that simply
  describe the next step, as they create unnecessary clutter, same goes for docstrings.
@ -148,11 +157,6 @@ As a general guideline:
  that describes the configuration. These descriptions will be used to generate the provider
  documentation.
 * When possible, use keyword arguments only when calling functions.
-* Llama Stack utilizes [custom Exception classes](llama_stack/apis/common/errors.py) for certain Resources that should be used where applicable.
-
-### License
-By contributing to Llama, you agree that your contributions will be licensed
-under the LICENSE file in the root directory of this source tree.

 ## Common Tasks

@ -205,4 +209,8 @@ If you modify or add new API endpoints, update the API documentation accordingly
 uv run ./docs/openapi_generator/run_openapi_generator.sh
 ```

-The generated API documentation will be available in `docs/_static/`. Make sure to review the changes before committing.
+The generated API documentation will be available in `docs/_static/`. Make sure to review the changes before committing.
+
+## License
+By contributing to Llama, you agree that your contributions will be licensed
+under the LICENSE file in the root directory of this source tree.
--- a/README.md
+++ b/README.md
@ -9,7 +9,6 @@

 [**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)

-
 ### ✨🎉 Llama 4 Support  🎉✨
 We released [Version 0.2.0](https://github.com/meta-llama/llama-stack/releases/tag/v0.2.0) with support for the Llama 4 herd of models released by Meta.

@ -180,17 +179,3 @@ Please checkout our [Documentation](https://llama-stack.readthedocs.io/en/latest
 Check out our client SDKs for connecting to a Llama Stack server in your preferred language, you can choose from [python](https://github.com/meta-llama/llama-stack-client-python), [typescript](https://github.com/meta-llama/llama-stack-client-typescript), [swift](https://github.com/meta-llama/llama-stack-client-swift), and [kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) programming languages to quickly build your applications.

 You can find more example scripts with client SDKs to talk with the Llama Stack server in our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repo.
-
-
-## 🌟 GitHub Star History
-## Star History
-
-[![Star History Chart](https://api.star-history.com/svg?repos=meta-llama/llama-stack&type=Date)](https://www.star-history.com/#meta-llama/llama-stack&Date)
-
-## ✨ Contributors
-
-Thanks to all of our amazing contributors!
-
-<a href="https://github.com/meta-llama/llama-stack/graphs/contributors">
-  <img src="https://contrib.rocks/image?repo=meta-llama/llama-stack" />
-</a>
--- a/docs/_static/js/keyboard_shortcuts.js
+++ b/docs/_static/js/keyboard_shortcuts.js
@ -1,14 +0,0 @@
-document.addEventListener('keydown', function(event) {
-  // command+K or ctrl+K
-  if ((event.metaKey || event.ctrlKey) && event.key === 'k') {
-    event.preventDefault();
-    document.querySelector('.search-input, .search-field, input[name="q"]').focus();
-  }
-
-  // forward slash
-  if (event.key === '/' &&
-      !event.target.matches('input, textarea, select')) {
-    event.preventDefault();
-    document.querySelector('.search-input, .search-field, input[name="q"]').focus();
-  }
-});
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@ -1452,40 +1452,6 @@
                        }
                    }
                ]
-            },
-            "delete": {
-                "responses": {
-                    "200": {
-                        "description": "OK"
-                    },
-                    "400": {
-                        "$ref": "#/components/responses/BadRequest400"
-                    },
-                    "429": {
-                        "$ref": "#/components/responses/TooManyRequests429"
-                    },
-                    "500": {
-                        "$ref": "#/components/responses/InternalServerError500"
-                    },
-                    "default": {
-                        "$ref": "#/components/responses/DefaultError"
-                    }
-                },
-                "tags": [
-                    "Shields"
-                ],
-                "description": "Unregister a shield.",
-                "parameters": [
-                    {
-                        "name": "identifier",
-                        "in": "path",
-                        "description": "The identifier of the shield to unregister.",
-                        "required": true,
-                        "schema": {
-                            "type": "string"
-                        }
-                    }
-                ]
            }
        },
        "/v1/telemetry/traces/{trace_id}/spans/{span_id}": {
@ -4734,49 +4700,6 @@
                }
            }
        },
-        "/v1/openai/v1/moderations": {
-            "post": {
-                "responses": {
-                    "200": {
-                        "description": "A moderation object.",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/ModerationObject"
-                                }
-                            }
-                        }
-                    },
-                    "400": {
-                        "$ref": "#/components/responses/BadRequest400"
-                    },
-                    "429": {
-                        "$ref": "#/components/responses/TooManyRequests429"
-                    },
-                    "500": {
-                        "$ref": "#/components/responses/InternalServerError500"
-                    },
-                    "default": {
-                        "$ref": "#/components/responses/DefaultError"
-                    }
-                },
-                "tags": [
-                    "Safety"
-                ],
-                "description": "Classifies if text and/or image inputs are potentially harmful.",
-                "parameters": [],
-                "requestBody": {
-                    "content": {
-                        "application/json": {
-                            "schema": {
-                                "$ref": "#/components/schemas/RunModerationRequest"
-                            }
-                        }
-                    },
-                    "required": true
-                }
-            }
-        },
        "/v1/safety/run-shield": {
            "post": {
                "responses": {
@ -8293,60 +8216,28 @@
                        "type": "array",
                        "items": {
                            "type": "object",
-                            "properties": {
-                                "attributes": {
-                                    "type": "object",
-                                    "additionalProperties": {
-                                        "oneOf": [
-                                            {
-                                                "type": "null"
-                                            },
-                                            {
-                                                "type": "boolean"
-                                            },
-                                            {
-                                                "type": "number"
-                                            },
-                                            {
-                                                "type": "string"
-                                            },
-                                            {
-                                                "type": "array"
-                                            },
-                                            {
-                                                "type": "object"
-                                            }
-                                        ]
+                            "additionalProperties": {
+                                "oneOf": [
+                                    {
+                                        "type": "null"
                                    },
-                                    "description": "(Optional) Key-value attributes associated with the file"
-                                },
-                                "file_id": {
-                                    "type": "string",
-                                    "description": "Unique identifier of the file containing the result"
-                                },
-                                "filename": {
-                                    "type": "string",
-                                    "description": "Name of the file containing the result"
-                                },
-                                "score": {
-                                    "type": "number",
-                                    "description": "Relevance score for this search result (between 0 and 1)"
-                                },
-                                "text": {
-                                    "type": "string",
-                                    "description": "Text content of the search result"
-                                }
-                            },
-                            "additionalProperties": false,
-                            "required": [
-                                "attributes",
-                                "file_id",
-                                "filename",
-                                "score",
-                                "text"
-                            ],
-                            "title": "OpenAIResponseOutputMessageFileSearchToolCallResults",
-                            "description": "Search results returned by the file search operation."
+                                    {
+                                        "type": "boolean"
+                                    },
+                                    {
+                                        "type": "number"
+                                    },
+                                    {
+                                        "type": "string"
+                                    },
+                                    {
+                                        "type": "array"
+                                    },
+                                    {
+                                        "type": "object"
+                                    }
+                                ]
+                            }
                        },
                        "description": "(Optional) Search results returned by the file search operation"
                    }
@ -8547,13 +8438,6 @@
                            "$ref": "#/components/schemas/OpenAIResponseInputTool"
                        }
                    },
-                    "include": {
-                        "type": "array",
-                        "items": {
-                            "type": "string"
-                        },
-                        "description": "(Optional) Additional fields to include in the response."
-                    },
                    "max_infer_iters": {
                        "type": "integer"
                    }
@ -8821,61 +8705,6 @@
                "title": "OpenAIResponseOutputMessageMCPListTools",
                "description": "MCP list tools output message containing available tools from an MCP server."
            },
-            "OpenAIResponseContentPart": {
-                "oneOf": [
-                    {
-                        "$ref": "#/components/schemas/OpenAIResponseContentPartOutputText"
-                    },
-                    {
-                        "$ref": "#/components/schemas/OpenAIResponseContentPartRefusal"
-                    }
-                ],
-                "discriminator": {
-                    "propertyName": "type",
-                    "mapping": {
-                        "output_text": "#/components/schemas/OpenAIResponseContentPartOutputText",
-                        "refusal": "#/components/schemas/OpenAIResponseContentPartRefusal"
-                    }
-                }
-            },
-            "OpenAIResponseContentPartOutputText": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "output_text",
-                        "default": "output_text"
-                    },
-                    "text": {
-                        "type": "string"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "text"
-                ],
-                "title": "OpenAIResponseContentPartOutputText"
-            },
-            "OpenAIResponseContentPartRefusal": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "refusal",
-                        "default": "refusal"
-                    },
-                    "refusal": {
-                        "type": "string"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "refusal"
-                ],
-                "title": "OpenAIResponseContentPartRefusal"
-            },
            "OpenAIResponseObjectStream": {
                "oneOf": [
                    {
@ -8932,12 +8761,6 @@
                    {
                        "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted"
                    },
-                    {
-                        "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseContentPartAdded"
-                    },
-                    {
-                        "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseContentPartDone"
-                    },
                    {
                        "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted"
                    }
@ -8963,8 +8786,6 @@
                        "response.mcp_call.in_progress": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress",
                        "response.mcp_call.failed": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed",
                        "response.mcp_call.completed": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted",
-                        "response.content_part.added": "#/components/schemas/OpenAIResponseObjectStreamResponseContentPartAdded",
-                        "response.content_part.done": "#/components/schemas/OpenAIResponseObjectStreamResponseContentPartDone",
                        "response.completed": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted"
                    }
                }
@ -8991,80 +8812,6 @@
                "title": "OpenAIResponseObjectStreamResponseCompleted",
                "description": "Streaming event indicating a response has been completed."
            },
-            "OpenAIResponseObjectStreamResponseContentPartAdded": {
-                "type": "object",
-                "properties": {
-                    "response_id": {
-                        "type": "string",
-                        "description": "Unique identifier of the response containing this content"
-                    },
-                    "item_id": {
-                        "type": "string",
-                        "description": "Unique identifier of the output item containing this content part"
-                    },
-                    "part": {
-                        "$ref": "#/components/schemas/OpenAIResponseContentPart",
-                        "description": "The content part that was added"
-                    },
-                    "sequence_number": {
-                        "type": "integer",
-                        "description": "Sequential number for ordering streaming events"
-                    },
-                    "type": {
-                        "type": "string",
-                        "const": "response.content_part.added",
-                        "default": "response.content_part.added",
-                        "description": "Event type identifier, always \"response.content_part.added\""
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "response_id",
-                    "item_id",
-                    "part",
-                    "sequence_number",
-                    "type"
-                ],
-                "title": "OpenAIResponseObjectStreamResponseContentPartAdded",
-                "description": "Streaming event for when a new content part is added to a response item."
-            },
-            "OpenAIResponseObjectStreamResponseContentPartDone": {
-                "type": "object",
-                "properties": {
-                    "response_id": {
-                        "type": "string",
-                        "description": "Unique identifier of the response containing this content"
-                    },
-                    "item_id": {
-                        "type": "string",
-                        "description": "Unique identifier of the output item containing this content part"
-                    },
-                    "part": {
-                        "$ref": "#/components/schemas/OpenAIResponseContentPart",
-                        "description": "The completed content part"
-                    },
-                    "sequence_number": {
-                        "type": "integer",
-                        "description": "Sequential number for ordering streaming events"
-                    },
-                    "type": {
-                        "type": "string",
-                        "const": "response.content_part.done",
-                        "default": "response.content_part.done",
-                        "description": "Event type identifier, always \"response.content_part.done\""
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "response_id",
-                    "item_id",
-                    "part",
-                    "sequence_number",
-                    "type"
-                ],
-                "title": "OpenAIResponseObjectStreamResponseContentPartDone",
-                "description": "Streaming event for when a content part is completed."
-            },
            "OpenAIResponseObjectStreamResponseCreated": {
                "type": "object",
                "properties": {
@ -14767,8 +14514,7 @@
            "OpenAIFilePurpose": {
                "type": "string",
                "enum": [
-                    "assistants",
-                    "batch"
+                    "assistants"
                ],
                "title": "OpenAIFilePurpose",
                "description": "Valid purpose values for OpenAI Files API."
@ -14845,8 +14591,7 @@
                    "purpose": {
                        "type": "string",
                        "enum": [
-                            "assistants",
-                            "batch"
+                            "assistants"
                        ],
                        "description": "The intended purpose of the file"
                    }
@ -16622,131 +16367,6 @@
                ],
                "title": "RunEvalRequest"
            },
-            "RunModerationRequest": {
-                "type": "object",
-                "properties": {
-                    "input": {
-                        "oneOf": [
-                            {
-                                "type": "string"
-                            },
-                            {
-                                "type": "array",
-                                "items": {
-                                    "type": "string"
-                                }
-                            }
-                        ],
-                        "description": "Input (or inputs) to classify. Can be a single string, an array of strings, or an array of multi-modal input objects similar to other models."
-                    },
-                    "model": {
-                        "type": "string",
-                        "description": "The content moderation model you would like to use."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "input",
-                    "model"
-                ],
-                "title": "RunModerationRequest"
-            },
-            "ModerationObject": {
-                "type": "object",
-                "properties": {
-                    "id": {
-                        "type": "string",
-                        "description": "The unique identifier for the moderation request."
-                    },
-                    "model": {
-                        "type": "string",
-                        "description": "The model used to generate the moderation results."
-                    },
-                    "results": {
-                        "type": "array",
-                        "items": {
-                            "$ref": "#/components/schemas/ModerationObjectResults"
-                        },
-                        "description": "A list of moderation objects"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "id",
-                    "model",
-                    "results"
-                ],
-                "title": "ModerationObject",
-                "description": "A moderation object."
-            },
-            "ModerationObjectResults": {
-                "type": "object",
-                "properties": {
-                    "flagged": {
-                        "type": "boolean",
-                        "description": "Whether any of the below categories are flagged."
-                    },
-                    "categories": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "type": "boolean"
-                        },
-                        "description": "A list of the categories, and whether they are flagged or not."
-                    },
-                    "category_applied_input_types": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "type": "array",
-                            "items": {
-                                "type": "string"
-                            }
-                        },
-                        "description": "A list of the categories along with the input type(s) that the score applies to."
-                    },
-                    "category_scores": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "type": "number"
-                        },
-                        "description": "A list of the categories along with their scores as predicted by model."
-                    },
-                    "user_message": {
-                        "type": "string"
-                    },
-                    "metadata": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "oneOf": [
-                                {
-                                    "type": "null"
-                                },
-                                {
-                                    "type": "boolean"
-                                },
-                                {
-                                    "type": "number"
-                                },
-                                {
-                                    "type": "string"
-                                },
-                                {
-                                    "type": "array"
-                                },
-                                {
-                                    "type": "object"
-                                }
-                            ]
-                        }
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "flagged",
-                    "metadata"
-                ],
-                "title": "ModerationObjectResults",
-                "description": "A moderation object."
-            },
            "RunShieldRequest": {
                "type": "object",
                "properties": {
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@ -999,31 +999,6 @@ paths:
          required: true
          schema:
            type: string
-    delete:
-      responses:
-        '200':
-          description: OK
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Shields
-      description: Unregister a shield.
-      parameters:
-        - name: identifier
-          in: path
-          description: >-
-            The identifier of the shield to unregister.
-          required: true
-          schema:
-            type: string
  /v1/telemetry/traces/{trace_id}/spans/{span_id}:
    get:
      responses:
@ -3358,36 +3333,6 @@ paths:
            schema:
              $ref: '#/components/schemas/RunEvalRequest'
        required: true
-  /v1/openai/v1/moderations:
-    post:
-      responses:
-        '200':
-          description: A moderation object.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ModerationObject'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Safety
-      description: >-
-        Classifies if text and/or image inputs are potentially harmful.
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RunModerationRequest'
-        required: true
  /v1/safety/run-shield:
    post:
      responses:
@ -6021,44 +5966,14 @@ components:
          type: array
          items:
            type: object
-            properties:
-              attributes:
-                type: object
-                additionalProperties:
-                  oneOf:
-                    - type: 'null'
-                    - type: boolean
-                    - type: number
-                    - type: string
-                    - type: array
-                    - type: object
-                description: >-
-                  (Optional) Key-value attributes associated with the file
-              file_id:
-                type: string
-                description: >-
-                  Unique identifier of the file containing the result
-              filename:
-                type: string
-                description: Name of the file containing the result
-              score:
-                type: number
-                description: >-
-                  Relevance score for this search result (between 0 and 1)
-              text:
-                type: string
-                description: Text content of the search result
-            additionalProperties: false
-            required:
-              - attributes
-              - file_id
-              - filename
-              - score
-              - text
-            title: >-
-              OpenAIResponseOutputMessageFileSearchToolCallResults
-            description: >-
-              Search results returned by the file search operation.
+            additionalProperties:
+              oneOf:
+                - type: 'null'
+                - type: boolean
+                - type: number
+                - type: string
+                - type: array
+                - type: object
          description: >-
            (Optional) Search results returned by the file search operation
      additionalProperties: false
@ -6218,12 +6133,6 @@ components:
          type: array
          items:
            $ref: '#/components/schemas/OpenAIResponseInputTool'
-        include:
-          type: array
-          items:
-            type: string
-          description: >-
-            (Optional) Additional fields to include in the response.
        max_infer_iters:
          type: integer
      additionalProperties: false
@ -6441,43 +6350,6 @@ components:
      title: OpenAIResponseOutputMessageMCPListTools
      description: >-
        MCP list tools output message containing available tools from an MCP server.
-    OpenAIResponseContentPart:
-      oneOf:
-        - $ref: '#/components/schemas/OpenAIResponseContentPartOutputText'
-        - $ref: '#/components/schemas/OpenAIResponseContentPartRefusal'
-      discriminator:
-        propertyName: type
-        mapping:
-          output_text: '#/components/schemas/OpenAIResponseContentPartOutputText'
-          refusal: '#/components/schemas/OpenAIResponseContentPartRefusal'
-    OpenAIResponseContentPartOutputText:
-      type: object
-      properties:
-        type:
-          type: string
-          const: output_text
-          default: output_text
-        text:
-          type: string
-      additionalProperties: false
-      required:
-        - type
-        - text
-      title: OpenAIResponseContentPartOutputText
-    OpenAIResponseContentPartRefusal:
-      type: object
-      properties:
-        type:
-          type: string
-          const: refusal
-          default: refusal
-        refusal:
-          type: string
-      additionalProperties: false
-      required:
-        - type
-        - refusal
-      title: OpenAIResponseContentPartRefusal
    OpenAIResponseObjectStream:
      oneOf:
        - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated'
@ -6498,8 +6370,6 @@ components:
        - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress'
        - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed'
        - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted'
-        - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseContentPartAdded'
-        - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseContentPartDone'
        - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted'
      discriminator:
        propertyName: type
@ -6522,8 +6392,6 @@ components:
          response.mcp_call.in_progress: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress'
          response.mcp_call.failed: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed'
          response.mcp_call.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted'
-          response.content_part.added: '#/components/schemas/OpenAIResponseObjectStreamResponseContentPartAdded'
-          response.content_part.done: '#/components/schemas/OpenAIResponseObjectStreamResponseContentPartDone'
          response.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted'
    "OpenAIResponseObjectStreamResponseCompleted":
      type: object
@ -6545,76 +6413,6 @@ components:
        OpenAIResponseObjectStreamResponseCompleted
      description: >-
        Streaming event indicating a response has been completed.
-    "OpenAIResponseObjectStreamResponseContentPartAdded":
-      type: object
-      properties:
-        response_id:
-          type: string
-          description: >-
-            Unique identifier of the response containing this content
-        item_id:
-          type: string
-          description: >-
-            Unique identifier of the output item containing this content part
-        part:
-          $ref: '#/components/schemas/OpenAIResponseContentPart'
-          description: The content part that was added
-        sequence_number:
-          type: integer
-          description: >-
-            Sequential number for ordering streaming events
-        type:
-          type: string
-          const: response.content_part.added
-          default: response.content_part.added
-          description: >-
-            Event type identifier, always "response.content_part.added"
-      additionalProperties: false
-      required:
-        - response_id
-        - item_id
-        - part
-        - sequence_number
-        - type
-      title: >-
-        OpenAIResponseObjectStreamResponseContentPartAdded
-      description: >-
-        Streaming event for when a new content part is added to a response item.
-    "OpenAIResponseObjectStreamResponseContentPartDone":
-      type: object
-      properties:
-        response_id:
-          type: string
-          description: >-
-            Unique identifier of the response containing this content
-        item_id:
-          type: string
-          description: >-
-            Unique identifier of the output item containing this content part
-        part:
-          $ref: '#/components/schemas/OpenAIResponseContentPart'
-          description: The completed content part
-        sequence_number:
-          type: integer
-          description: >-
-            Sequential number for ordering streaming events
-        type:
-          type: string
-          const: response.content_part.done
-          default: response.content_part.done
-          description: >-
-            Event type identifier, always "response.content_part.done"
-      additionalProperties: false
-      required:
-        - response_id
-        - item_id
-        - part
-        - sequence_number
-        - type
-      title: >-
-        OpenAIResponseObjectStreamResponseContentPartDone
-      description: >-
-        Streaming event for when a content part is completed.
    "OpenAIResponseObjectStreamResponseCreated":
      type: object
      properties:
@ -10951,7 +10749,6 @@ components:
      type: string
      enum:
        - assistants
-        - batch
      title: OpenAIFilePurpose
      description: >-
        Valid purpose values for OpenAI Files API.
@ -11020,7 +10817,6 @@ components:
          type: string
          enum:
            - assistants
-            - batch
          description: The intended purpose of the file
      additionalProperties: false
      required:
@ -12363,96 +12159,6 @@ components:
      required:
        - benchmark_config
      title: RunEvalRequest
-    RunModerationRequest:
-      type: object
-      properties:
-        input:
-          oneOf:
-            - type: string
-            - type: array
-              items:
-                type: string
-          description: >-
-            Input (or inputs) to classify. Can be a single string, an array of strings,
-            or an array of multi-modal input objects similar to other models.
-        model:
-          type: string
-          description: >-
-            The content moderation model you would like to use.
-      additionalProperties: false
-      required:
-        - input
-        - model
-      title: RunModerationRequest
-    ModerationObject:
-      type: object
-      properties:
-        id:
-          type: string
-          description: >-
-            The unique identifier for the moderation request.
-        model:
-          type: string
-          description: >-
-            The model used to generate the moderation results.
-        results:
-          type: array
-          items:
-            $ref: '#/components/schemas/ModerationObjectResults'
-          description: A list of moderation objects
-      additionalProperties: false
-      required:
-        - id
-        - model
-        - results
-      title: ModerationObject
-      description: A moderation object.
-    ModerationObjectResults:
-      type: object
-      properties:
-        flagged:
-          type: boolean
-          description: >-
-            Whether any of the below categories are flagged.
-        categories:
-          type: object
-          additionalProperties:
-            type: boolean
-          description: >-
-            A list of the categories, and whether they are flagged or not.
-        category_applied_input_types:
-          type: object
-          additionalProperties:
-            type: array
-            items:
-              type: string
-          description: >-
-            A list of the categories along with the input type(s) that the score applies
-            to.
-        category_scores:
-          type: object
-          additionalProperties:
-            type: number
-          description: >-
-            A list of the categories along with their scores as predicted by model.
-        user_message:
-          type: string
-        metadata:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-      additionalProperties: false
-      required:
-        - flagged
-        - metadata
-      title: ModerationObjectResults
-      description: A moderation object.
    RunShieldRequest:
      type: object
      properties:
--- a/docs/source/apis/external.md
+++ b/docs/source/apis/external.md
@ -111,7 +111,7 @@ name = "llama-stack-api-weather"
 version = "0.1.0"
 description = "Weather API for Llama Stack"
 readme = "README.md"
-requires-python = ">=3.12"
+requires-python = ">=3.10"
 dependencies = ["llama-stack", "pydantic"]

 [build-system]
@ -231,7 +231,7 @@ name = "llama-stack-provider-kaze"
 version = "0.1.0"
 description = "Kaze weather provider for Llama Stack"
 readme = "README.md"
-requires-python = ">=3.12"
+requires-python = ">=3.10"
 dependencies = ["llama-stack", "pydantic", "aiohttp"]

 [build-system]
--- a/docs/source/building_applications/responses_vs_agents.md
+++ b/docs/source/building_applications/responses_vs_agents.md
@ -2,9 +2,7 @@

 Llama Stack (LLS) provides two different APIs for building AI applications with tool calling capabilities: the **Agents API** and the **OpenAI Responses API**. While both enable AI systems to use tools, and maintain full conversation history, they serve different use cases and have distinct characteristics.

-```{note}
-For simple and basic inferencing, you may want to use the [Chat Completions API](https://llama-stack.readthedocs.io/en/latest/providers/index.html#chat-completions) directly, before progressing to Agents or Responses API.
-```
+> **Note:** For simple and basic inferencing, you may want to use the [Chat Completions API](https://llama-stack.readthedocs.io/en/latest/providers/index.html#chat-completions) directly, before progressing to Agents or Responses API.

 ## Overview

--- a/docs/source/building_applications/tools.md
+++ b/docs/source/building_applications/tools.md
@ -76,9 +76,7 @@ Features:
 - Context retrieval with token limits


-```{note}
-By default, llama stack run.yaml defines toolgroups for web search, wolfram alpha and rag, that are provided by tavily-search, wolfram-alpha and rag providers.
-```
+> **Note:** By default, llama stack run.yaml defines toolgroups for web search, wolfram alpha and rag, that are provided by tavily-search, wolfram-alpha and rag providers.

 ## Model Context Protocol (MCP)

--- a/docs/source/concepts/apis.md
+++ b/docs/source/concepts/apis.md
@ -18,4 +18,3 @@ We are working on adding a few more APIs to complete the application lifecycle.
 - **Batch Inference**: run inference on a dataset of inputs
 - **Batch Agents**: run agents on a dataset of inputs
 - **Synthetic Data Generation**: generate synthetic data for model development
- **Batches**: OpenAI-compatible batch management for inference
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -131,7 +131,6 @@ html_static_path = ["../_static"]
 def setup(app):
    app.add_css_file("css/my_theme.css")
    app.add_js_file("js/detect_theme.js")
-    app.add_js_file("js/keyboard_shortcuts.js")

    def dockerhub_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
        url = f"https://hub.docker.com/r/llamastack/{text}"
--- a/docs/source/contributing/index.md
+++ b/docs/source/contributing/index.md
@ -2,38 +2,14 @@
 ```{include} ../../../CONTRIBUTING.md
 ```

-## Adding a New Provider
+See [Adding a New API Provider](new_api_provider.md), which describes how to add new API providers to the Stack.
+

-See:
- [Adding a New API Provider Page](new_api_provider.md) which describes how to add new API providers to the Stack.
- [Vector Database Page](new_vector_database.md) which describes how to add a new vector database to Llama Stack.
- [External Provider Page](../providers/external/index.md) which describes how to add external providers to the Stack.

 ```{toctree}
 :maxdepth: 1
 :hidden:

 new_api_provider
-new_vector_database
-```
-
-## Testing
-
-
-```{include} ../../../tests/README.md
-```
-
-## Advanced Topics
-
-For developers who need deeper understanding of the testing system internals:
-
-```{toctree}
-:maxdepth: 1
-
-testing/record-replay
-```
-
-### Benchmarking
-
-```{include} ../../../docs/source/distributions/k8s-benchmark/README.md
+testing
 ```
--- a/docs/source/contributing/new_vector_database.md
+++ b/docs/source/contributing/new_vector_database.md
@ -1,75 +0,0 @@
-# Adding a New Vector Database
-
-This guide will walk you through the process of adding a new vector database to Llama Stack.
-
-> **_NOTE:_** Here's an example Pull Request of the [Milvus Vector Database Provider](https://github.com/meta-llama/llama-stack/pull/1467).
-
-Vector Database providers are used to store and retrieve vector embeddings. Vector databases are not limited to vector
-search but can support keyword and hybrid search. Additionally, vector databases can also support operations like
-filtering, sorting, and aggregating vectors.
-
-## Steps to Add a New Vector Database Provider
-1. **Choose the Database Type**: Determine if your vector database is a remote service, inline, or both.
-   - Remote databases make requests to external services, while inline databases execute locally. Some providers support both.
-2. **Implement the Provider**: Create a new provider class that inherits from `VectorDatabaseProvider` and implements the required methods.
-   - Implement methods for vector storage, retrieval, search, and any additional features your database supports.
-     - You will need to implement the following methods for `YourVectorIndex`:
-        - `YourVectorIndex.create()`
-        - `YourVectorIndex.initialize()`
-        - `YourVectorIndex.add_chunks()`
-        - `YourVectorIndex.delete_chunk()`
-        - `YourVectorIndex.query_vector()`
-        - `YourVectorIndex.query_keyword()`
-        - `YourVectorIndex.query_hybrid()`
-     - You will need to implement the following methods for `YourVectorIOAdapter`:
-        - `YourVectorIOAdapter.initialize()`
-        - `YourVectorIOAdapter.shutdown()`
-        - `YourVectorIOAdapter.list_vector_dbs()`
-        - `YourVectorIOAdapter.register_vector_db()`
-        - `YourVectorIOAdapter.unregister_vector_db()`
-        - `YourVectorIOAdapter.insert_chunks()`
-        - `YourVectorIOAdapter.query_chunks()`
-        - `YourVectorIOAdapter.delete_chunks()`
-3. **Add to Registry**: Register your provider in the appropriate registry file.
-   - Update {repopath}`llama_stack/providers/registry/vector_io.py` to include your new provider.
-```python
-from llama_stack.providers.registry.specs import InlineProviderSpec
-from llama_stack.providers.registry.api import Api
-
-InlineProviderSpec(
-    api=Api.vector_io,
-    provider_type="inline::milvus",
-    pip_packages=["pymilvus>=2.4.10"],
-    module="llama_stack.providers.inline.vector_io.milvus",
-    config_class="llama_stack.providers.inline.vector_io.milvus.MilvusVectorIOConfig",
-    api_dependencies=[Api.inference],
-    optional_api_dependencies=[Api.files],
-    description="",
-),
-```
-4. **Add Tests**: Create unit tests and integration tests for your provider in the `tests/` directory.
-   - Unit Tests
-     - By following the structure of the class methods, you will be able to easily run unit and integration tests for your database.
-       1. You have to configure the tests for your provider in `/tests/unit/providers/vector_io/conftest.py`.
-       2. Update the `vector_provider` fixture to include your provider if they are an inline provider.
-       3. Create a `your_vectorprovider_index` fixture that initializes your vector index.
-       4. Create a `your_vectorprovider_adapter` fixture that initializes your vector adapter.
-       5. Add your provider to the `vector_io_providers` fixture dictionary.
-         - Please follow the naming convention of `your_vectorprovider_index` and `your_vectorprovider_adapter` as the tests require this to execute properly.
-   - Integration Tests
-     - Integration tests are located in {repopath}`tests/integration`. These tests use the python client-SDK APIs (from the `llama_stack_client` package) to test functionality.
-     - The two sets of integration tests are:
-       - `tests/integration/vector_io/test_vector_io.py`: This file tests registration, insertion, and retrieval.
-       - `tests/integration/vector_io/test_openai_vector_stores.py`: These tests are for OpenAI-compatible vector stores and test the OpenAI API compatibility.
-        - You will need to update `skip_if_provider_doesnt_support_openai_vector_stores` to include your provider as well as `skip_if_provider_doesnt_support_openai_vector_stores_search` to test the appropriate search functionality.
-     - Running the tests in the GitHub CI
-       - You will need to update the `.github/workflows/integration-vector-io-tests.yml` file to include your provider.
-        - If your provider is a remote provider, you will also have to add a container to spin up and run it in the action.
-   - Updating the pyproject.toml
-     - If you are adding tests for the `inline` provider you will have to update the `unit` group.
-       - `uv add new_pip_package --group unit`
-     - If you are adding tests for the `remote` provider you will have to update the `test` group, which is used in the GitHub CI for integration tests.
-       - `uv add new_pip_package --group test`
-5. **Update Documentation**: Please update the documentation for end users
-   - Generate the provider documentation by running {repopath}`./scripts/provider_codegen.py`.
-   - Update the autogenerated content in the registry/vector_io.py file with information about your provider. Please see other providers for examples.
--- a/docs/source/contributing/testing.md
+++ b/docs/source/contributing/testing.md
@ -0,0 +1,6 @@
+# Testing Llama Stack
+
+Tests are of three different kinds:
+- Unit tests
+- Provider focused integration tests
+- Client SDK tests
--- a/docs/source/contributing/testing/record-replay.md
+++ b/docs/source/contributing/testing/record-replay.md
@ -1,234 +0,0 @@
-# Record-Replay System
-
-Understanding how Llama Stack captures and replays API interactions for testing.
-
-## Overview
-
-The record-replay system solves a fundamental challenge in AI testing: how do you test against expensive, non-deterministic APIs without breaking the bank or dealing with flaky tests?
-
-The solution: intercept API calls, store real responses, and replay them later. This gives you real API behavior without the cost or variability.
-
-## How It Works
-
-### Request Hashing
-
-Every API request gets converted to a deterministic hash for lookup:
-
-```python
-def normalize_request(method: str, url: str, headers: dict, body: dict) -> str:
-    normalized = {
-        "method": method.upper(),
-        "endpoint": urlparse(url).path,  # Just the path, not full URL
-        "body": body,  # Request parameters
-    }
-    return hashlib.sha256(json.dumps(normalized, sort_keys=True).encode()).hexdigest()
-```
-
-**Key insight:** The hashing is intentionally precise. Different whitespace, float precision, or parameter order produces different hashes. This prevents subtle bugs from false cache hits.
-
-```python
-# These produce DIFFERENT hashes:
-{"content": "Hello world"}
-{"content": "Hello   world\n"}
-{"temperature": 0.7}
-{"temperature": 0.7000001}
-```
-
-### Client Interception
-
-The system patches OpenAI and Ollama client methods to intercept calls before they leave your application. This happens transparently - your test code doesn't change.
-
-### Storage Architecture
-
-Recordings use a two-tier storage system optimized for both speed and debuggability:
-
-```
-recordings/
-├── index.sqlite          # Fast lookup by request hash
-└── responses/
-    ├── abc123def456.json  # Individual response files
-    └── def789ghi012.json
-```
-
-**SQLite index** enables O(log n) hash lookups and metadata queries without loading response bodies.
-
-**JSON files** store complete request/response pairs in human-readable format for debugging.
-
-## Recording Modes
-
-### LIVE Mode
-
-Direct API calls with no recording or replay:
-
-```python
-with inference_recording(mode=InferenceMode.LIVE):
-    response = await client.chat.completions.create(...)
-```
-
-Use for initial development and debugging against real APIs.
-
-### RECORD Mode
-
-Captures API interactions while passing through real responses:
-
-```python
-with inference_recording(mode=InferenceMode.RECORD, storage_dir="./recordings"):
-    response = await client.chat.completions.create(...)
-    # Real API call made, response captured AND returned
-```
-
-The recording process:
-1. Request intercepted and hashed
-2. Real API call executed
-3. Response captured and serialized
-4. Recording stored to disk
-5. Original response returned to caller
-
-### REPLAY Mode
-
-Returns stored responses instead of making API calls:
-
-```python
-with inference_recording(mode=InferenceMode.REPLAY, storage_dir="./recordings"):
-    response = await client.chat.completions.create(...)
-    # No API call made, cached response returned instantly
-```
-
-The replay process:
-1. Request intercepted and hashed
-2. Hash looked up in SQLite index
-3. Response loaded from JSON file
-4. Response deserialized and returned
-5. Error if no recording found
-
-## Streaming Support
-
-Streaming APIs present a unique challenge: how do you capture an async generator?
-
-### The Problem
-
-```python
-# How do you record this?
-async for chunk in client.chat.completions.create(stream=True):
-    process(chunk)
-```
-
-### The Solution
-
-The system captures all chunks immediately before yielding any:
-
-```python
-async def handle_streaming_record(response):
-    # Capture complete stream first
-    chunks = []
-    async for chunk in response:
-        chunks.append(chunk)
-
-    # Store complete recording
-    storage.store_recording(
-        request_hash, request_data, {"body": chunks, "is_streaming": True}
-    )
-
-    # Return generator that replays captured chunks
-    async def replay_stream():
-        for chunk in chunks:
-            yield chunk
-
-    return replay_stream()
-```
-
-This ensures:
- **Complete capture** - The entire stream is saved atomically
- **Interface preservation** - The returned object behaves like the original API
- **Deterministic replay** - Same chunks in the same order every time
-
-## Serialization
-
-API responses contain complex Pydantic objects that need careful serialization:
-
-```python
-def _serialize_response(response):
-    if hasattr(response, "model_dump"):
-        # Preserve type information for proper deserialization
-        return {
-            "__type__": f"{response.__class__.__module__}.{response.__class__.__qualname__}",
-            "__data__": response.model_dump(mode="json"),
-        }
-    return response
-```
-
-This preserves type safety - when replayed, you get the same Pydantic objects with all their validation and methods.
-
-## Environment Integration
-
-### Environment Variables
-
-Control recording behavior globally:
-
-```bash
-export LLAMA_STACK_TEST_INFERENCE_MODE=replay
-export LLAMA_STACK_TEST_RECORDING_DIR=/path/to/recordings
-pytest tests/integration/
-```
-
-### Pytest Integration
-
-The system integrates automatically based on environment variables, requiring no changes to test code.
-
-## Debugging Recordings
-
-### Inspecting Storage
-
-```bash
-# See what's recorded
-sqlite3 recordings/index.sqlite "SELECT endpoint, model, timestamp FROM recordings LIMIT 10;"
-
-# View specific response
-cat recordings/responses/abc123def456.json | jq '.response.body'
-
-# Find recordings by endpoint
-sqlite3 recordings/index.sqlite "SELECT * FROM recordings WHERE endpoint='/v1/chat/completions';"
-```
-
-### Common Issues
-
-**Hash mismatches:** Request parameters changed slightly between record and replay
-```bash
-# Compare request details
-cat recordings/responses/abc123.json | jq '.request'
-```
-
-**Serialization errors:** Response types changed between versions
-```bash
-# Re-record with updated types
-rm recordings/responses/failing_hash.json
-LLAMA_STACK_TEST_INFERENCE_MODE=record pytest test_failing.py
-```
-
-**Missing recordings:** New test or changed parameters
-```bash
-# Record the missing interaction
-LLAMA_STACK_TEST_INFERENCE_MODE=record pytest test_new.py
-```
-
-## Design Decisions
-
-### Why Not Mocks?
-
-Traditional mocking breaks down with AI APIs because:
- Response structures are complex and evolve frequently
- Streaming behavior is hard to mock correctly
- Edge cases in real APIs get missed
- Mocks become brittle maintenance burdens
-
-### Why Precise Hashing?
-
-Loose hashing (normalizing whitespace, rounding floats) seems convenient but hides bugs. If a test changes slightly, you want to know about it rather than accidentally getting the wrong cached response.
-
-### Why JSON + SQLite?
-
- **JSON** - Human readable, diff-friendly, easy to inspect and modify
- **SQLite** - Fast indexed lookups without loading response bodies
- **Hybrid** - Best of both worlds for different use cases
-
-This system provides reliable, fast testing against real AI APIs while maintaining the ability to debug issues when they arise.
--- a/docs/source/distributions/building_distro.md
+++ b/docs/source/distributions/building_distro.md
@ -53,31 +53,24 @@ The main points to consider are:

 ```
 llama stack build -h
-usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--distro DISTRIBUTION] [--list-distros] [--image-type {container,venv}] [--image-name IMAGE_NAME] [--print-deps-only]
-                         [--run] [--providers PROVIDERS]
+usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--list-templates] [--image-type {container,venv}] [--image-name IMAGE_NAME] [--print-deps-only] [--run]

 Build a Llama stack container

 options:
  -h, --help            show this help message and exit
-  --config CONFIG       Path to a config file to use for the build. You can find example configs in llama_stack.cores/**/build.yaml. If this argument is not provided, you will be prompted to
-                        enter information interactively (default: None)
-  --template TEMPLATE   (deprecated) Name of the example template config to use for build. You may use `llama stack build --list-distros` to check out the available distributions (default:
-                        None)
-  --distro DISTRIBUTION, --distribution DISTRIBUTION
-                        Name of the distribution to use for build. You may use `llama stack build --list-distros` to check out the available distributions (default: None)
-  --list-distros, --list-distributions
-                        Show the available distributions for building a Llama Stack distribution (default: False)
+  --config CONFIG       Path to a config file to use for the build. You can find example configs in llama_stack.cores/**/build.yaml. If this argument is not provided, you will
+                        be prompted to enter information interactively (default: None)
+  --template TEMPLATE   Name of the example template config to use for build. You may use `llama stack build --list-templates` to check out the available templates (default: None)
+  --list-templates      Show the available templates for building a Llama Stack distribution (default: False)
  --image-type {container,venv}
                        Image Type to use for the build. If not specified, will use the image type from the template config. (default: None)
  --image-name IMAGE_NAME
-                        [for image-type=container|venv] Name of the virtual environment to use for the build. If not specified, currently active environment will be used if found. (default:
-                        None)
+                        [for image-type=container|venv] Name of the virtual environment to use for the build. If not specified, currently active environment will be used if
+                        found. (default: None)
  --print-deps-only     Print the dependencies for the stack only, without building the stack (default: False)
  --run                 Run the stack after building using the same image type, name, and other applicable arguments (default: False)
-  --providers PROVIDERS
-                        Build a config for a list of providers and only those providers. This list is formatted like: api1=provider1,api2=provider2. Where there can be multiple providers per
-                        API. (default: None)
+
 ```

 After this step is complete, a file named `<name>-build.yaml` and template file `<name>-run.yaml` will be generated and saved at the output file path specified at the end of the command.
--- a/docs/source/distributions/k8s-benchmark/README.md
+++ b/docs/source/distributions/k8s-benchmark/README.md
@ -1,156 +0,0 @@
-# Llama Stack Benchmark Suite on Kubernetes
-
-## Motivation
-
-Performance benchmarking is critical for understanding the overhead and characteristics of the Llama Stack abstraction layer compared to direct inference engines like vLLM.
-
-### Why This Benchmark Suite Exists
-
-**Performance Validation**: The Llama Stack provides a unified API layer across multiple inference providers, but this abstraction introduces potential overhead. This benchmark suite quantifies the performance impact by comparing:
- Llama Stack inference (with vLLM backend)
- Direct vLLM inference calls
- Both under identical Kubernetes deployment conditions
-
-**Production Readiness Assessment**: Real-world deployments require understanding performance characteristics under load. This suite simulates concurrent user scenarios with configurable parameters (duration, concurrency, request patterns) to validate production readiness.
-
-**Regression Detection (TODO)**: As the Llama Stack evolves, this benchmark is intended to provide automated regression detection for performance changes. Once that is in place, CI/CD pipelines will be able to use these benchmarks to catch performance degradations before production deployments.
-
-**Resource Planning**: By measuring throughput, latency percentiles, and resource utilization patterns, teams can make informed decisions about:
- Kubernetes resource allocation (CPU, memory, GPU)
- Auto-scaling configurations
- Cost optimization strategies
-
-### Key Metrics Captured
-
-The benchmark suite measures critical performance indicators:
- **Throughput**: Requests per second under sustained load
- **Latency Distribution**: P50, P90, P95, P99 response times
- **Time to First Token (TTFT)**: Critical for streaming applications
- **Error Rates**: Request failures and timeout analysis
-
-These metrics support data-driven architectural decisions and performance optimization efforts.
-
-## Setup
-
-**1. Deploy base k8s infrastructure:**
-```bash
-cd ../k8s
-./apply.sh
-```
-
-**2. Deploy benchmark components:**
-```bash
-cd ../k8s-benchmark
-./apply.sh
-```
-
-**3. Verify deployment:**
-```bash
-kubectl get pods
-# Should see: llama-stack-benchmark-server, vllm-server, etc.
-```
-
-## Quick Start
-
-### Basic Benchmarks
-
-**Benchmark Llama Stack (default):**
-```bash
-cd docs/source/distributions/k8s-benchmark/
-./run-benchmark.sh
-```
-
-**Benchmark vLLM direct:**
-```bash
-./run-benchmark.sh --target vllm
-```
-
-### Custom Configuration
-
-**Extended benchmark with high concurrency:**
-```bash
-./run-benchmark.sh --target vllm --duration 120 --concurrent 20
-```
-
-**Short test run:**
-```bash
-./run-benchmark.sh --target stack --duration 30 --concurrent 5
-```
-
-## Command Reference
-
-### run-benchmark.sh Options
-
-```bash
-./run-benchmark.sh [options]
-
-Options:
-  -t, --target <stack|vllm>     Target to benchmark (default: stack)
-  -d, --duration <seconds>      Duration in seconds (default: 60)
-  -c, --concurrent <users>      Number of concurrent users (default: 10)
-  -h, --help                    Show help message
-
-Examples:
-  ./run-benchmark.sh --target vllm              # Benchmark vLLM direct
-  ./run-benchmark.sh --target stack             # Benchmark Llama Stack
-  ./run-benchmark.sh -t vllm -d 120 -c 20       # vLLM with 120s, 20 users
-```
-
-## Local Testing
-
-### Running Benchmark Locally
-
-For local development without Kubernetes:
-
-**1. Start OpenAI mock server:**
-```bash
-uv run python openai-mock-server.py --port 8080
-```
-
-**2. Run benchmark against mock server:**
-```bash
-uv run python benchmark.py \
-  --base-url http://localhost:8080/v1 \
-  --model mock-inference \
-  --duration 30 \
-  --concurrent 5
-```
-
-**3. Test against local vLLM server:**
-```bash
-# If you have vLLM running locally on port 8000
-uv run python benchmark.py \
-  --base-url http://localhost:8000/v1 \
-  --model meta-llama/Llama-3.2-3B-Instruct \
-  --duration 30 \
-  --concurrent 5
-```
-
-**4. Profile the running server:**
-```bash
-./profile_running_server.sh
-```
-
-
-
-### OpenAI Mock Server
-
-The `openai-mock-server.py` provides:
- **OpenAI-compatible API** for testing without real models
- **Configurable streaming delay** via `STREAM_DELAY_SECONDS` env var
- **Consistent responses** for reproducible benchmarks
- **Lightweight testing** without GPU requirements
-
-**Mock server usage:**
-```bash
-uv run python openai-mock-server.py --port 8080
-```
-
-The mock server is also deployed in k8s as `openai-mock-service:8080` and can be used by changing the Llama Stack configuration to use the `mock-vllm-inference` provider.
-
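-## Files in this Directory
-
- `benchmark.py` - Core benchmark script with async streaming support
- `run-benchmark.sh` - Main script with target selection and configuration
- `openai-mock-server.py` - Mock OpenAI API server for local testing
- `apply.sh` - Deploys the benchmark components on top of the base k8s deployment
- `profile_running_server.sh` - Profiles a running Llama Stack server with py-spy
- `stack-configmap.yaml` - ConfigMap generated by `apply.sh` from `stack_run_config.yaml`
- `README.md` - This documentation file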
--- a/docs/source/distributions/k8s-benchmark/apply.sh
+++ b/docs/source/distributions/k8s-benchmark/apply.sh
@ -1,36 +0,0 @@
-#!/usr/bin/env bash
-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-# Deploys the benchmark-specific components on top of the base k8s deployment (../k8s/apply.sh).
-#
-# Steps: generate stack-configmap.yaml from stack_run_config.yaml, apply it, then
-# render stack-k8s.yaml.template with envsubst and apply the result.
-
-# Fail on errors, unset variables, and failed pipeline stages from the first command on.
-set -euo pipefail
-
-# The files below are referenced by bare name, so run from this script's own
-# directory; the script then behaves the same from any working directory.
-# NOTE(review): assumes stack-k8s.yaml.template lives next to this script - confirm.
-cd "$(dirname "${BASH_SOURCE[0]}")"
-
-# Per-word delay read by openai-mock-server.py when streaming.
-export STREAM_DELAY_SECONDS=0.005
-
-# Exported for envsubst; presumably referenced by stack-k8s.yaml.template (not verified here).
-export POSTGRES_USER=llamastack
-export POSTGRES_DB=llamastack
-export POSTGRES_PASSWORD=llamastack
-
-export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
-export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
-
-export MOCK_INFERENCE_MODEL=mock-inference
-
-export MOCK_INFERENCE_URL=openai-mock-service:8080
-
-export BENCHMARK_INFERENCE_MODEL=$INFERENCE_MODEL
-
-# Trace the deployment commands (enabled after the exports, so the exports
-# themselves are not echoed).
-set -x
-
-# Deploy benchmark-specific components
-kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
-  --dry-run=client -o yaml > stack-configmap.yaml
-
-kubectl apply --validate=false -f stack-configmap.yaml
-
-# Deploy our custom llama stack server (overriding the base one)
-envsubst < stack-k8s.yaml.template | kubectl apply --validate=false -f -
--- a/docs/source/distributions/k8s-benchmark/benchmark.py
+++ b/docs/source/distributions/k8s-benchmark/benchmark.py
@ -1,267 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""
-Simple benchmark script for Llama Stack with OpenAI API compatibility.
-"""
-
-import argparse
-import asyncio
-import os
-import random
-import statistics
-import time
-from typing import Tuple
-import aiohttp
-
-
-class BenchmarkStats:
-    """Accumulates per-request benchmark results and prints an aggregate report.
-
-    Results are recorded through ``add_result`` (serialized by an asyncio lock)
-    and reported by ``print_summary``. ``concurrent_users``, ``start_time`` and
-    ``end_time`` are plain attributes that the caller fills in.
-    """
-
-    def __init__(self):
-        self.response_times = []  # seconds, successful requests only
-        self.ttft_times = []  # seconds until the first streamed chunk, when observed
-        self.chunks_received = []  # chunk count per successful request
-        self.errors = []  # one message per failed request
-        self.success_count = 0
-        self.total_requests = 0
-        self.concurrent_users = 0
-        self.start_time = None
-        self.end_time = None
-        self._lock = asyncio.Lock()
-
-    async def add_result(self, response_time: float, chunks: int, ttft: float | None = None, error: str | None = None):
-        """Record the outcome of one request.
-
-        Args:
-            response_time: Total request duration in seconds.
-            chunks: Number of streamed chunks received.
-            ttft: Time to first chunk in seconds, or None if none arrived.
-            error: Error message; any truthy value marks the request as failed,
-                in which case only the message is stored.
-        """
-        async with self._lock:
-            self.total_requests += 1
-            if error:
-                self.errors.append(error)
-                return
-            self.success_count += 1
-            self.response_times.append(response_time)
-            self.chunks_received.append(chunks)
-            if ttft is not None:
-                self.ttft_times.append(ttft)
-
-    @staticmethod
-    def _percentile(sorted_values, p):
-        """Return the nearest-rank ``p``-th percentile of a non-empty ascending list."""
-        # Nearest rank is ceil(n * p / 100); computed with integer arithmetic.
-        # The previous int(n * p / 100) - 1 rounded down and so reported a lower
-        # sample than requested (e.g. the 9th of 10 values for P95).
-        rank = -(-len(sorted_values) * p // 100)
-        return sorted_values[max(0, min(rank - 1, len(sorted_values) - 1))]
-
-    def _print_timing_stats(self, heading, values, percentile_heading):
-        """Print mean/median/min/max/std-dev and percentiles for a list of durations."""
-        print(f"\n{heading}")
-        print(f"  Mean: {statistics.mean(values):.3f}s")
-        print(f"  Median: {statistics.median(values):.3f}s")
-        print(f"  Min: {min(values):.3f}s")
-        print(f"  Max: {max(values):.3f}s")
-
-        # statistics.stdev needs at least two data points
-        if len(values) > 1:
-            print(f"  Std Dev: {statistics.stdev(values):.3f}s")
-
-        ordered = sorted(values)
-        print(f"\n{percentile_heading}")
-        for p in (50, 90, 95, 99):
-            print(f"  P{p}: {self._percentile(ordered, p):.3f}s")
-
-    def print_summary(self):
-        """Print the benchmark report to stdout.
-
-        With no successful requests only the error count and the first five
-        errors are shown. Otherwise ``start_time`` and ``end_time`` must be set.
-        """
-        if not self.response_times:
-            print("No successful requests to report")
-            if self.errors:
-                print(f"Total errors: {len(self.errors)}")
-                print("First 5 errors:")
-                for error in self.errors[:5]:
-                    print(f"  {error}")
-            return
-
-        total_time = self.end_time - self.start_time
-        success_rate = (self.success_count / self.total_requests) * 100
-        # Guard against a zero-length run so the report never divides by zero
-        requests_per_second = self.success_count / total_time if total_time > 0 else 0.0
-
-        print(f"\n{'='*60}")
-        print("BENCHMARK RESULTS")
-        print(f"{'='*60}")
-        print(f"Total time: {total_time:.2f}s")
-        print(f"Concurrent users: {self.concurrent_users}")
-        print(f"Total requests: {self.total_requests}")
-        print(f"Successful requests: {self.success_count}")
-        print(f"Failed requests: {len(self.errors)}")
-        print(f"Success rate: {success_rate:.1f}%")
-        print(f"Requests per second: {requests_per_second:.2f}")
-
-        self._print_timing_stats("Response Time Statistics:", self.response_times, "Percentiles:")
-
-        if self.ttft_times:
-            self._print_timing_stats(
-                "Time to First Token (TTFT) Statistics:", self.ttft_times, "TTFT Percentiles:"
-            )
-
-        if self.chunks_received:
-            print("\nStreaming Statistics:")
-            print(f"  Mean chunks per response: {statistics.mean(self.chunks_received):.1f}")
-            print(f"  Total chunks received: {sum(self.chunks_received)}")
-
-        if self.errors:
-            print("\nErrors (showing first 5):")
-            for error in self.errors[:5]:
-                print(f"  {error}")
-
-
-class LlamaStackBenchmark:
-    """Load generator for an OpenAI-compatible streaming chat-completions endpoint.
-
-    Requests are POSTed to ``{base_url}/chat/completions`` with ``stream=True``,
-    using a conversation picked at random from ``test_messages``.
-    """
-
-    def __init__(self, base_url: str, model_id: str):
-        # Trailing slashes are stripped so the URL join below never yields "//".
-        self.base_url = base_url.rstrip('/')
-        self.model_id = model_id
-        self.headers = {"Content-Type": "application/json"}
-        # Prompt pool: four single-turn prompts and one multi-turn conversation.
-        self.test_messages = [
-            [{"role": "user", "content": "Hi"}],
-            [{"role": "user", "content": "What is the capital of France?"}],
-            [{"role": "user", "content": "Explain quantum physics in simple terms."}],
-            [{"role": "user", "content": "Write a short story about a robot learning to paint."}],
-            [
-                {"role": "user", "content": "What is machine learning?"},
-                {"role": "assistant", "content": "Machine learning is a subset of AI..."},
-                {"role": "user", "content": "Can you give me a practical example?"}
-            ]
-        ]
-
-
-    async def make_async_streaming_request(
-        self, session: aiohttp.ClientSession | None = None
-    ) -> Tuple[float, int, float | None, str | None]:
-        """Make a single async streaming chat completion request.
-
-        Args:
-            session: Session to send the request on. It is reused and left open.
-                When omitted, a temporary session is created and closed here.
-
-        Returns:
-            ``(response_time, chunks_received, ttft, error)``: total seconds,
-            number of ``data: `` lines seen, seconds until the first such line
-            (None if there was none), and an error message (None on success).
-        """
-        messages = random.choice(self.test_messages)
-        payload = {
-            "model": self.model_id,
-            "messages": messages,
-            "stream": True,
-            "max_tokens": 100
-        }
-
-        start_time = time.time()
-        chunks_received = 0
-        ttft = None
-        error = None
-
-        # Only a session created here is closed here; a caller-supplied one
-        # stays open so its connection pool can be reused across requests.
-        owns_session = session is None
-        if owns_session:
-            session = aiohttp.ClientSession()
-
-        try:
-            async with session.post(
-                f"{self.base_url}/chat/completions",
-                headers=self.headers,
-                json=payload,
-                timeout=aiohttp.ClientTimeout(total=30)
-            ) as response:
-                if response.status == 200:
-                    async for line in response.content:
-                        if line:
-                            line_str = line.decode('utf-8').strip()
-                            if line_str.startswith('data: '):
-                                chunks_received += 1
-                                if ttft is None:
-                                    ttft = time.time() - start_time
-                                # The [DONE] sentinel is counted as a chunk, then ends the read
-                                if line_str == 'data: [DONE]':
-                                    break
-
-                    if chunks_received == 0:
-                        error = "No streaming chunks received"
-                else:
-                    text = await response.text()
-                    error = f"HTTP {response.status}: {text[:100]}"
-
-        except Exception as e:
-            error = f"Request error: {str(e)}"
-        finally:
-            if owns_session:
-                await session.close()
-
-        response_time = time.time() - start_time
-        return response_time, chunks_received, ttft, error
-
-
-    async def run_benchmark(self, duration: int, concurrent_users: int) -> BenchmarkStats:
-        """Run benchmark using async requests for specified duration.
-
-        Starts ``concurrent_users`` workers that each send requests back to back,
-        lets them run for ``duration`` seconds, cancels them, and returns the
-        collected statistics.
-        """
-        stats = BenchmarkStats()
-        stats.concurrent_users = concurrent_users
-        stats.start_time = time.time()
-
-        print(f"Starting benchmark: {duration}s duration, {concurrent_users} concurrent users")
-        print(f"Target URL: {self.base_url}/chat/completions")
-        print(f"Model: {self.model_id}")
-
-        # One shared session: the connector caps open connections at the number
-        # of workers, and connections are reused between requests.
-        connector = aiohttp.TCPConnector(limit=concurrent_users)
-        async with aiohttp.ClientSession(connector=connector) as session:
-
-            async def worker(worker_id: int):
-                """Worker that sends requests sequentially until canceled."""
-                while True:
-                    try:
-                        response_time, chunks, ttft, error = await self.make_async_streaming_request(session)
-                        await stats.add_result(response_time, chunks, ttft, error)
-
-                    except asyncio.CancelledError:
-                        break
-                    except Exception as e:
-                        await stats.add_result(0, 0, None, f"Worker {worker_id} error: {str(e)}")
-
-            # Progress reporting task
-            async def progress_reporter():
-                last_report_time = time.time()
-                while True:
-                    try:
-                        await asyncio.sleep(1)  # Wake up once per second
-                        if time.time() >= last_report_time + 10:  # Report every 10 seconds
-                            elapsed = time.time() - stats.start_time
-                            print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s")
-                            last_report_time = time.time()
-                    except asyncio.CancelledError:
-                        break
-
-            # Spawn concurrent workers
-            tasks = [asyncio.create_task(worker(i)) for i in range(concurrent_users)]
-            progress_task = asyncio.create_task(progress_reporter())
-            tasks.append(progress_task)
-
-            # Wait for duration then cancel all tasks
-            await asyncio.sleep(duration)
-
-            for task in tasks:
-                task.cancel()
-
-            # Wait for all tasks to complete
-            await asyncio.gather(*tasks, return_exceptions=True)
-
-        stats.end_time = time.time()
-        return stats
-
-
-def main():
-    """Command-line entry point: parse options, run the benchmark, print the report.
-
-    Defaults for --base-url and --model come from the BENCHMARK_BASE_URL and
-    INFERENCE_MODEL environment variables. Failures are reported on stdout.
-    """
-    cli = argparse.ArgumentParser(description="Llama Stack Benchmark Tool")
-    cli.add_argument(
-        "--base-url",
-        default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
-        help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)",
-    )
-    cli.add_argument(
-        "--model",
-        default=os.getenv("INFERENCE_MODEL", "test-model"),
-        help="Model ID to use for requests",
-    )
-    cli.add_argument(
-        "--duration",
-        type=int,
-        default=60,
-        help="Duration in seconds to run benchmark (default: 60)",
-    )
-    cli.add_argument(
-        "--concurrent",
-        type=int,
-        default=10,
-        help="Number of concurrent users (default: 10)",
-    )
-    opts = cli.parse_args()
-
-    runner = LlamaStackBenchmark(opts.base_url, opts.model)
-
-    try:
-        results = asyncio.run(runner.run_benchmark(opts.duration, opts.concurrent))
-        results.print_summary()
-    except KeyboardInterrupt:
-        print("\nBenchmark interrupted by user")
-    except Exception as e:
-        print(f"Benchmark failed: {e}")
-
-
-if __name__ == "__main__":
-    main()
--- a/docs/source/distributions/k8s-benchmark/openai-mock-server.py
+++ b/docs/source/distributions/k8s-benchmark/openai-mock-server.py
@ -1,190 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""
-OpenAI-compatible mock server that returns:
- Hardcoded /models response for consistent validation
- Valid OpenAI-formatted chat completion responses with dynamic content
-"""
-
-from flask import Flask, request, jsonify, Response
-import time
-import random
-import uuid
-import json
-import argparse
-import os
-
-app = Flask(__name__)
-
-# Models from environment variables
-def get_models():
-    """Build the OpenAI-style model list served by this mock.
-
-    Model IDs are read from the comma-separated MOCK_MODELS environment
-    variable; blank entries are ignored. If nothing usable remains (for example
-    MOCK_MODELS is empty), the default model is used so the list is never
-    empty, since chat_completions reads the first entry as its default.
-
-    Returns:
-        dict with "object": "list" and a "data" list of model entries.
-    """
-    default_model = "meta-llama/Llama-3.2-3B-Instruct"
-    models_str = os.getenv("MOCK_MODELS", default_model)
-    model_ids = [m.strip() for m in models_str.split(",") if m.strip()] or [default_model]
-
-    return {
-        "object": "list",
-        "data": [
-            {
-                "id": model_id,
-                "object": "model",
-                "created": 1234567890,
-                "owned_by": "vllm"
-            }
-            for model_id in model_ids
-        ]
-    }
-
-def generate_random_text(length=50):
-    """Return `length` words drawn at random (with replacement) from a fixed
-    vocabulary, joined by single spaces."""
-    vocabulary = (
-        "Hello there I'm an AI assistant ready to help you "
-        "with your questions and tasks today Let me know what "
-        "you'd like to discuss or explore together I can assist "
-        "with various topics including coding writing analysis and more"
-    ).split()
-    picked = random.choices(vocabulary, k=length)
-    return " ".join(picked)
-
-@app.route('/v1/models', methods=['GET'])
-def list_models():
-    """Serve the configured model list as JSON, logging the advertised IDs."""
-    payload = get_models()
-    advertised_ids = [entry['id'] for entry in payload['data']]
-    print(f"[MOCK] Returning models: {advertised_ids}")
-    return jsonify(payload)
-
-@app.route('/v1/chat/completions', methods=['POST'])
-def chat_completions():
-    """Return OpenAI-formatted chat completion responses.
-
-    Reads `model` (default: first configured model), `messages` (default: [])
-    and `stream` (default: False) from the JSON body and dispatches to the
-    streaming or non-streaming handler. A body that is missing, unparseable,
-    or not a JSON object is answered with HTTP 400 instead of crashing.
-    """
-    # silent=True returns None on a bad or absent body rather than raising;
-    # the isinstance check also rejects valid JSON that is not an object.
-    data = request.get_json(silent=True)
-    if not isinstance(data, dict):
-        return jsonify({
-            "error": {
-                "message": "Request body must be a JSON object",
-                "type": "invalid_request_error"
-            }
-        }), 400
-
-    default_model = get_models()['data'][0]['id']
-    model = data.get('model', default_model)
-    messages = data.get('messages', [])
-    stream = data.get('stream', False)
-
-    print(f"[MOCK] Chat completion request - model: {model}, stream: {stream}")
-
-    if stream:
-        return handle_streaming_completion(model, messages)
-    else:
-        return handle_non_streaming_completion(model, messages)
-
-def handle_non_streaming_completion(model, messages):
-    """Build a complete (non-streamed) chat completion with random content.
-
-    The reply is 20-80 random words. Usage figures are whitespace-separated
-    word counts of the prompt contents and of the reply.
-    """
-    reply = generate_random_text(random.randint(20, 80))
-
-    # Word counts stand in for token counts
-    prompt_tokens = 0
-    for msg in messages:
-        prompt_tokens += len(str(msg.get('content', '')).split())
-    completion_tokens = len(reply.split())
-
-    assistant_choice = {
-        "index": 0,
-        "message": {
-            "role": "assistant",
-            "content": reply
-        },
-        "finish_reason": "stop"
-    }
-    usage = {
-        "prompt_tokens": prompt_tokens,
-        "completion_tokens": completion_tokens,
-        "total_tokens": prompt_tokens + completion_tokens
-    }
-
-    return jsonify({
-        "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
-        "object": "chat.completion",
-        "created": int(time.time()),
-        "model": model,
-        "choices": [assistant_choice],
-        "usage": usage
-    })
-
-def handle_streaming_completion(model, messages):
-    """Stream a mock chat completion as server-sent events.
-
-    The stream is: one chunk carrying the assistant role, one chunk per word of
-    a 30-100 word random reply, one final chunk with finish_reason "stop", then
-    the "data: [DONE]" sentinel. `messages` is accepted but not used.
-    """
-    # Parsed once per request: a malformed STREAM_DELAY_SECONDS now fails the
-    # request up front instead of aborting a half-sent stream.
-    stream_delay = float(os.getenv("STREAM_DELAY_SECONDS", "0.005"))
-
-    def generate_stream():
-        # Generate response text
-        full_response = generate_random_text(random.randint(30, 100))
-        words = full_response.split()
-
-        # Every chunk of one completion carries the same id and creation time,
-        # as in the OpenAI streaming format (previously each chunk got a new id).
-        completion_id = f"chatcmpl-{uuid.uuid4().hex[:8]}"
-        created = int(time.time())
-
-        def sse_chunk(delta, finish_reason=None):
-            """Format one chat.completion.chunk as an SSE data line."""
-            choice = {"index": 0, "delta": delta}
-            if finish_reason is not None:
-                choice["finish_reason"] = finish_reason
-            chunk = {
-                "id": completion_id,
-                "object": "chat.completion.chunk",
-                "created": created,
-                "model": model,
-                "choices": [choice]
-            }
-            return f"data: {json.dumps(chunk)}\n\n"
-
-        # Send initial chunk
-        yield sse_chunk({"role": "assistant", "content": ""})
-
-        # Send word by word; every word except the last keeps a trailing space
-        last_index = len(words) - 1
-        for i, word in enumerate(words):
-            yield sse_chunk({"content": f"{word} " if i < last_index else word})
-            # Configurable delay to simulate realistic streaming
-            time.sleep(stream_delay)
-
-        # Send final chunk
-        yield sse_chunk({"content": ""}, finish_reason="stop")
-        yield "data: [DONE]\n\n"
-
-    return Response(
-        generate_stream(),
-        mimetype='text/event-stream',
-        headers={
-            'Cache-Control': 'no-cache',
-            'Connection': 'keep-alive',
-            'Access-Control-Allow-Origin': '*',
-        }
-    )
-
-@app.route('/health', methods=['GET'])
-def health():
-    """Health endpoint: always reports this mock as healthy."""
-    status_payload = {"status": "healthy", "type": "openai-mock"}
-    return jsonify(status_payload)
-
-if __name__ == '__main__':
-    # Script entry point: read the port, print a startup banner, start serving.
-    cli = argparse.ArgumentParser(description='OpenAI-compatible mock server')
-    cli.add_argument(
-        '--port',
-        type=int,
-        default=8081,
-        help='Port to run the server on (default: 8081)',
-    )
-    listen_port = cli.parse_args().port
-
-    configured_ids = [m['id'] for m in get_models()['data']]
-    print("Starting OpenAI-compatible mock server...")
-    print(f"- /models endpoint with: {configured_ids}")
-    print("- OpenAI-formatted chat/completion responses with dynamic content")
-    print("- Streaming support with valid SSE format")
-    print(f"- Listening on: http://0.0.0.0:{listen_port}")
-    # Binds on all interfaces with Flask debug mode off
-    app.run(host='0.0.0.0', port=listen_port, debug=False)
--- a/docs/source/distributions/k8s-benchmark/profile_running_server.sh
+++ b/docs/source/distributions/k8s-benchmark/profile_running_server.sh
@ -1,52 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-# Script to profile an already running Llama Stack server
-# Usage: ./profile_running_server.sh [duration_seconds] [output_file]
-#
-# Finds the server process, then records a py-spy profile of it for the given
-# duration and writes it to <output_file>.svg.
-
-DURATION=${1:-60}  # Default 60 seconds
-OUTPUT_FILE=${2:-"llama_stack_profile"}  # Default output file
-
-echo "Looking for running Llama Stack server..."
-
-# Find the server PID (first process whose command line mentions the server module)
-SERVER_PID=$(ps aux | grep "llama_stack.core.server.server" | grep -v grep | awk '{print $2}' | head -1)
-
-
-if [ -z "$SERVER_PID" ]; then
-    echo "Error: No running Llama Stack server found"
-    echo "Please start your server first with:"
-    echo "LLAMA_STACK_LOGGING=\"all=ERROR\" MOCK_INFERENCE_URL=http://localhost:8080 SAFETY_MODEL=llama-guard3:1b uv run --with llama-stack python -m llama_stack.core.server.server docs/source/distributions/k8s-benchmark/stack_run_config.yaml"
-    exit 1
-fi
-
-echo "Found Llama Stack server with PID: $SERVER_PID"
-
-# Get the full path to py-spy, and stop with a clear message if it is missing;
-# otherwise the record command below would be run with an empty program name.
-PYSPY_PATH=$(command -v py-spy)
-if [ -z "$PYSPY_PATH" ]; then
-    echo "Error: py-spy not found on PATH"
-    echo "Install it first, e.g.: pip install py-spy"
-    exit 1
-fi
-
-# Start py-spy profiling
-echo "Starting py-spy profiling for ${DURATION} seconds..."
-echo "Output will be saved to: ${OUTPUT_FILE}.svg"
-echo ""
-echo "You can now run your load test..."
-echo ""
-
-# Check if running as root, if not, use sudo
-if [ "$EUID" -ne 0 ]; then
-    echo "py-spy requires root permissions on macOS. Running with sudo..."
-    sudo "$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d "${DURATION}" -p "$SERVER_PID"
-else
-    "$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d "${DURATION}" -p "$SERVER_PID"
-fi
-
-echo ""
-echo "Profiling completed! Results saved to: ${OUTPUT_FILE}.svg"
-echo ""
-echo "To view the flame graph:"
-echo "open ${OUTPUT_FILE}.svg"
--- a/docs/source/distributions/k8s-benchmark/run-benchmark.sh
+++ b/docs/source/distributions/k8s-benchmark/run-benchmark.sh
@ -1,148 +0,0 @@
-#!/usr/bin/env bash
-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-# Abort on errors, unset variables, and failures anywhere in a pipeline
-set -euo pipefail
-
-# Default values
-TARGET="stack"
-DURATION=60
-CONCURRENT=10
-
-# Parse command line arguments
-# usage: print the option summary and examples to stdout
-usage() {
-    cat << EOF
-Usage: $0 [options]
-Options:
-  -t, --target <stack|vllm>     Target to benchmark (default: stack)
-  -d, --duration <seconds>      Duration in seconds (default: 60)
-  -c, --concurrent <users>      Number of concurrent users (default: 10)
-  -h, --help                    Show this help message
-
-Examples:
-  $0 --target vllm              # Benchmark vLLM direct
-  $0 --target stack             # Benchmark Llama Stack (default)
-  $0 -t vllm -d 120 -c 20       # vLLM with 120s duration, 20 users
-EOF
-}
-
-# require_value: called with the remaining arguments; exits with a usage error
-# when the option in $1 is not followed by a value. Without this check a
-# trailing "-t" would hit "$2" under `set -u` and die with "unbound variable".
-require_value() {
-    if [[ $# -lt 2 ]]; then
-        echo "Error: $1 requires a value"
-        usage
-        exit 1
-    fi
-}
-
-while [[ $# -gt 0 ]]; do
-    case $1 in
-        -t|--target)
-            require_value "$@"
-            TARGET="$2"
-            shift 2
-            ;;
-        -d|--duration)
-            require_value "$@"
-            DURATION="$2"
-            shift 2
-            ;;
-        -c|--concurrent)
-            require_value "$@"
-            CONCURRENT="$2"
-            shift 2
-            ;;
-        -h|--help)
-            usage
-            exit 0
-            ;;
-        *)
-            echo "Unknown option: $1"
-            usage
-            exit 1
-            ;;
-    esac
-done
-
-# Validate target
-if [[ "$TARGET" != "stack" && "$TARGET" != "vllm" ]]; then
-    echo "Error: Target must be 'stack' or 'vllm'"
-    usage
-    exit 1
-fi
-
-# Validate numeric options: both are written verbatim into the Job manifest
-# and parsed by benchmark.py as integers, so reject anything else up front.
-for numeric_opt in "duration:$DURATION" "concurrent:$CONCURRENT"; do
-    if [[ ! "${numeric_opt#*:}" =~ ^[1-9][0-9]*$ ]]; then
-        echo "Error: --${numeric_opt%%:*} must be a positive integer"
-        usage
-        exit 1
-    fi
-done
-
-# Set configuration based on target
-case "$TARGET" in
-    vllm)
-        BASE_URL="http://vllm-server:8000/v1"
-        JOB_NAME="vllm-benchmark-job"
-        echo "Benchmarking vLLM direct..."
-        ;;
-    *)
-        BASE_URL="http://llama-stack-benchmark-service:8323/v1/openai/v1"
-        JOB_NAME="stack-benchmark-job"
-        echo "Benchmarking Llama Stack..."
-        ;;
-esac
-
-# Show the effective settings before anything is deployed
-printf '%s\n' \
-    "Configuration:" \
-    "  Target: $TARGET" \
-    "  Base URL: $BASE_URL" \
-    "  Duration: ${DURATION}s" \
-    "  Concurrent users: $CONCURRENT" \
-    ""
-
-# Create temporary job yaml
-# mktemp creates a uniquely named file, and the EXIT trap removes it even when
-# a kubectl step fails and `set -e` aborts the script before the end.
-TEMP_YAML="$(mktemp "${TMPDIR:-/tmp}/benchmark-job-temp.XXXXXX")"
-trap 'rm -f "$TEMP_YAML"' EXIT
-
-# Unquoted heredoc: $JOB_NAME, $BASE_URL, $DURATION and $CONCURRENT are expanded
-# now; \${INFERENCE_MODEL} is escaped so it is expanded inside the container.
-cat > "$TEMP_YAML" << EOF
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: $JOB_NAME
-  namespace: default
-spec:
-  template:
-    spec:
-      containers:
-      - name: benchmark
-        image: python:3.11-slim
-        command: ["/bin/bash"]
-        args:
-        - "-c"
-        - |
-          pip install aiohttp &&
-          python3 /benchmark/benchmark.py \\
-            --base-url $BASE_URL \\
-            --model \${INFERENCE_MODEL} \\
-            --duration $DURATION \\
-            --concurrent $CONCURRENT
-        env:
-        - name: INFERENCE_MODEL
-          value: "meta-llama/Llama-3.2-3B-Instruct"
-        volumeMounts:
-        - name: benchmark-script
-          mountPath: /benchmark
-        resources:
-          requests:
-            memory: "256Mi"
-            cpu: "250m"
-          limits:
-            memory: "512Mi"
-            cpu: "500m"
-      volumes:
-      - name: benchmark-script
-        configMap:
-          name: benchmark-script
-      restartPolicy: Never
-  backoffLimit: 3
-EOF
-
-echo "Creating benchmark ConfigMap..."
-kubectl create configmap benchmark-script \
-  --from-file=benchmark.py=benchmark.py \
-  --dry-run=client -o yaml | kubectl apply -f -
-
-echo "Cleaning up any existing benchmark job..."
-kubectl delete job "$JOB_NAME" 2>/dev/null || true
-
-echo "Deploying benchmark Job..."
-kubectl apply -f "$TEMP_YAML"
-
-echo "Waiting for job to start..."
-kubectl wait --for=condition=Ready pod -l "job-name=$JOB_NAME" --timeout=60s
-
-echo "Following benchmark logs..."
-kubectl logs -f "job/$JOB_NAME"
-
-echo "Job completed. Checking final status..."
-kubectl get job "$JOB_NAME"
-
-# The temporary manifest is removed by the EXIT trap set above.
--- a/docs/source/distributions/k8s-benchmark/stack-configmap.yaml
+++ b/docs/source/distributions/k8s-benchmark/stack-configmap.yaml
@ -1,133 +0,0 @@
-apiVersion: v1
-data:
-  stack_run_config.yaml: |
-    version: '2'
-    image_name: kubernetes-benchmark-demo
-    apis:
-    - agents
-    - inference
-    - safety
-    - telemetry
-    - tool_runtime
-    - vector_io
-    providers:
-      inference:
-      - provider_id: vllm-inference
-        provider_type: remote::vllm
-        config:
-          url: ${env.VLLM_URL:=http://localhost:8000/v1}
-          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
-          api_token: ${env.VLLM_API_TOKEN:=fake}
-          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
-      - provider_id: vllm-safety
-        provider_type: remote::vllm
-        config:
-          url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
-          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
-          api_token: ${env.VLLM_API_TOKEN:=fake}
-          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
-      - provider_id: sentence-transformers
-        provider_type: inline::sentence-transformers
-        config: {}
-      vector_io:
-      - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
-        provider_type: remote::chromadb
-        config:
-          url: ${env.CHROMADB_URL:=}
-          kvstore:
-            type: postgres
-            host: ${env.POSTGRES_HOST:=localhost}
-            port: ${env.POSTGRES_PORT:=5432}
-            db: ${env.POSTGRES_DB:=llamastack}
-            user: ${env.POSTGRES_USER:=llamastack}
-            password: ${env.POSTGRES_PASSWORD:=llamastack}
-      safety:
-      - provider_id: llama-guard
-        provider_type: inline::llama-guard
-        config:
-          excluded_categories: []
-      agents:
-      - provider_id: meta-reference
-        provider_type: inline::meta-reference
-        config:
-          persistence_store:
-            type: postgres
-            host: ${env.POSTGRES_HOST:=localhost}
-            port: ${env.POSTGRES_PORT:=5432}
-            db: ${env.POSTGRES_DB:=llamastack}
-            user: ${env.POSTGRES_USER:=llamastack}
-            password: ${env.POSTGRES_PASSWORD:=llamastack}
-          responses_store:
-            type: postgres
-            host: ${env.POSTGRES_HOST:=localhost}
-            port: ${env.POSTGRES_PORT:=5432}
-            db: ${env.POSTGRES_DB:=llamastack}
-            user: ${env.POSTGRES_USER:=llamastack}
-            password: ${env.POSTGRES_PASSWORD:=llamastack}
-      telemetry:
-      - provider_id: meta-reference
-        provider_type: inline::meta-reference
-        config:
-          service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-          sinks: ${env.TELEMETRY_SINKS:=console}
-      tool_runtime:
-      - provider_id: brave-search
-        provider_type: remote::brave-search
-        config:
-          api_key: ${env.BRAVE_SEARCH_API_KEY:+}
-          max_results: 3
-      - provider_id: tavily-search
-        provider_type: remote::tavily-search
-        config:
-          api_key: ${env.TAVILY_SEARCH_API_KEY:+}
-          max_results: 3
-      - provider_id: rag-runtime
-        provider_type: inline::rag-runtime
-        config: {}
-      - provider_id: model-context-protocol
-        provider_type: remote::model-context-protocol
-        config: {}
-    metadata_store:
-      type: postgres
-      host: ${env.POSTGRES_HOST:=localhost}
-      port: ${env.POSTGRES_PORT:=5432}
-      db: ${env.POSTGRES_DB:=llamastack}
-      user: ${env.POSTGRES_USER:=llamastack}
-      password: ${env.POSTGRES_PASSWORD:=llamastack}
-      table_name: llamastack_kvstore
-    inference_store:
-      type: postgres
-      host: ${env.POSTGRES_HOST:=localhost}
-      port: ${env.POSTGRES_PORT:=5432}
-      db: ${env.POSTGRES_DB:=llamastack}
-      user: ${env.POSTGRES_USER:=llamastack}
-      password: ${env.POSTGRES_PASSWORD:=llamastack}
-    models:
-    - metadata:
-        embedding_dimension: 384
-      model_id: all-MiniLM-L6-v2
-      provider_id: sentence-transformers
-      model_type: embedding
-    - model_id: ${env.INFERENCE_MODEL}
-      provider_id: vllm-inference
-      model_type: llm
-    - model_id: ${env.SAFETY_MODEL}
-      provider_id: vllm-safety
-      model_type: llm
-    shields:
-    - shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
-    vector_dbs: []
-    datasets: []
-    scoring_fns: []
-    benchmarks: []
-    tool_groups:
-    - toolgroup_id: builtin::websearch
-      provider_id: tavily-search
-    - toolgroup_id: builtin::rag
-      provider_id: rag-runtime
-    server:
-      port: 8323
-kind: ConfigMap
-metadata:
-  creationTimestamp: null
-  name: llama-stack-config
--- a/docs/source/distributions/k8s-benchmark/stack-k8s.yaml.template
+++ b/docs/source/distributions/k8s-benchmark/stack-k8s.yaml.template
@ -1,83 +0,0 @@
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: llama-benchmark-pvc
-spec:
-  accessModes:
-    - ReadWriteOnce
-  resources:
-    requests:
-      storage: 1Gi
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llama-stack-benchmark-server
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app.kubernetes.io/name: llama-stack-benchmark
-      app.kubernetes.io/component: server
-  template:
-    metadata:
-      labels:
-        app.kubernetes.io/name: llama-stack-benchmark
-        app.kubernetes.io/component: server
-    spec:
-      containers:
-      - name: llama-stack-benchmark
-        image: llamastack/distribution-starter:latest
-        imagePullPolicy: Always # since we have specified latest instead of a version
-        env:
-        - name: ENABLE_CHROMADB
-          value: "true"
-        - name: CHROMADB_URL
-          value: http://chromadb.default.svc.cluster.local:6000
-        - name: POSTGRES_HOST
-          value: postgres-server.default.svc.cluster.local
-        - name: POSTGRES_PORT
-          value: "5432"
-        - name: INFERENCE_MODEL
-          value: "${INFERENCE_MODEL}"
-        - name: SAFETY_MODEL
-          value: "${SAFETY_MODEL}"
-        - name: TAVILY_SEARCH_API_KEY
-          value: "${TAVILY_SEARCH_API_KEY}"
-        - name: VLLM_URL
-          value: http://vllm-server.default.svc.cluster.local:8000/v1
-        - name: VLLM_MAX_TOKENS
-          value: "3072"
-        - name: VLLM_SAFETY_URL
-          value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
-        - name: VLLM_TLS_VERIFY
-          value: "false"
-        command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8323"]
-        ports:
-          - containerPort: 8323
-        volumeMounts:
-          - name: llama-storage
-            mountPath: /root/.llama
-          - name: llama-config
-            mountPath: /etc/config
-      volumes:
-      - name: llama-storage
-        persistentVolumeClaim:
-          claimName: llama-benchmark-pvc
-      - name: llama-config
-        configMap:
-          name: llama-stack-config
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: llama-stack-benchmark-service
-spec:
-  selector:
-    app.kubernetes.io/name: llama-stack-benchmark
-    app.kubernetes.io/component: server
-  ports:
-  - name: http
-    port: 8323
-    targetPort: 8323
-  type: ClusterIP
--- a/docs/source/distributions/k8s-benchmark/stack_run_config.yaml
+++ b/docs/source/distributions/k8s-benchmark/stack_run_config.yaml
@ -1,108 +0,0 @@
-version: '2'
-image_name: kubernetes-benchmark-demo
-apis:
-- agents
-- inference
-- telemetry
-- tool_runtime
-- vector_io
-providers:
-  inference:
-  - provider_id: vllm-inference
-    provider_type: remote::vllm
-    config:
-      url: ${env.VLLM_URL:=http://localhost:8000/v1}
-      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
-      api_token: ${env.VLLM_API_TOKEN:=fake}
-      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
-  - provider_id: sentence-transformers
-    provider_type: inline::sentence-transformers
-    config: {}
-  vector_io:
-  - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
-    provider_type: remote::chromadb
-    config:
-      url: ${env.CHROMADB_URL:=}
-      kvstore:
-        type: postgres
-        host: ${env.POSTGRES_HOST:=localhost}
-        port: ${env.POSTGRES_PORT:=5432}
-        db: ${env.POSTGRES_DB:=llamastack}
-        user: ${env.POSTGRES_USER:=llamastack}
-        password: ${env.POSTGRES_PASSWORD:=llamastack}
-  agents:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        type: postgres
-        host: ${env.POSTGRES_HOST:=localhost}
-        port: ${env.POSTGRES_PORT:=5432}
-        db: ${env.POSTGRES_DB:=llamastack}
-        user: ${env.POSTGRES_USER:=llamastack}
-        password: ${env.POSTGRES_PASSWORD:=llamastack}
-      responses_store:
-        type: postgres
-        host: ${env.POSTGRES_HOST:=localhost}
-        port: ${env.POSTGRES_PORT:=5432}
-        db: ${env.POSTGRES_DB:=llamastack}
-        user: ${env.POSTGRES_USER:=llamastack}
-        password: ${env.POSTGRES_PASSWORD:=llamastack}
-  telemetry:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=console}
-  tool_runtime:
-  - provider_id: brave-search
-    provider_type: remote::brave-search
-    config:
-      api_key: ${env.BRAVE_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: tavily-search
-    provider_type: remote::tavily-search
-    config:
-      api_key: ${env.TAVILY_SEARCH_API_KEY:+}
-      max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
-    config: {}
-  - provider_id: model-context-protocol
-    provider_type: remote::model-context-protocol
-    config: {}
-metadata_store:
-  type: postgres
-  host: ${env.POSTGRES_HOST:=localhost}
-  port: ${env.POSTGRES_PORT:=5432}
-  db: ${env.POSTGRES_DB:=llamastack}
-  user: ${env.POSTGRES_USER:=llamastack}
-  password: ${env.POSTGRES_PASSWORD:=llamastack}
-  table_name: llamastack_kvstore
-inference_store:
-  type: postgres
-  host: ${env.POSTGRES_HOST:=localhost}
-  port: ${env.POSTGRES_PORT:=5432}
-  db: ${env.POSTGRES_DB:=llamastack}
-  user: ${env.POSTGRES_USER:=llamastack}
-  password: ${env.POSTGRES_PASSWORD:=llamastack}
-models:
-- metadata:
-    embedding_dimension: 384
-  model_id: all-MiniLM-L6-v2
-  provider_id: sentence-transformers
-  model_type: embedding
-- model_id: ${env.INFERENCE_MODEL}
-  provider_id: vllm-inference
-  model_type: llm
-vector_dbs: []
-datasets: []
-scoring_fns: []
-benchmarks: []
-tool_groups:
-- toolgroup_id: builtin::websearch
-  provider_id: tavily-search
-- toolgroup_id: builtin::rag
-  provider_id: rag-runtime
-server:
-  port: 8323
--- a/docs/source/distributions/k8s/stack-k8s.yaml.template
+++ b/docs/source/distributions/k8s/stack-k8s.yaml.template
@ -40,19 +40,19 @@ spec:
          value: "3072"
        - name: VLLM_SAFETY_URL
          value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
-        - name: VLLM_TLS_VERIFY
-          value: "false"
        - name: POSTGRES_HOST
          value: postgres-server.default.svc.cluster.local
        - name: POSTGRES_PORT
          value: "5432"
+        - name: VLLM_TLS_VERIFY
+          value: "false"
        - name: INFERENCE_MODEL
          value: "${INFERENCE_MODEL}"
        - name: SAFETY_MODEL
          value: "${SAFETY_MODEL}"
        - name: TAVILY_SEARCH_API_KEY
          value: "${TAVILY_SEARCH_API_KEY}"
-        command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8321"]
+        command: ["python", "-m", "llama_stack.core.server.server", "--config", "/etc/config/stack_run_config.yaml", "--port", "8321"]
        ports:
          - containerPort: 8321
        volumeMounts:
--- a/docs/source/distributions/ondevice_distro/android_sdk.md
+++ b/docs/source/distributions/ondevice_distro/android_sdk.md
@ -56,12 +56,12 @@ Breaking down the demo app, this section will show the core pieces that are used
 ### Setup Remote Inferencing
 Start a Llama Stack server on localhost. Here is an example of how you can do this using the firework.ai distribution:
 ```
-uv venv starter --python 3.12
-source starter/bin/activate  # On Windows: starter\Scripts\activate
+python -m venv stack-fireworks
+source stack-fireworks/bin/activate  # On Windows: stack-fireworks\Scripts\activate
 pip install --no-cache llama-stack==0.2.2
-llama stack build --distro starter --image-type venv
+llama stack build --distro fireworks --image-type venv
 export FIREWORKS_API_KEY=<SOME_KEY>
-llama stack run starter --port 5050
+llama stack run fireworks --port 5050
 ```

 Ensure the Llama Stack server version is the same as the Kotlin SDK Library for maximum compatibility.
--- a/docs/source/distributions/self_hosted_distro/nvidia.md
+++ b/docs/source/distributions/self_hosted_distro/nvidia.md
@ -157,7 +157,7 @@ docker run \
 If you've set up your local development environment, you can also build the image using your local virtual environment.

 ```bash
-INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
+INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
 llama stack build --distro nvidia --image-type venv
 llama stack run ./run.yaml \
  --port 8321 \
--- a/docs/source/getting_started/demo_script.py
+++ b/docs/source/getting_started/demo_script.py
@ -52,16 +52,11 @@ agent = Agent(
 prompt = "How do you do great work?"
 print("prompt>", prompt)

-use_stream = True
 response = agent.create_turn(
    messages=[{"role": "user", "content": prompt}],
    session_id=agent.create_session("rag_session"),
-    stream=use_stream,
+    stream=True,
 )

-# Only call `AgentEventLogger().log(response)` for streaming responses.
-if use_stream:
-    for log in AgentEventLogger().log(response):
-        log.print()
-else:
-    print(response)
+for log in AgentEventLogger().log(response):
+    log.print()
--- a/docs/source/getting_started/detailed_tutorial.md
+++ b/docs/source/getting_started/detailed_tutorial.md
@ -150,7 +150,13 @@ pip install llama-stack-client
 ```
 :::

-
+:::{tab-item} Install with `venv`
+```bash
+python -m venv stack-client
+source stack-client/bin/activate  # On Windows: stack-client\Scripts\activate
+pip install llama-stack-client
+```
+:::
 ::::

 Now let's use the `llama-stack-client` [CLI](../references/llama_stack_client_cli_reference.md) to check the
--- a/docs/source/providers/agents/index.md
+++ b/docs/source/providers/agents/index.md
@ -1,16 +1,7 @@
-# Agents
+# Agents 

 ## Overview

-Agents API for creating and interacting with agentic systems.
-
-    Main functionalities provided by this API:
-    - Create agents with specific instructions and ability to use tools.
-    - Interactions with agents are grouped into sessions ("threads"), and each interaction is called a "turn".
-    - Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).
-    - Agents can be provided with various shields (see the Safety API for more details).
-    - Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details.
-
 This section contains documentation for all available providers for the **agents** API.

 ## Providers
--- a/docs/source/providers/batches/index.md
+++ b/docs/source/providers/batches/index.md
@ -1,21 +0,0 @@
-# Batches
-
-## Overview
-
-Protocol for batch processing API operations.
-
-    The Batches API enables efficient processing of multiple requests in a single operation,
-    particularly useful for processing large datasets, batch evaluation workflows, and
-    cost-effective inference at scale.
-
-    Note: This API is currently under active development and may undergo changes.
-
-This section contains documentation for all available providers for the **batches** API.
-
-## Providers
-
-```{toctree}
-:maxdepth: 1
-
-inline_reference
-```
--- a/docs/source/providers/batches/inline_reference.md
+++ b/docs/source/providers/batches/inline_reference.md
@ -1,23 +0,0 @@
-# inline::reference
-
-## Description
-
-Reference implementation of batches API with KVStore persistence.
-
-## Configuration
-
-| Field | Type | Required | Default | Description |
-|-------|------|----------|---------|-------------|
-| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Configuration for the key-value store backend. |
-| `max_concurrent_batches` | `<class 'int'>` | No | 1 | Maximum number of concurrent batches to process simultaneously. |
-| `max_concurrent_requests_per_batch` | `<class 'int'>` | No | 10 | Maximum number of concurrent requests to process per batch. |
-
-## Sample Configuration
-
-```yaml
-kvstore:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/batches.db
-
-```
-
--- a/docs/source/providers/datasetio/index.md
+++ b/docs/source/providers/datasetio/index.md
@ -1,4 +1,4 @@
-# Datasetio
+# Datasetio 

 ## Overview

--- a/docs/source/providers/eval/index.md
+++ b/docs/source/providers/eval/index.md
@ -1,9 +1,7 @@
-# Eval
+# Eval 

 ## Overview

-Llama Stack Evaluation API for running evaluations on model and agent candidates.
-
 This section contains documentation for all available providers for the **eval** API.

 ## Providers
--- a/docs/source/providers/external/external-providers-guide.md
+++ b/docs/source/providers/external/external-providers-guide.md
@ -226,7 +226,7 @@ uv init
 name = "llama-stack-provider-ollama"
 version = "0.1.0"
 description = "Ollama provider for Llama Stack"
-requires-python = ">=3.12"
+requires-python = ">=3.10"
 dependencies = ["llama-stack", "pydantic", "ollama", "aiohttp"]
 ```

--- a/docs/source/providers/files/index.md
+++ b/docs/source/providers/files/index.md
@ -1,4 +1,4 @@
-# Files
+# Files 

 ## Overview

--- a/docs/source/providers/files/inline_localfs.md
+++ b/docs/source/providers/files/inline_localfs.md
@ -8,7 +8,7 @@ Local filesystem-based file storage provider for managing files and documents lo

 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `storage_dir` | `<class 'str'>` | No |  | Directory to store uploaded files |
+| `storage_dir` | `<class 'str'>` | No | PydanticUndefined | Directory to store uploaded files |
 | `metadata_store` | `utils.sqlstore.sqlstore.SqliteSqlStoreConfig \| utils.sqlstore.sqlstore.PostgresSqlStoreConfig` | No | sqlite | SQL store configuration for file metadata |
 | `ttl_secs` | `<class 'int'>` | No | 31536000 |  |

--- a/docs/source/providers/inference/index.md
+++ b/docs/source/providers/inference/index.md
@ -1,13 +1,7 @@
-# Inference
+# Inference 

 ## Overview

-Llama Stack Inference API for generating completions, chat completions, and embeddings.
-
-    This API provides the raw interface to the underlying models. Two kinds of models are supported:
-    - LLM models: these models generate "raw" and "chat" (conversational) completions.
-    - Embedding models: these models generate embeddings to be used for semantic search.
-
 This section contains documentation for all available providers for the **inference** API.

 ## Providers
@ -35,7 +29,6 @@ remote_runpod
 remote_sambanova
 remote_tgi
 remote_together
-remote_vertexai
 remote_vllm
 remote_watsonx
 ```
--- a/docs/source/providers/inference/remote_hf_endpoint.md
+++ b/docs/source/providers/inference/remote_hf_endpoint.md
@ -8,7 +8,7 @@ HuggingFace Inference Endpoints provider for dedicated model serving.

 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `endpoint_name` | `<class 'str'>` | No |  | The name of the Hugging Face Inference Endpoint in the format of '{namespace}/{endpoint_name}' (e.g. 'my-cool-org/meta-llama-3-1-8b-instruct-rce'). Namespace is optional and will default to the user account if not provided. |
+| `endpoint_name` | `<class 'str'>` | No | PydanticUndefined | The name of the Hugging Face Inference Endpoint in the format of '{namespace}/{endpoint_name}' (e.g. 'my-cool-org/meta-llama-3-1-8b-instruct-rce'). Namespace is optional and will default to the user account if not provided. |
 | `api_token` | `pydantic.types.SecretStr \| None` | No |  | Your Hugging Face user access token (will default to locally saved token if not provided) |

 ## Sample Configuration
--- a/docs/source/providers/inference/remote_hf_serverless.md
+++ b/docs/source/providers/inference/remote_hf_serverless.md
@ -8,7 +8,7 @@ HuggingFace Inference API serverless provider for on-demand model inference.

 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `huggingface_repo` | `<class 'str'>` | No |  | The model ID of the model on the Hugging Face Hub (e.g. 'meta-llama/Meta-Llama-3.1-70B-Instruct') |
+| `huggingface_repo` | `<class 'str'>` | No | PydanticUndefined | The model ID of the model on the Hugging Face Hub (e.g. 'meta-llama/Meta-Llama-3.1-70B-Instruct') |
 | `api_token` | `pydantic.types.SecretStr \| None` | No |  | Your Hugging Face user access token (will default to locally saved token if not provided) |

 ## Sample Configuration
--- a/docs/source/providers/inference/remote_tgi.md
+++ b/docs/source/providers/inference/remote_tgi.md
@ -8,7 +8,7 @@ Text Generation Inference (TGI) provider for HuggingFace model serving.

 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `url` | `<class 'str'>` | No |  | The URL for the TGI serving endpoint |
+| `url` | `<class 'str'>` | No | PydanticUndefined | The URL for the TGI serving endpoint |

 ## Sample Configuration

--- a/docs/source/providers/inference/remote_vertexai.md
+++ b/docs/source/providers/inference/remote_vertexai.md
@ -1,40 +0,0 @@
-# remote::vertexai
-
-## Description
-
-Google Vertex AI inference provider enables you to use Google's Gemini models through Google Cloud's Vertex AI platform, providing several advantages:
-
-• Enterprise-grade security: Uses Google Cloud's security controls and IAM
-• Better integration: Seamless integration with other Google Cloud services
-• Advanced features: Access to additional Vertex AI features like model tuning and monitoring
-• Authentication: Uses Google Cloud Application Default Credentials (ADC) instead of API keys
-
-Configuration:
-- Set VERTEX_AI_PROJECT environment variable (required)
-- Set VERTEX_AI_LOCATION environment variable (optional, defaults to us-central1)
-- Use Google Cloud Application Default Credentials or service account key
-
-Authentication Setup:
-Option 1 (Recommended): gcloud auth application-default login
-Option 2: Set GOOGLE_APPLICATION_CREDENTIALS to service account key path
-
-Available Models:
-- vertex_ai/gemini-2.0-flash
-- vertex_ai/gemini-2.5-flash
-- vertex_ai/gemini-2.5-pro
-
-## Configuration
-
-| Field | Type | Required | Default | Description |
-|-------|------|----------|---------|-------------|
-| `project` | `<class 'str'>` | No |  | Google Cloud project ID for Vertex AI |
-| `location` | `<class 'str'>` | No | us-central1 | Google Cloud location for Vertex AI |
-
-## Sample Configuration
-
-```yaml
-project: ${env.VERTEX_AI_PROJECT:=}
-location: ${env.VERTEX_AI_LOCATION:=us-central1}
-
-```
-
--- a/docs/source/providers/post_training/index.md
+++ b/docs/source/providers/post_training/index.md
@ -1,4 +1,4 @@
-# Post_Training
+# Post_Training 

 ## Overview

--- a/docs/source/providers/post_training/inline_huggingface.md
+++ b/docs/source/providers/post_training/inline_huggingface.md
@ -27,7 +27,7 @@ HuggingFace-based post-training provider for fine-tuning models using the Huggin
 | `dpo_beta` | `<class 'float'>` | No | 0.1 |  |
 | `use_reference_model` | `<class 'bool'>` | No | True |  |
 | `dpo_loss_type` | `Literal['sigmoid', 'hinge', 'ipo', 'kto_pair'` | No | sigmoid |  |
-| `dpo_output_dir` | `<class 'str'>` | No |  |  |
+| `dpo_output_dir` | `<class 'str'>` | No | ./checkpoints/dpo |  |

 ## Sample Configuration

@ -35,7 +35,6 @@ HuggingFace-based post-training provider for fine-tuning models using the Huggin
 checkpoint_format: huggingface
 distributed_backend: null
 device: cpu
-dpo_output_dir: ~/.llama/dummy/dpo_output

 ```

--- a/docs/source/providers/safety/index.md
+++ b/docs/source/providers/safety/index.md
@ -1,4 +1,4 @@
-# Safety
+# Safety 

 ## Overview

--- a/docs/source/providers/scoring/index.md
+++ b/docs/source/providers/scoring/index.md
@ -1,4 +1,4 @@
-# Scoring
+# Scoring 

 ## Overview

--- a/docs/source/providers/telemetry/index.md
+++ b/docs/source/providers/telemetry/index.md
@ -1,4 +1,4 @@
-# Telemetry
+# Telemetry 

 ## Overview

--- a/docs/source/providers/tool_runtime/index.md
+++ b/docs/source/providers/tool_runtime/index.md
@ -1,4 +1,4 @@
-# Tool_Runtime
+# Tool_Runtime 

 ## Overview

--- a/docs/source/providers/vector_io/index.md
+++ b/docs/source/providers/vector_io/index.md
@ -1,4 +1,4 @@
-# Vector_Io
+# Vector_Io 

 ## Overview

--- a/docs/source/providers/vector_io/inline_chromadb.md
+++ b/docs/source/providers/vector_io/inline_chromadb.md
@ -41,7 +41,7 @@ See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introducti

 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `db_path` | `<class 'str'>` | No |  |  |
+| `db_path` | `<class 'str'>` | No | PydanticUndefined |  |
 | `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend |

 ## Sample Configuration
--- a/docs/source/providers/vector_io/inline_faiss.md
+++ b/docs/source/providers/vector_io/inline_faiss.md
@ -12,18 +12,6 @@ That means you'll get fast and efficient vector retrieval.
 - Lightweight and easy to use
 - Fully integrated with Llama Stack
 - GPU support
-- **Vector search** - FAISS supports pure vector similarity search using embeddings
-
-## Search Modes
-
-**Supported:**
-- **Vector Search** (`mode="vector"`): Performs vector similarity search using embeddings
-
-**Not Supported:**
-- **Keyword Search** (`mode="keyword"`): Not supported by FAISS
-- **Hybrid Search** (`mode="hybrid"`): Not supported by FAISS
-
-> **Note**: FAISS is designed as a pure vector similarity search library. See the [FAISS GitHub repository](https://github.com/facebookresearch/faiss) for more details about FAISS's core functionality.

 ## Usage

--- a/docs/source/providers/vector_io/inline_meta-reference.md
+++ b/docs/source/providers/vector_io/inline_meta-reference.md
@ -21,7 +21,5 @@ kvstore:

 ## Deprecation Notice

-```{warning}
-Please use the `inline::faiss` provider instead.
-```
+⚠️ **Warning**: Please use the `inline::faiss` provider instead.

--- a/docs/source/providers/vector_io/inline_milvus.md
+++ b/docs/source/providers/vector_io/inline_milvus.md
@ -10,7 +10,7 @@ Please refer to the remote provider documentation.

 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `db_path` | `<class 'str'>` | No |  |  |
+| `db_path` | `<class 'str'>` | No | PydanticUndefined |  |
 | `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) |
 | `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server |

--- a/docs/source/providers/vector_io/inline_qdrant.md
+++ b/docs/source/providers/vector_io/inline_qdrant.md
@ -50,7 +50,7 @@ See the [Qdrant documentation](https://qdrant.tech/documentation/) for more deta

 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `path` | `<class 'str'>` | No |  |  |
+| `path` | `<class 'str'>` | No | PydanticUndefined |  |
 | `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite |  |

 ## Sample Configuration
--- a/docs/source/providers/vector_io/inline_sqlite-vec.md
+++ b/docs/source/providers/vector_io/inline_sqlite-vec.md
@ -205,7 +205,7 @@ See [sqlite-vec's GitHub repo](https://github.com/asg017/sqlite-vec/tree/main) f

 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `db_path` | `<class 'str'>` | No |  | Path to the SQLite database file |
+| `db_path` | `<class 'str'>` | No | PydanticUndefined | Path to the SQLite database file |
 | `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) |

 ## Sample Configuration
--- a/docs/source/providers/vector_io/inline_sqlite_vec.md
+++ b/docs/source/providers/vector_io/inline_sqlite_vec.md
@ -10,7 +10,7 @@ Please refer to the sqlite-vec provider documentation.

 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `db_path` | `<class 'str'>` | No |  | Path to the SQLite database file |
+| `db_path` | `<class 'str'>` | No | PydanticUndefined | Path to the SQLite database file |
 | `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) |

 ## Sample Configuration
@ -25,7 +25,5 @@ kvstore:

 ## Deprecation Notice

-```{warning}
-Please use the `inline::sqlite-vec` provider (notice the hyphen instead of underscore) instead.
-```
+⚠️ **Warning**: Please use the `inline::sqlite-vec` provider (notice the hyphen instead of underscore) instead.

--- a/docs/source/providers/vector_io/remote_chromadb.md
+++ b/docs/source/providers/vector_io/remote_chromadb.md
@ -40,7 +40,7 @@ See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introducti

 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `url` | `str \| None` | No |  |  |
+| `url` | `str \| None` | No | PydanticUndefined |  |
 | `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend |

 ## Sample Configuration
--- a/docs/source/providers/vector_io/remote_milvus.md
+++ b/docs/source/providers/vector_io/remote_milvus.md
@ -11,7 +11,6 @@ That means you're not limited to storing vectors in memory or in a separate serv

 - Easy to use
 - Fully integrated with Llama Stack
-- Supports all search modes: vector, keyword, and hybrid search (both inline and remote configurations)

 ## Usage

@ -102,92 +101,6 @@ vector_io:
 - **`client_pem_path`**: Path to the **client certificate** file (required for mTLS).
 - **`client_key_path`**: Path to the **client private key** file (required for mTLS).

-## Search Modes
-
-Milvus supports three different search modes for both inline and remote configurations:
-
-### Vector Search
-Vector search uses semantic similarity to find the most relevant chunks based on embedding vectors. This is the default search mode and works well for finding conceptually similar content.
-
-```python
-# Vector search example
-search_response = client.vector_stores.search(
-    vector_store_id=vector_store.id,
-    query="What is machine learning?",
-    search_mode="vector",
-    max_num_results=5,
-)
-```
-
-### Keyword Search
-Keyword search uses traditional text-based matching to find chunks containing specific terms or phrases. This is useful when you need exact term matches.
-
-```python
-# Keyword search example
-search_response = client.vector_stores.search(
-    vector_store_id=vector_store.id,
-    query="Python programming language",
-    search_mode="keyword",
-    max_num_results=5,
-)
-```
-
-### Hybrid Search
-Hybrid search combines both vector and keyword search methods to provide more comprehensive results. It leverages the strengths of both semantic similarity and exact term matching.
-
-#### Basic Hybrid Search
-```python
-# Basic hybrid search example (uses RRF ranker with default impact_factor=60.0)
-search_response = client.vector_stores.search(
-    vector_store_id=vector_store.id,
-    query="neural networks in Python",
-    search_mode="hybrid",
-    max_num_results=5,
-)
-```
-
-**Note**: The default `impact_factor` value of 60.0 was empirically determined to be optimal in the original RRF research paper: ["Reciprocal Rank Fusion outperforms Condorcet and individual Rank Learning Methods"](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) (Cormack et al., 2009).
-
-#### Hybrid Search with RRF (Reciprocal Rank Fusion) Ranker
-RRF combines rankings from vector and keyword search by using reciprocal ranks. The impact factor controls how much weight is given to higher-ranked results.
-
-```python
-# Hybrid search with custom RRF parameters
-search_response = client.vector_stores.search(
-    vector_store_id=vector_store.id,
-    query="neural networks in Python",
-    search_mode="hybrid",
-    max_num_results=5,
-    ranking_options={
-        "ranker": {
-            "type": "rrf",
-            "impact_factor": 100.0,  # Higher values give more weight to top-ranked results
-        }
-    },
-)
-```
-
-#### Hybrid Search with Weighted Ranker
-Weighted ranker linearly combines normalized scores from vector and keyword search. The alpha parameter controls the balance between the two search methods.
-
-```python
-# Hybrid search with weighted ranker
-search_response = client.vector_stores.search(
-    vector_store_id=vector_store.id,
-    query="neural networks in Python",
-    search_mode="hybrid",
-    max_num_results=5,
-    ranking_options={
-        "ranker": {
-            "type": "weighted",
-            "alpha": 0.7,  # 70% vector search, 30% keyword search
-        }
-    },
-)
-```
-
-For detailed documentation on RRF and Weighted rankers, please refer to the [Milvus Reranking Guide](https://milvus.io/docs/reranking.md).
-
 ## Documentation
 See the [Milvus documentation](https://milvus.io/docs/install-overview.md) for more details about Milvus in general.

@ -198,16 +111,13 @@ For more details on TLS configuration, refer to the [TLS setup guide](https://mi

 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `uri` | `<class 'str'>` | No |  | The URI of the Milvus server |
-| `token` | `str \| None` | No |  | The token of the Milvus server |
+| `uri` | `<class 'str'>` | No | PydanticUndefined | The URI of the Milvus server |
+| `token` | `str \| None` | No | PydanticUndefined | The token of the Milvus server |
 | `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server |
 | `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend |
 | `config` | `dict` | No | {} | This configuration allows additional fields to be passed through to the underlying Milvus client. See the [Milvus](https://milvus.io/docs/install-overview.md) documentation for more details about Milvus in general. |

-```{note}
- This configuration class accepts additional fields beyond those listed above. You can pass any additional configuration options that will be forwarded to the underlying provider.
- ```
-
+> **Note**: This configuration class accepts additional fields beyond those listed above. You can pass any additional configuration options that will be forwarded to the underlying provider.

 ## Sample Configuration

--- a/docs/source/references/llama_cli_reference/download_models.md
+++ b/docs/source/references/llama_cli_reference/download_models.md
@ -19,7 +19,7 @@ You have two ways to install Llama Stack:
    cd ~/local
    git clone git@github.com:meta-llama/llama-stack.git

-    uv venv myenv --python 3.12
+    python -m venv myenv
    source myenv/bin/activate  # On Windows: myenv\Scripts\activate

    cd llama-stack
@ -128,9 +128,7 @@ llama download --source huggingface --model-id Prompt-Guard-86M --ignore-pattern

 **Important:** Set your environment variable `HF_TOKEN` or pass in `--hf-token` to the command to validate your access. You can find your token at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens).

-```{tip}
-Default for `llama download` is to run with `--ignore-patterns *.safetensors` since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored.
-```
+> **Tip:** Default for `llama download` is to run with `--ignore-patterns *.safetensors` since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored.

 ## List the downloaded models

--- a/docs/source/references/llama_cli_reference/index.md
+++ b/docs/source/references/llama_cli_reference/index.md
@ -19,7 +19,7 @@ You have two ways to install Llama Stack:
    cd ~/local
    git clone git@github.com:meta-llama/llama-stack.git

-    uv venv myenv --python 3.12
+    python -m venv myenv
    source myenv/bin/activate  # On Windows: myenv\Scripts\activate

    cd llama-stack
@ -152,9 +152,7 @@ llama download --source huggingface --model-id Prompt-Guard-86M --ignore-pattern

 **Important:** Set your environment variable `HF_TOKEN` or pass in `--hf-token` to the command to validate your access. You can find your token at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens).

-```{tip}
-Default for `llama download` is to run with `--ignore-patterns *.safetensors` since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored.
-```
+> **Tip:** Default for `llama download` is to run with `--ignore-patterns *.safetensors` since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored.

 ## List the downloaded models

--- a/llama_stack/apis/agents/agents.py
+++ b/llama_stack/apis/agents/agents.py
@ -706,7 +706,6 @@ class Agents(Protocol):
        temperature: float | None = None,
        text: OpenAIResponseText | None = None,
        tools: list[OpenAIResponseInputTool] | None = None,
-        include: list[str] | None = None,
        max_infer_iters: int | None = 10,  # this is an extension to the OpenAI API
    ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
        """Create a new OpenAI response.
@ -714,7 +713,6 @@ class Agents(Protocol):
        :param input: Input message(s) to create the response.
        :param model: The underlying LLM used for completions.
        :param previous_response_id: (Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses.
-        :param include: (Optional) Additional fields to include in the response.
        :returns: An OpenAIResponseObject.
        """
        ...
--- a/llama_stack/apis/agents/openai_responses.py
+++ b/llama_stack/apis/agents/openai_responses.py
@ -170,23 +170,6 @@ class OpenAIResponseOutputMessageWebSearchToolCall(BaseModel):
    type: Literal["web_search_call"] = "web_search_call"


-class OpenAIResponseOutputMessageFileSearchToolCallResults(BaseModel):
-    """Search results returned by the file search operation.
-
-    :param attributes: (Optional) Key-value attributes associated with the file
-    :param file_id: Unique identifier of the file containing the result
-    :param filename: Name of the file containing the result
-    :param score: Relevance score for this search result (between 0 and 1)
-    :param text: Text content of the search result
-    """
-
-    attributes: dict[str, Any]
-    file_id: str
-    filename: str
-    score: float
-    text: str
-
-
@json_schema_type
 class OpenAIResponseOutputMessageFileSearchToolCall(BaseModel):
    """File search tool call output message for OpenAI responses.
@ -202,7 +185,7 @@ class OpenAIResponseOutputMessageFileSearchToolCall(BaseModel):
    queries: list[str]
    status: str
    type: Literal["file_search_call"] = "file_search_call"
-    results: list[OpenAIResponseOutputMessageFileSearchToolCallResults] | None = None
+    results: list[dict[str, Any]] | None = None


@json_schema_type
@ -623,62 +606,6 @@ class OpenAIResponseObjectStreamResponseMcpCallCompleted(BaseModel):
    type: Literal["response.mcp_call.completed"] = "response.mcp_call.completed"


-@json_schema_type
-class OpenAIResponseContentPartOutputText(BaseModel):
-    type: Literal["output_text"] = "output_text"
-    text: str
-    # TODO: add annotations, logprobs, etc.
-
-
-@json_schema_type
-class OpenAIResponseContentPartRefusal(BaseModel):
-    type: Literal["refusal"] = "refusal"
-    refusal: str
-
-
-OpenAIResponseContentPart = Annotated[
-    OpenAIResponseContentPartOutputText | OpenAIResponseContentPartRefusal,
-    Field(discriminator="type"),
-]
-register_schema(OpenAIResponseContentPart, name="OpenAIResponseContentPart")
-
-
-@json_schema_type
-class OpenAIResponseObjectStreamResponseContentPartAdded(BaseModel):
-    """Streaming event for when a new content part is added to a response item.
-
-    :param response_id: Unique identifier of the response containing this content
-    :param item_id: Unique identifier of the output item containing this content part
-    :param part: The content part that was added
-    :param sequence_number: Sequential number for ordering streaming events
-    :param type: Event type identifier, always "response.content_part.added"
-    """
-
-    response_id: str
-    item_id: str
-    part: OpenAIResponseContentPart
-    sequence_number: int
-    type: Literal["response.content_part.added"] = "response.content_part.added"
-
-
-@json_schema_type
-class OpenAIResponseObjectStreamResponseContentPartDone(BaseModel):
-    """Streaming event for when a content part is completed.
-
-    :param response_id: Unique identifier of the response containing this content
-    :param item_id: Unique identifier of the output item containing this content part
-    :param part: The completed content part
-    :param sequence_number: Sequential number for ordering streaming events
-    :param type: Event type identifier, always "response.content_part.done"
-    """
-
-    response_id: str
-    item_id: str
-    part: OpenAIResponseContentPart
-    sequence_number: int
-    type: Literal["response.content_part.done"] = "response.content_part.done"
-
-
 OpenAIResponseObjectStream = Annotated[
    OpenAIResponseObjectStreamResponseCreated
    | OpenAIResponseObjectStreamResponseOutputItemAdded
@ -698,8 +625,6 @@ OpenAIResponseObjectStream = Annotated[
    | OpenAIResponseObjectStreamResponseMcpCallInProgress
    | OpenAIResponseObjectStreamResponseMcpCallFailed
    | OpenAIResponseObjectStreamResponseMcpCallCompleted
-    | OpenAIResponseObjectStreamResponseContentPartAdded
-    | OpenAIResponseObjectStreamResponseContentPartDone
    | OpenAIResponseObjectStreamResponseCompleted,
    Field(discriminator="type"),
 ]
--- a/llama_stack/apis/batches/init.py
+++ b/llama_stack/apis/batches/init.py
@ -1,9 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .batches import Batches, BatchObject, ListBatchesResponse
-
-__all__ = ["Batches", "BatchObject", "ListBatchesResponse"]
--- a/llama_stack/apis/batches/batches.py
+++ b/llama_stack/apis/batches/batches.py
@ -1,89 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Literal, Protocol, runtime_checkable
-
-from pydantic import BaseModel, Field
-
-from llama_stack.schema_utils import json_schema_type, webmethod
-
-try:
-    from openai.types import Batch as BatchObject
-except ImportError as e:
-    raise ImportError("OpenAI package is required for batches API. Please install it with: pip install openai") from e
-
-
-@json_schema_type
-class ListBatchesResponse(BaseModel):
-    """Response containing a list of batch objects."""
-
-    object: Literal["list"] = "list"
-    data: list[BatchObject] = Field(..., description="List of batch objects")
-    first_id: str | None = Field(default=None, description="ID of the first batch in the list")
-    last_id: str | None = Field(default=None, description="ID of the last batch in the list")
-    has_more: bool = Field(default=False, description="Whether there are more batches available")
-
-
-@runtime_checkable
-class Batches(Protocol):
-    """Protocol for batch processing API operations.
-
-    The Batches API enables efficient processing of multiple requests in a single operation,
-    particularly useful for processing large datasets, batch evaluation workflows, and
-    cost-effective inference at scale.
-
-    Note: This API is currently under active development and may undergo changes.
-    """
-
-    @webmethod(route="/openai/v1/batches", method="POST")
-    async def create_batch(
-        self,
-        input_file_id: str,
-        endpoint: str,
-        completion_window: Literal["24h"],
-        metadata: dict[str, str] | None = None,
-    ) -> BatchObject:
-        """Create a new batch for processing multiple API requests.
-
-        :param input_file_id: The ID of an uploaded file containing requests for the batch.
-        :param endpoint: The endpoint to be used for all requests in the batch.
-        :param completion_window: The time window within which the batch should be processed.
-        :param metadata: Optional metadata for the batch.
-        :returns: The created batch object.
-        """
-        ...
-
-    @webmethod(route="/openai/v1/batches/{batch_id}", method="GET")
-    async def retrieve_batch(self, batch_id: str) -> BatchObject:
-        """Retrieve information about a specific batch.
-
-        :param batch_id: The ID of the batch to retrieve.
-        :returns: The batch object.
-        """
-        ...
-
-    @webmethod(route="/openai/v1/batches/{batch_id}/cancel", method="POST")
-    async def cancel_batch(self, batch_id: str) -> BatchObject:
-        """Cancel a batch that is in progress.
-
-        :param batch_id: The ID of the batch to cancel.
-        :returns: The updated batch object.
-        """
-        ...
-
-    @webmethod(route="/openai/v1/batches", method="GET")
-    async def list_batches(
-        self,
-        after: str | None = None,
-        limit: int = 20,
-    ) -> ListBatchesResponse:
-        """List all batches for the current user.
-
-        :param after: A cursor for pagination; returns batches after this batch ID.
-        :param limit: Number of batches to return (default 20, max 100).
-        :returns: A list of batch objects.
-        """
-        ...
--- a/llama_stack/apis/common/errors.py
+++ b/llama_stack/apis/common/errors.py
@ -10,16 +10,6 @@
 #   3. All classes should propogate the inherited __init__ function otherwise via 'super().__init__(message)'


-class ResourceNotFoundError(ValueError):
-    """generic exception for a missing Llama Stack resource"""
-
-    def __init__(self, resource_name: str, resource_type: str, client_list: str) -> None:
-        message = (
-            f"{resource_type} '{resource_name}' not found. Use '{client_list}' to list available {resource_type}s."
-        )
-        super().__init__(message)
-
-
 class UnsupportedModelError(ValueError):
    """raised when model is not present in the list of supported models"""

@ -28,32 +18,38 @@ class UnsupportedModelError(ValueError):
        super().__init__(message)


-class ModelNotFoundError(ResourceNotFoundError):
+class ModelNotFoundError(ValueError):
    """raised when Llama Stack cannot find a referenced model"""

    def __init__(self, model_name: str) -> None:
-        super().__init__(model_name, "Model", "client.models.list()")
+        message = f"Model '{model_name}' not found. Use client.models.list() to list available models."
+        super().__init__(message)


-class VectorStoreNotFoundError(ResourceNotFoundError):
+class VectorStoreNotFoundError(ValueError):
    """raised when Llama Stack cannot find a referenced vector store"""

    def __init__(self, vector_store_name: str) -> None:
-        super().__init__(vector_store_name, "Vector Store", "client.vector_dbs.list()")
+        message = f"Vector store '{vector_store_name}' not found. Use client.vector_dbs.list() to list available vector stores."
+        super().__init__(message)


-class DatasetNotFoundError(ResourceNotFoundError):
+class DatasetNotFoundError(ValueError):
    """raised when Llama Stack cannot find a referenced dataset"""

    def __init__(self, dataset_name: str) -> None:
-        super().__init__(dataset_name, "Dataset", "client.datasets.list()")
+        message = f"Dataset '{dataset_name}' not found. Use client.datasets.list() to list available datasets."
+        super().__init__(message)


-class ToolGroupNotFoundError(ResourceNotFoundError):
+class ToolGroupNotFoundError(ValueError):
    """raised when Llama Stack cannot find a referenced tool group"""

    def __init__(self, toolgroup_name: str) -> None:
-        super().__init__(toolgroup_name, "Tool Group", "client.toolgroups.list()")
+        message = (
+            f"Tool group '{toolgroup_name}' not found. Use client.toolgroups.list() to list available tool groups."
+        )
+        super().__init__(message)


 class SessionNotFoundError(ValueError):
@ -62,20 +58,3 @@ class SessionNotFoundError(ValueError):
    def __init__(self, session_name: str) -> None:
        message = f"Session '{session_name}' not found or access denied."
        super().__init__(message)
-
-
-class ModelTypeError(TypeError):
-    """raised when a model is present but not the correct type"""
-
-    def __init__(self, model_name: str, model_type: str, expected_model_type: str) -> None:
-        message = (
-            f"Model '{model_name}' is of type '{model_type}' rather than the expected type '{expected_model_type}'"
-        )
-        super().__init__(message)
-
-
-class ConflictError(ValueError):
-    """raised when an operation cannot be performed due to a conflict with the current state"""
-
-    def __init__(self, message: str) -> None:
-        super().__init__(message)
--- a/llama_stack/apis/datatypes.py
+++ b/llama_stack/apis/datatypes.py
@ -86,7 +86,6 @@ class Api(Enum, metaclass=DynamicApiMeta):
    :cvar inference: Text generation, chat completions, and embeddings
    :cvar safety: Content moderation and safety shields
    :cvar agents: Agent orchestration and execution
-    :cvar batches: Batch processing for asynchronous API requests
    :cvar vector_io: Vector database operations and queries
    :cvar datasetio: Dataset input/output operations
    :cvar scoring: Model output evaluation and scoring
@ -109,7 +108,6 @@ class Api(Enum, metaclass=DynamicApiMeta):
    inference = "inference"
    safety = "safety"
    agents = "agents"
-    batches = "batches"
    vector_io = "vector_io"
    datasetio = "datasetio"
    scoring = "scoring"
--- a/llama_stack/apis/files/files.py
+++ b/llama_stack/apis/files/files.py
@ -22,7 +22,6 @@ class OpenAIFilePurpose(StrEnum):
    """

    ASSISTANTS = "assistants"
-    BATCH = "batch"
    # TODO: Add other purposes as needed


--- a/llama_stack/apis/safety/safety.py
+++ b/llama_stack/apis/safety/safety.py
@ -15,36 +15,6 @@ from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, webmethod


-@json_schema_type
-class ModerationObjectResults(BaseModel):
-    """A moderation object.
-    :param flagged: Whether any of the below categories are flagged.
-    :param categories: A list of the categories, and whether they are flagged or not.
-    :param category_applied_input_types: A list of the categories along with the input type(s) that the score applies to.
-    :param category_scores: A list of the categories along with their scores as predicted by model.
-    """
-
-    flagged: bool
-    categories: dict[str, bool] | None = None
-    category_applied_input_types: dict[str, list[str]] | None = None
-    category_scores: dict[str, float] | None = None
-    user_message: str | None = None
-    metadata: dict[str, Any] = Field(default_factory=dict)
-
-
-@json_schema_type
-class ModerationObject(BaseModel):
-    """A moderation object.
-    :param id: The unique identifier for the moderation request.
-    :param model: The model used to generate the moderation results.
-    :param results: A list of moderation objects
-    """
-
-    id: str
-    model: str
-    results: list[ModerationObjectResults]
-
-
@json_schema_type
 class ViolationLevel(Enum):
    """Severity level of a safety violation.
@ -112,13 +82,3 @@ class Safety(Protocol):
        :returns: A RunShieldResponse.
        """
        ...
-
-    @webmethod(route="/openai/v1/moderations", method="POST")
-    async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject:
-        """Classifies if text and/or image inputs are potentially harmful.
-        :param input: Input (or inputs) to classify.
-        Can be a single string, an array of strings, or an array of multi-modal input objects similar to other models.
-        :param model: The content moderation model you would like to use.
-        :returns: A moderation object.
-        """
-        ...
--- a/llama_stack/apis/shields/shields.py
+++ b/llama_stack/apis/shields/shields.py
@ -83,11 +83,3 @@ class Shields(Protocol):
        :returns: A Shield.
        """
        ...
-
-    @webmethod(route="/shields/{identifier:path}", method="DELETE")
-    async def unregister_shield(self, identifier: str) -> None:
-        """Unregister a shield.
-
-        :param identifier: The identifier of the shield to unregister.
-        """
-        ...
--- a/llama_stack/core/build.py
+++ b/llama_stack/core/build.py
@ -5,6 +5,7 @@
 # the root directory of this source tree.

 import importlib.resources
+import logging
 import sys

 from pydantic import BaseModel
@ -16,10 +17,9 @@ from llama_stack.core.external import load_external_apis
 from llama_stack.core.utils.exec import run_command
 from llama_stack.core.utils.image_types import LlamaStackImageType
 from llama_stack.distributions.template import DistributionTemplate
-from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import Api

-log = get_logger(name=__name__, category="core")
+log = logging.getLogger(__name__)

 # These are the dependencies needed by the distribution server.
 # `llama-stack` is automatically installed by the installation script.
@ -91,7 +91,7 @@ def get_provider_dependencies(


 def print_pip_install_help(config: BuildConfig):
-    normal_deps, special_deps, _ = get_provider_dependencies(config)
+    normal_deps, special_deps = get_provider_dependencies(config)

    cprint(
        f"Please install needed dependencies using the following commands:\n\nuv pip install {' '.join(normal_deps)}",
--- a/llama_stack/core/build_conda_env.sh
+++ b/llama_stack/core/build_conda_env.sh
@ -0,0 +1,207 @@
+#!/bin/bash
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
+LLAMA_STACK_CLIENT_DIR=${LLAMA_STACK_CLIENT_DIR:-}
+TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}
+PYPI_VERSION=${PYPI_VERSION:-}
+# uv reads UV_HTTP_TIMEOUT (seconds) from its environment, so the default must be exported to take effect;
+# large wheels such as PyTorch are likely to time out otherwise. Reference: https://github.com/astral-sh/uv/pull/1694
+export UV_HTTP_TIMEOUT=${UV_HTTP_TIMEOUT:-500}
+
+set -euo pipefail
+
+# Define color codes
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+NC='\033[0m' # No Color
+
+SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
+source "$SCRIPT_DIR/common.sh"
+
+# Print the command-line synopsis plus an example invocation, then exit with status 1
+usage() {
+  printf '%s\n' "Usage: $0 --env-name <conda_env_name> --build-file-path <build_file_path> --normal-deps <pip_dependencies> [--external-provider-deps <external_provider_deps>] [--optional-deps <special_pip_deps>]"
+  printf '%s\n' "Example: $0 --env-name my-conda-env --build-file-path ./my-stack-build.yaml --normal-deps 'numpy pandas scipy' --external-provider-deps 'foo' --optional-deps 'bar'"
+  exit 1
+}
+
+# Parse arguments; "${2:-}" lets a flag given without a value reach the error message instead of aborting under `set -u`
+env_name=""
+build_file_path=""
+normal_deps=""
+external_provider_deps=""
+optional_deps=""
+
+while [[ $# -gt 0 ]]; do
+  key="$1"
+  case "$key" in
+    --env-name)
+      if [[ -z "${2:-}" || "${2:-}" == --* ]]; then
+        echo "Error: --env-name requires a string value" >&2
+        usage
+      fi
+      env_name="$2"
+      shift 2
+      ;;
+    --build-file-path)
+      if [[ -z "${2:-}" || "${2:-}" == --* ]]; then
+        echo "Error: --build-file-path requires a string value" >&2
+        usage
+      fi
+      build_file_path="$2"
+      shift 2
+      ;;
+    --normal-deps)
+      if [[ -z "${2:-}" || "${2:-}" == --* ]]; then
+        echo "Error: --normal-deps requires a string value" >&2
+        usage
+      fi
+      normal_deps="$2"
+      shift 2
+      ;;
+    --external-provider-deps)
+      if [[ -z "${2:-}" || "${2:-}" == --* ]]; then
+        echo "Error: --external-provider-deps requires a string value" >&2
+        usage
+      fi
+      external_provider_deps="$2"
+      shift 2
+      ;;
+    --optional-deps)
+      if [[ -z "${2:-}" || "${2:-}" == --* ]]; then
+        echo "Error: --optional-deps requires a string value" >&2
+        usage
+      fi
+      optional_deps="$2"
+      shift 2
+      ;;
+    *)
+      echo "Unknown option: $1" >&2
+      usage
+      ;;
+  esac
+done
+
+# The three flags below are mandatory; print the usage text and exit if any of them is empty
+if [[ ! "$env_name" || ! "$build_file_path" || ! "$normal_deps" ]]; then
+  echo "Error: --env-name, --build-file-path, and --normal-deps are required." >&2
+  usage
+fi
+
+if [[ "$LLAMA_STACK_DIR" ]]; then
+  echo "Using llama-stack-dir=$LLAMA_STACK_DIR"
+fi
+if [[ "$LLAMA_STACK_CLIENT_DIR" ]]; then
+  echo "Using llama-stack-client-dir=$LLAMA_STACK_CLIENT_DIR"
+fi
+
+ensure_conda_env_python310() {
+  # Reads only the globals set by the flag parser (positional args are ignored); despite its name it targets Python 3.12
+  local python_version="3.12"
+
+  if ! is_command_available conda; then
+    printf "${RED}Error: conda command not found. Is Conda installed and in your PATH?${NC}\n" >&2
+    exit 1
+  fi
+
+  if conda env list | grep -q "^${env_name} "; then
+    printf "Conda environment '%s' exists. Checking Python version...\n" "$env_name"
+    current_version=$(conda run -n "${env_name}" python --version 2>&1 | cut -d' ' -f2 | cut -d'.' -f1,2)
+    if [ "$current_version" = "$python_version" ]; then
+      printf "Environment '%s' already has Python %s. No action needed.\n" "$env_name" "$python_version"
+    else
+      printf "Updating environment '%s' to Python %s...\n" "$env_name" "$python_version"
+      conda install -n "${env_name}" python="${python_version}" -y
+    fi
+  else
+    printf "Conda environment '%s' does not exist. Creating with Python %s...\n" "$env_name" "$python_version"
+    conda create -n "${env_name}" python="${python_version}" -y
+  fi
+
+  eval "$(conda shell.bash hook)"
+  conda deactivate && conda activate "${env_name}"
+  "$CONDA_PREFIX"/bin/pip install uv
+
+  if [ -n "$TEST_PYPI_VERSION" ]; then
+    uv pip install fastapi libcst
+    uv pip install --extra-index-url https://test.pypi.org/simple/ \
+      llama-stack=="$TEST_PYPI_VERSION" \
+      $normal_deps # unquoted on purpose: the space-separated list must split into one argument per package
+    if [ -n "$optional_deps" ]; then
+      IFS='#' read -ra parts <<<"$optional_deps"
+      for part in "${parts[@]}"; do
+        echo "$part"
+        uv pip install $part
+      done
+    fi
+    if [ -n "$external_provider_deps" ]; then
+      IFS='#' read -ra parts <<<"$external_provider_deps"
+      for part in "${parts[@]}"; do
+        echo "$part"
+        uv pip install "$part"
+      done
+    fi
+  else
+    if [ -n "$LLAMA_STACK_DIR" ]; then
+      if [ ! -d "$LLAMA_STACK_DIR" ]; then
+        printf "${RED}Warning: LLAMA_STACK_DIR is set but directory does not exist: %s${NC}\n" "$LLAMA_STACK_DIR" >&2
+        exit 1
+      fi
+      printf "Installing from LLAMA_STACK_DIR: %s\n" "$LLAMA_STACK_DIR"
+      uv pip install --no-cache-dir -e "$LLAMA_STACK_DIR"
+    else
+      PYPI_VERSION="${PYPI_VERSION:-}"
+      if [ -n "$PYPI_VERSION" ]; then
+        SPEC_VERSION="llama-stack==${PYPI_VERSION}"
+      else
+        SPEC_VERSION="llama-stack"
+      fi
+      uv pip install --no-cache-dir "$SPEC_VERSION"
+    fi
+    if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
+      if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ]; then
+        printf "${RED}Warning: LLAMA_STACK_CLIENT_DIR is set but directory does not exist: %s${NC}\n" "$LLAMA_STACK_CLIENT_DIR" >&2
+        exit 1
+      fi
+      printf "Installing from LLAMA_STACK_CLIENT_DIR: %s\n" "$LLAMA_STACK_CLIENT_DIR"
+      uv pip install --no-cache-dir -e "$LLAMA_STACK_CLIENT_DIR"
+    fi
+    printf "Installing pip dependencies\n"
+    uv pip install $normal_deps
+    if [ -n "$optional_deps" ]; then
+      IFS='#' read -ra parts <<<"$optional_deps"
+      for part in "${parts[@]}"; do
+        echo "$part"
+        uv pip install $part
+      done
+    fi
+    if [ -n "$external_provider_deps" ]; then
+      IFS='#' read -ra parts <<<"$external_provider_deps"
+      for part in "${parts[@]}"; do
+        echo "Getting provider spec for module: $part and installing dependencies"
+        package_name=$(echo "$part" | sed 's/[<>=!].*//')
+        python3 -c "
+import importlib
+import sys
+try:
+    module = importlib.import_module(f'$package_name.provider')
+    spec = module.get_provider_spec()
+    if hasattr(spec, 'pip_packages') and spec.pip_packages:
+        print('\\n'.join(spec.pip_packages))
+except Exception as e:
+    print(f'Error getting provider spec for $package_name: {e}', file=sys.stderr)
+" | uv pip install -r -
+      done
+    fi
+  fi
+  mv "$build_file_path" "$CONDA_PREFIX"/llamastack-build.yaml
+  echo "Build spec configuration saved at $CONDA_PREFIX/llamastack-build.yaml"
+}
+
+ensure_conda_env_python310 "$env_name" "$build_file_path" "$normal_deps" "$optional_deps" "$external_provider_deps"
--- a/llama_stack/core/build_venv.sh
+++ b/llama_stack/core/build_venv.sh
@ -151,37 +151,23 @@ run() {
    fi
  else
    if [ -n "$LLAMA_STACK_DIR" ]; then
-      # only warn if DIR does not start with "git+"
-      if [ ! -d "$LLAMA_STACK_DIR" ] && [[ "$LLAMA_STACK_DIR" != git+* ]]; then
+      if [ ! -d "$LLAMA_STACK_DIR" ]; then
        printf "${RED}Warning: LLAMA_STACK_DIR is set but directory does not exist: %s${NC}\n" "$LLAMA_STACK_DIR" >&2
        exit 1
      fi
      printf "Installing from LLAMA_STACK_DIR: %s\n"  "$LLAMA_STACK_DIR"
-      # editable only if LLAMA_STACK_DIR does not start with "git+"
-      if [[ "$LLAMA_STACK_DIR" != git+* ]]; then
-        EDITABLE="-e"
-      else
-        EDITABLE=""
-      fi
-      uv pip install --no-cache-dir $EDITABLE "$LLAMA_STACK_DIR"
+      uv pip install --no-cache-dir -e "$LLAMA_STACK_DIR"
    else
      uv pip install --no-cache-dir llama-stack
    fi

    if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
-      # only warn if DIR does not start with "git+"
-      if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ] && [[ "$LLAMA_STACK_CLIENT_DIR" != git+* ]]; then
+      if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ]; then
        printf "${RED}Warning: LLAMA_STACK_CLIENT_DIR is set but directory does not exist: %s${NC}\n" "$LLAMA_STACK_CLIENT_DIR" >&2
        exit 1
      fi
      printf "Installing from LLAMA_STACK_CLIENT_DIR: %s\n" "$LLAMA_STACK_CLIENT_DIR"
-      # editable only if LLAMA_STACK_CLIENT_DIR does not start with "git+"
-      if [[ "$LLAMA_STACK_CLIENT_DIR" != git+* ]]; then
-        EDITABLE="-e"
-      else
-        EDITABLE=""
-      fi
-      uv pip install --no-cache-dir $EDITABLE "$LLAMA_STACK_CLIENT_DIR"
+      uv pip install --no-cache-dir -e "$LLAMA_STACK_CLIENT_DIR"
    fi

    printf "Installing pip dependencies\n"
--- a/llama_stack/core/configure.py
+++ b/llama_stack/core/configure.py
@ -3,6 +3,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+import logging
 import textwrap
 from typing import Any

@ -20,10 +21,9 @@ from llama_stack.core.stack import cast_image_name_to_string, replace_env_vars
 from llama_stack.core.utils.config_dirs import EXTERNAL_PROVIDERS_DIR
 from llama_stack.core.utils.dynamic import instantiate_class_type
 from llama_stack.core.utils.prompt_for_config import prompt_for_config
-from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import Api, ProviderSpec

-logger = get_logger(name=__name__, category="core")
+logger = logging.getLogger(__name__)


 def configure_single_provider(registry: dict[str, ProviderSpec], provider: Provider) -> Provider:
--- a/llama_stack/core/library_client.py
+++ b/llama_stack/core/library_client.py
@ -7,7 +7,7 @@
 import asyncio
 import inspect
 import json
-import logging  # allow-direct-logging
+import logging
 import os
 import sys
 from concurrent.futures import ThreadPoolExecutor
@ -48,7 +48,6 @@ from llama_stack.core.stack import (
 from llama_stack.core.utils.config import redact_sensitive_fields
 from llama_stack.core.utils.context import preserve_contexts_async_generator
 from llama_stack.core.utils.exec import in_notebook
-from llama_stack.log import get_logger
 from llama_stack.providers.utils.telemetry.tracing import (
    CURRENT_TRACE_CONTEXT,
    end_trace,
@ -56,7 +55,7 @@ from llama_stack.providers.utils.telemetry.tracing import (
    start_trace,
 )

-logger = get_logger(name=__name__, category="core")
+logger = logging.getLogger(__name__)

 T = TypeVar("T")

@ -381,17 +380,8 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
        json_content = json.dumps(convert_pydantic_to_json_value(result))

        filtered_body = {k: v for k, v in body.items() if not isinstance(v, LibraryClientUploadFile)}
-
-        status_code = httpx.codes.OK
-
-        if options.method.upper() == "DELETE" and result is None:
-            status_code = httpx.codes.NO_CONTENT
-
-        if status_code == httpx.codes.NO_CONTENT:
-            json_content = ""
-
        mock_response = httpx.Response(
-            status_code=status_code,
+            status_code=httpx.codes.OK,
            content=json_content.encode("utf-8"),
            headers={
                "Content-Type": "application/json",
--- a/llama_stack/core/request_headers.py
+++ b/llama_stack/core/request_headers.py
@ -6,15 +6,15 @@

 import contextvars
 import json
+import logging
 from contextlib import AbstractContextManager
 from typing import Any

 from llama_stack.core.datatypes import User
-from llama_stack.log import get_logger

 from .utils.dynamic import instantiate_class_type

-log = get_logger(name=__name__, category="core")
+log = logging.getLogger(__name__)

 # Context variable for request provider data and auth attributes
 PROVIDER_DATA_VAR = contextvars.ContextVar("provider_data", default=None)
--- a/llama_stack/core/resolver.py
+++ b/llama_stack/core/resolver.py
@ -8,7 +8,6 @@ import inspect
 from typing import Any

 from llama_stack.apis.agents import Agents
-from llama_stack.apis.batches import Batches
 from llama_stack.apis.benchmarks import Benchmarks
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
@ -76,7 +75,6 @@ def api_protocol_map(external_apis: dict[Api, ExternalApiSpec] | None = None) ->
        Api.agents: Agents,
        Api.inference: Inference,
        Api.inspect: Inspect,
-        Api.batches: Batches,
        Api.vector_io: VectorIO,
        Api.vector_dbs: VectorDBs,
        Api.models: Models,
--- a/llama_stack/core/routers/inference.py
+++ b/llama_stack/core/routers/inference.py
@ -7,7 +7,6 @@
 import asyncio
 import time
 from collections.abc import AsyncGenerator, AsyncIterator
-from datetime import UTC, datetime
 from typing import Annotated, Any

 from openai.types.chat import ChatCompletionToolChoiceOptionParam as OpenAIChatCompletionToolChoiceOptionParam
@ -18,7 +17,7 @@ from llama_stack.apis.common.content_types import (
    InterleavedContent,
    InterleavedContentItem,
 )
-from llama_stack.apis.common.errors import ModelNotFoundError, ModelTypeError
+from llama_stack.apis.common.errors import ModelNotFoundError
 from llama_stack.apis.inference import (
    BatchChatCompletionResponse,
    BatchCompletionResponse,
@ -26,21 +25,14 @@ from llama_stack.apis.inference import (
    ChatCompletionResponseEventType,
    ChatCompletionResponseStreamChunk,
    CompletionMessage,
-    CompletionResponse,
-    CompletionResponseStreamChunk,
    EmbeddingsResponse,
    EmbeddingTaskType,
    Inference,
    ListOpenAIChatCompletionResponse,
    LogProbConfig,
    Message,
-    OpenAIAssistantMessageParam,
    OpenAIChatCompletion,
    OpenAIChatCompletionChunk,
-    OpenAIChatCompletionToolCall,
-    OpenAIChatCompletionToolCallFunction,
-    OpenAIChoice,
-    OpenAIChoiceLogprobs,
    OpenAICompletion,
    OpenAICompletionWithInputMessages,
    OpenAIEmbeddingsResponse,
@ -63,9 +55,10 @@ from llama_stack.models.llama.llama3.chat_format import ChatFormat
 from llama_stack.models.llama.llama3.tokenizer import Tokenizer
 from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable
 from llama_stack.providers.utils.inference.inference_store import InferenceStore
+from llama_stack.providers.utils.inference.stream_utils import stream_and_store_openai_completion
 from llama_stack.providers.utils.telemetry.tracing import get_current_span

-logger = get_logger(name=__name__, category="inference")
+logger = get_logger(name=__name__, category="core")


 class InferenceRouter(Inference):
@ -126,7 +119,6 @@ class InferenceRouter(Inference):
        if span is None:
            logger.warning("No span found for token usage metrics")
            return []
-
        metrics = [
            ("prompt_tokens", prompt_tokens),
            ("completion_tokens", completion_tokens),
@ -140,7 +132,7 @@ class InferenceRouter(Inference):
                    span_id=span.span_id,
                    metric=metric_name,
                    value=value,
-                    timestamp=datetime.now(UTC),
+                    timestamp=time.time(),
                    unit="tokens",
                    attributes={
                        "model_id": model.model_id,
@ -177,15 +169,6 @@ class InferenceRouter(Inference):
            encoded = self.formatter.encode_content(messages)
        return len(encoded.tokens) if encoded and encoded.tokens else 0

-    async def _get_model(self, model_id: str, expected_model_type: str) -> Model:
-        """takes a model id and gets model after ensuring that it is accessible and of the correct type"""
-        model = await self.routing_table.get_model(model_id)
-        if model is None:
-            raise ModelNotFoundError(model_id)
-        if model.model_type != expected_model_type:
-            raise ModelTypeError(model_id, model.model_type, expected_model_type)
-        return model
-
    async def chat_completion(
        self,
        model_id: str,
@ -204,7 +187,11 @@ class InferenceRouter(Inference):
        )
        if sampling_params is None:
            sampling_params = SamplingParams()
-        model = await self._get_model(model_id, ModelType.llm)
+        model = await self.routing_table.get_model(model_id)
+        if model is None:
+            raise ModelNotFoundError(model_id)
+        if model.model_type == ModelType.embedding:
+            raise ValueError(f"Model '{model_id}' is an embedding model and does not support chat completions")
        if tool_config:
            if tool_choice and tool_choice != tool_config.tool_choice:
                raise ValueError("tool_choice and tool_config.tool_choice must match")
@ -247,26 +234,49 @@ class InferenceRouter(Inference):
        prompt_tokens = await self._count_tokens(messages, tool_config.tool_prompt_format)

        if stream:
-            response_stream = await provider.chat_completion(**params)
-            return self.stream_tokens_and_compute_metrics(
-                response=response_stream,
-                prompt_tokens=prompt_tokens,
-                model=model,
-                tool_prompt_format=tool_config.tool_prompt_format,
-            )

-        response = await provider.chat_completion(**params)
-        metrics = await self.count_tokens_and_compute_metrics(
-            response=response,
-            prompt_tokens=prompt_tokens,
-            model=model,
-            tool_prompt_format=tool_config.tool_prompt_format,
-        )
-        # these metrics will show up in the client response.
-        response.metrics = (
-            metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics
-        )
-        return response
+            async def stream_generator():
+                completion_text = ""
+                async for chunk in await provider.chat_completion(**params):
+                    if chunk.event.event_type == ChatCompletionResponseEventType.progress:
+                        if chunk.event.delta.type == "text":
+                            completion_text += chunk.event.delta.text
+                    if chunk.event.event_type == ChatCompletionResponseEventType.complete:
+                        completion_tokens = await self._count_tokens(
+                            [
+                                CompletionMessage(
+                                    content=completion_text,
+                                    stop_reason=StopReason.end_of_turn,
+                                )
+                            ],
+                            tool_config.tool_prompt_format,
+                        )
+                        total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
+                        metrics = await self._compute_and_log_token_usage(
+                            prompt_tokens or 0,
+                            completion_tokens or 0,
+                            total_tokens,
+                            model,
+                        )
+                        chunk.metrics = metrics if chunk.metrics is None else chunk.metrics + metrics
+                    yield chunk
+
+            return stream_generator()
+        else:
+            response = await provider.chat_completion(**params)
+            completion_tokens = await self._count_tokens(
+                [response.completion_message],
+                tool_config.tool_prompt_format,
+            )
+            total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
+            metrics = await self._compute_and_log_token_usage(
+                prompt_tokens or 0,
+                completion_tokens or 0,
+                total_tokens,
+                model,
+            )
+            response.metrics = metrics if response.metrics is None else response.metrics + metrics
+            return response

    async def batch_chat_completion(
        self,
@ -306,7 +316,11 @@ class InferenceRouter(Inference):
        logger.debug(
            f"InferenceRouter.completion: {model_id=}, {stream=}, {content=}, {sampling_params=}, {response_format=}",
        )
-        model = await self._get_model(model_id, ModelType.llm)
+        model = await self.routing_table.get_model(model_id)
+        if model is None:
+            raise ModelNotFoundError(model_id)
+        if model.model_type == ModelType.embedding:
+            raise ValueError(f"Model '{model_id}' is an embedding model and does not support chat completions")
        provider = await self.routing_table.get_provider_impl(model_id)
        params = dict(
            model_id=model_id,
@ -318,20 +332,39 @@ class InferenceRouter(Inference):
        )

        prompt_tokens = await self._count_tokens(content)
-        response = await provider.completion(**params)
+
        if stream:
-            return self.stream_tokens_and_compute_metrics(
-                response=response,
-                prompt_tokens=prompt_tokens,
-                model=model,
+
+            async def stream_generator():
+                completion_text = ""
+                async for chunk in await provider.completion(**params):
+                    if hasattr(chunk, "delta"):
+                        completion_text += chunk.delta
+                    if hasattr(chunk, "stop_reason") and chunk.stop_reason and self.telemetry:
+                        completion_tokens = await self._count_tokens(completion_text)
+                        total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
+                        metrics = await self._compute_and_log_token_usage(
+                            prompt_tokens or 0,
+                            completion_tokens or 0,
+                            total_tokens,
+                            model,
+                        )
+                        chunk.metrics = metrics if chunk.metrics is None else chunk.metrics + metrics
+                    yield chunk
+
+            return stream_generator()
+        else:
+            response = await provider.completion(**params)
+            completion_tokens = await self._count_tokens(response.content)
+            total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
+            metrics = await self._compute_and_log_token_usage(
+                prompt_tokens or 0,
+                completion_tokens or 0,
+                total_tokens,
+                model,
            )
-
-        metrics = await self.count_tokens_and_compute_metrics(
-            response=response, prompt_tokens=prompt_tokens, model=model
-        )
-        response.metrics = metrics if response.metrics is None else response.metrics + metrics
-
-        return response
+            response.metrics = metrics if response.metrics is None else response.metrics + metrics
+            return response

    async def batch_completion(
        self,
@ -356,7 +389,11 @@ class InferenceRouter(Inference):
        task_type: EmbeddingTaskType | None = None,
    ) -> EmbeddingsResponse:
        logger.debug(f"InferenceRouter.embeddings: {model_id}")
-        await self._get_model(model_id, ModelType.embedding)
+        model = await self.routing_table.get_model(model_id)
+        if model is None:
+            raise ModelNotFoundError(model_id)
+        if model.model_type == ModelType.llm:
+            raise ValueError(f"Model '{model_id}' is an LLM model and does not support embeddings")
        provider = await self.routing_table.get_provider_impl(model_id)
        return await provider.embeddings(
            model_id=model_id,
@ -392,7 +429,12 @@ class InferenceRouter(Inference):
        logger.debug(
            f"InferenceRouter.openai_completion: {model=}, {stream=}, {prompt=}",
        )
-        model_obj = await self._get_model(model, ModelType.llm)
+        model_obj = await self.routing_table.get_model(model)
+        if model_obj is None:
+            raise ModelNotFoundError(model)
+        if model_obj.model_type == ModelType.embedding:
+            raise ValueError(f"Model '{model}' is an embedding model and does not support completions")
+
        params = dict(
            model=model_obj.identifier,
            prompt=prompt,
@ -415,29 +457,9 @@ class InferenceRouter(Inference):
            prompt_logprobs=prompt_logprobs,
            suffix=suffix,
        )
+
        provider = await self.routing_table.get_provider_impl(model_obj.identifier)
-        if stream:
-            return await provider.openai_completion(**params)
-            # TODO: Metrics do NOT work with openai_completion stream=True due to the fact
-            # that we do not return an AsyncIterator, our tests expect a stream of chunks we cannot intercept currently.
-            # response_stream = await provider.openai_completion(**params)
-
-        response = await provider.openai_completion(**params)
-        if self.telemetry:
-            metrics = self._construct_metrics(
-                prompt_tokens=response.usage.prompt_tokens,
-                completion_tokens=response.usage.completion_tokens,
-                total_tokens=response.usage.total_tokens,
-                model=model_obj,
-            )
-            for metric in metrics:
-                await self.telemetry.log_event(metric)
-
-            # these metrics will show up in the client response.
-            response.metrics = (
-                metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics
-            )
-        return response
+        return await provider.openai_completion(**params)

    async def openai_chat_completion(
        self,
@ -468,7 +490,11 @@ class InferenceRouter(Inference):
        logger.debug(
            f"InferenceRouter.openai_chat_completion: {model=}, {stream=}, {messages=}",
        )
-        model_obj = await self._get_model(model, ModelType.llm)
+        model_obj = await self.routing_table.get_model(model)
+        if model_obj is None:
+            raise ModelNotFoundError(model)
+        if model_obj.model_type == ModelType.embedding:
+            raise ValueError(f"Model '{model}' is an embedding model and does not support chat completions")

        # Use the OpenAI client for a bit of extra input validation without
        # exposing the OpenAI client itself as part of our API surface
@ -511,38 +537,18 @@ class InferenceRouter(Inference):
            top_p=top_p,
            user=user,
        )
+
        provider = await self.routing_table.get_provider_impl(model_obj.identifier)
        if stream:
            response_stream = await provider.openai_chat_completion(**params)
-
-            # For streaming, the provider returns AsyncIterator[OpenAIChatCompletionChunk]
-            # We need to add metrics to each chunk and store the final completion
-            return self.stream_tokens_and_compute_metrics_openai_chat(
-                response=response_stream,
-                model=model_obj,
-                messages=messages,
-            )
-
-        response = await self._nonstream_openai_chat_completion(provider, params)
-
-        # Store the response with the ID that will be returned to the client
-        if self.store:
-            await self.store.store_chat_completion(response, messages)
-
-        if self.telemetry:
-            metrics = self._construct_metrics(
-                prompt_tokens=response.usage.prompt_tokens,
-                completion_tokens=response.usage.completion_tokens,
-                total_tokens=response.usage.total_tokens,
-                model=model_obj,
-            )
-            for metric in metrics:
-                await self.telemetry.log_event(metric)
-            # these metrics will show up in the client response.
-            response.metrics = (
-                metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics
-            )
-        return response
+            if self.store:
+                return stream_and_store_openai_completion(response_stream, model, self.store, messages)
+            return response_stream
+        else:
+            response = await self._nonstream_openai_chat_completion(provider, params)
+            if self.store:
+                await self.store.store_chat_completion(response, messages)
+            return response

    async def openai_embeddings(
        self,
@ -555,7 +561,12 @@ class InferenceRouter(Inference):
        logger.debug(
            f"InferenceRouter.openai_embeddings: {model=}, input_type={type(input)}, {encoding_format=}, {dimensions=}",
        )
-        model_obj = await self._get_model(model, ModelType.embedding)
+        model_obj = await self.routing_table.get_model(model)
+        if model_obj is None:
+            raise ModelNotFoundError(model)
+        if model_obj.model_type != ModelType.embedding:
+            raise ValueError(f"Model '{model}' is not an embedding model")
+
        params = dict(
            model=model_obj.identifier,
            input=input,
@ -614,245 +625,3 @@ class InferenceRouter(Inference):
                    status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}"
                )
        return health_statuses
-
-    async def stream_tokens_and_compute_metrics(
-        self,
-        response,
-        prompt_tokens,
-        model,
-        tool_prompt_format: ToolPromptFormat | None = None,
-    ) -> AsyncGenerator[ChatCompletionResponseStreamChunk, None] | AsyncGenerator[CompletionResponseStreamChunk, None]:
-        completion_text = ""
-        async for chunk in response:
-            complete = False
-            if hasattr(chunk, "event"):  # only ChatCompletions have .event
-                if chunk.event.event_type == ChatCompletionResponseEventType.progress:
-                    if chunk.event.delta.type == "text":
-                        completion_text += chunk.event.delta.text
-                if chunk.event.event_type == ChatCompletionResponseEventType.complete:
-                    complete = True
-                    completion_tokens = await self._count_tokens(
-                        [
-                            CompletionMessage(
-                                content=completion_text,
-                                stop_reason=StopReason.end_of_turn,
-                            )
-                        ],
-                        tool_prompt_format=tool_prompt_format,
-                    )
-            else:
-                if hasattr(chunk, "delta"):
-                    completion_text += chunk.delta
-                if hasattr(chunk, "stop_reason") and chunk.stop_reason and self.telemetry:
-                    complete = True
-                    completion_tokens = await self._count_tokens(completion_text)
-            # if we are done receiving tokens
-            if complete:
-                total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
-
-                # Create a separate span for streaming completion metrics
-                if self.telemetry:
-                    # Log metrics in the new span context
-                    completion_metrics = self._construct_metrics(
-                        prompt_tokens=prompt_tokens,
-                        completion_tokens=completion_tokens,
-                        total_tokens=total_tokens,
-                        model=model,
-                    )
-                    for metric in completion_metrics:
-                        if metric.metric in [
-                            "completion_tokens",
-                            "total_tokens",
-                        ]:  # Only log completion and total tokens
-                            await self.telemetry.log_event(metric)
-
-                        # Return metrics in response
-                        async_metrics = [
-                            MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics
-                        ]
-                        chunk.metrics = async_metrics if chunk.metrics is None else chunk.metrics + async_metrics
-                else:
-                    # Fallback if no telemetry
-                    completion_metrics = self._construct_metrics(
-                        prompt_tokens or 0,
-                        completion_tokens or 0,
-                        total_tokens,
-                        model,
-                    )
-                    async_metrics = [
-                        MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics
-                    ]
-                    chunk.metrics = async_metrics if chunk.metrics is None else chunk.metrics + async_metrics
-            yield chunk
-
-    async def count_tokens_and_compute_metrics(
-        self,
-        response: ChatCompletionResponse | CompletionResponse,
-        prompt_tokens,
-        model,
-        tool_prompt_format: ToolPromptFormat | None = None,
-    ):
-        if isinstance(response, ChatCompletionResponse):
-            content = [response.completion_message]
-        else:
-            content = response.content
-        completion_tokens = await self._count_tokens(messages=content, tool_prompt_format=tool_prompt_format)
-        total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
-
-        # Create a separate span for completion metrics
-        if self.telemetry:
-            # Log metrics in the new span context
-            completion_metrics = self._construct_metrics(
-                prompt_tokens=prompt_tokens,
-                completion_tokens=completion_tokens,
-                total_tokens=total_tokens,
-                model=model,
-            )
-            for metric in completion_metrics:
-                if metric.metric in ["completion_tokens", "total_tokens"]:  # Only log completion and total tokens
-                    await self.telemetry.log_event(metric)
-
-            # Return metrics in response
-            return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics]
-
-        # Fallback if no telemetry
-        metrics = self._construct_metrics(
-            prompt_tokens or 0,
-            completion_tokens or 0,
-            total_tokens,
-            model,
-        )
-        return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in metrics]
-
-    async def stream_tokens_and_compute_metrics_openai_chat(
-        self,
-        response: AsyncIterator[OpenAIChatCompletionChunk],
-        model: Model,
-        messages: list[OpenAIMessageParam] | None = None,
-    ) -> AsyncIterator[OpenAIChatCompletionChunk]:
-        """Stream OpenAI chat completion chunks, compute metrics, and store the final completion."""
-        id = None
-        created = None
-        choices_data: dict[int, dict[str, Any]] = {}
-
-        try:
-            async for chunk in response:
-                # Skip None chunks
-                if chunk is None:
-                    continue
-
-                # Capture ID and created timestamp from first chunk
-                if id is None and chunk.id:
-                    id = chunk.id
-                if created is None and chunk.created:
-                    created = chunk.created
-
-                # Accumulate choice data for final assembly
-                if chunk.choices:
-                    for choice_delta in chunk.choices:
-                        idx = choice_delta.index
-                        if idx not in choices_data:
-                            choices_data[idx] = {
-                                "content_parts": [],
-                                "tool_calls_builder": {},
-                                "finish_reason": None,
-                                "logprobs_content_parts": [],
-                            }
-                        current_choice_data = choices_data[idx]
-
-                        if choice_delta.delta:
-                            delta = choice_delta.delta
-                            if delta.content:
-                                current_choice_data["content_parts"].append(delta.content)
-                            if delta.tool_calls:
-                                for tool_call_delta in delta.tool_calls:
-                                    tc_idx = tool_call_delta.index
-                                    if tc_idx not in current_choice_data["tool_calls_builder"]:
-                                        current_choice_data["tool_calls_builder"][tc_idx] = {
-                                            "id": None,
-                                            "type": "function",
-                                            "function_name_parts": [],
-                                            "function_arguments_parts": [],
-                                        }
-                                    builder = current_choice_data["tool_calls_builder"][tc_idx]
-                                    if tool_call_delta.id:
-                                        builder["id"] = tool_call_delta.id
-                                    if tool_call_delta.type:
-                                        builder["type"] = tool_call_delta.type
-                                    if tool_call_delta.function:
-                                        if tool_call_delta.function.name:
-                                            builder["function_name_parts"].append(tool_call_delta.function.name)
-                                        if tool_call_delta.function.arguments:
-                                            builder["function_arguments_parts"].append(
-                                                tool_call_delta.function.arguments
-                                            )
-                        if choice_delta.finish_reason:
-                            current_choice_data["finish_reason"] = choice_delta.finish_reason
-                        if choice_delta.logprobs and choice_delta.logprobs.content:
-                            current_choice_data["logprobs_content_parts"].extend(choice_delta.logprobs.content)
-
-                # Compute metrics on final chunk
-                if chunk.choices and chunk.choices[0].finish_reason:
-                    completion_text = ""
-                    for choice_data in choices_data.values():
-                        completion_text += "".join(choice_data["content_parts"])
-
-                    # Add metrics to the chunk
-                    if self.telemetry and chunk.usage:
-                        metrics = self._construct_metrics(
-                            prompt_tokens=chunk.usage.prompt_tokens,
-                            completion_tokens=chunk.usage.completion_tokens,
-                            total_tokens=chunk.usage.total_tokens,
-                            model=model,
-                        )
-                        for metric in metrics:
-                            await self.telemetry.log_event(metric)
-
-                yield chunk
-        finally:
-            # Store the final assembled completion
-            if id and self.store and messages:
-                assembled_choices: list[OpenAIChoice] = []
-                for choice_idx, choice_data in choices_data.items():
-                    content_str = "".join(choice_data["content_parts"])
-                    assembled_tool_calls: list[OpenAIChatCompletionToolCall] = []
-                    if choice_data["tool_calls_builder"]:
-                        for tc_build_data in choice_data["tool_calls_builder"].values():
-                            if tc_build_data["id"]:
-                                func_name = "".join(tc_build_data["function_name_parts"])
-                                func_args = "".join(tc_build_data["function_arguments_parts"])
-                                assembled_tool_calls.append(
-                                    OpenAIChatCompletionToolCall(
-                                        id=tc_build_data["id"],
-                                        type=tc_build_data["type"],
-                                        function=OpenAIChatCompletionToolCallFunction(
-                                            name=func_name, arguments=func_args
-                                        ),
-                                    )
-                                )
-                    message = OpenAIAssistantMessageParam(
-                        role="assistant",
-                        content=content_str if content_str else None,
-                        tool_calls=assembled_tool_calls if assembled_tool_calls else None,
-                    )
-                    logprobs_content = choice_data["logprobs_content_parts"]
-                    final_logprobs = OpenAIChoiceLogprobs(content=logprobs_content) if logprobs_content else None
-
-                    assembled_choices.append(
-                        OpenAIChoice(
-                            finish_reason=choice_data["finish_reason"],
-                            index=choice_idx,
-                            message=message,
-                            logprobs=final_logprobs,
-                        )
-                    )
-
-                final_response = OpenAIChatCompletion(
-                    id=id,
-                    choices=assembled_choices,
-                    created=created or int(time.time()),
-                    model=model.identifier,
-                    object="chat.completion",
-                )
-                logger.debug(f"InferenceRouter.completion_response: {final_response}")
-                await self.store.store_chat_completion(final_response, messages)
--- a/Show more
+++ b/Show more