Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-12-05 10:23:44 +00:00)

Merge branch 'main' into fix-vector

Commit 60b3ac7f10: 508 changed files with 101100 additions and 82743 deletions
.github/actions/install-llama-stack-client/action.yml (vendored, new file, 60 lines)

@@ -0,0 +1,60 @@
name: Install llama-stack-client
description: Install llama-stack-client based on branch context and client-version input

inputs:
  client-version:
    description: 'Client version to install on non-release branches (latest or published). Ignored on release branches.'
    required: false
    default: ""

outputs:
  uv-extra-index-url:
    description: 'UV_EXTRA_INDEX_URL to use (set for release branches)'
    value: ${{ steps.configure.outputs.uv-extra-index-url }}
  install-after-sync:
    description: 'Whether to install client after uv sync'
    value: ${{ steps.configure.outputs.install-after-sync }}
  install-source:
    description: 'Where to install client from after sync'
    value: ${{ steps.configure.outputs.install-source }}

runs:
  using: "composite"
  steps:
    - name: Configure client installation
      id: configure
      shell: bash
      run: |
        # Determine the branch we're working with
        BRANCH="${{ github.base_ref || github.ref }}"
        BRANCH="${BRANCH#refs/heads/}"

        echo "Working with branch: $BRANCH"

        # On release branches: use test.pypi for uv sync, then install from git
        # On non-release branches: install based on client-version after sync
        if [[ "$BRANCH" =~ ^release-[0-9]+\.[0-9]+\.x$ ]]; then
          echo "Detected release branch: $BRANCH"

          # Check if matching branch exists in client repo
          if ! git ls-remote --exit-code --heads https://github.com/llamastack/llama-stack-client-python.git "$BRANCH" > /dev/null 2>&1; then
            echo "::error::Branch $BRANCH not found in llama-stack-client-python repository"
            echo "::error::Please create the matching release branch in llama-stack-client-python before testing"
            exit 1
          fi

          # Configure to use test.pypi as extra index (PyPI is primary)
          echo "uv-extra-index-url=https://test.pypi.org/simple/" >> $GITHUB_OUTPUT
          echo "install-after-sync=true" >> $GITHUB_OUTPUT
          echo "install-source=git+https://github.com/llamastack/llama-stack-client-python.git@$BRANCH" >> $GITHUB_OUTPUT
        elif [ "${{ inputs.client-version }}" = "latest" ]; then
          # Install from main git after sync
          echo "install-after-sync=true" >> $GITHUB_OUTPUT
          echo "install-source=git+https://github.com/llamastack/llama-stack-client-python.git@main" >> $GITHUB_OUTPUT
        elif [ "${{ inputs.client-version }}" = "published" ]; then
          # Use published version from PyPI (installed by sync)
          echo "install-after-sync=false" >> $GITHUB_OUTPUT
        elif [ -n "${{ inputs.client-version }}" ]; then
          echo "::error::Invalid client-version: ${{ inputs.client-version }}"
          exit 1
        fi
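The release-branch detection in this composite action is plain bash, so it can be exercised outside of CI. A minimal sketch follows; the sample BRANCH value is hypothetical (in CI it comes from `${{ github.base_ref || github.ref }}`):

```bash
#!/usr/bin/env bash
# Sketch of the branch classification used by install-llama-stack-client.
BRANCH="refs/heads/release-0.3.x"   # made-up test value
BRANCH="${BRANCH#refs/heads/}"      # strip the refs/heads/ prefix, as the action does

if [[ "$BRANCH" =~ ^release-[0-9]+\.[0-9]+\.x$ ]]; then
  echo "release branch: sync against test.pypi and install the client from the matching git branch"
else
  echo "non-release branch: honor the client-version input (latest or published)"
fi
```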
@@ -94,7 +94,7 @@ runs:
      if: ${{ always() }}
      uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
      with:
        name: logs-${{ github.run_id }}-${{ github.run_attempt || '' }}-${{ strategy.job-index }}
        name: logs-${{ github.run_id }}-${{ github.run_attempt || '1' }}-${{ strategy.job-index || github.job }}-${{ github.action }}
        path: |
          *.log
        retention-days: 1
.github/actions/setup-runner/action.yml (vendored, 30 lines changed)
@@ -18,25 +18,35 @@ runs:
        python-version: ${{ inputs.python-version }}
        version: 0.7.6

    - name: Configure client installation
      id: client-config
      uses: ./.github/actions/install-llama-stack-client
      with:
        client-version: ${{ inputs.client-version }}

    - name: Install dependencies
      shell: bash
      env:
        UV_EXTRA_INDEX_URL: ${{ steps.client-config.outputs.uv-extra-index-url }}
      run: |
        # Export UV env vars for current step and persist to GITHUB_ENV for subsequent steps
        if [ -n "$UV_EXTRA_INDEX_URL" ]; then
          export UV_INDEX_STRATEGY=unsafe-best-match
          echo "UV_EXTRA_INDEX_URL=$UV_EXTRA_INDEX_URL" >> $GITHUB_ENV
          echo "UV_INDEX_STRATEGY=$UV_INDEX_STRATEGY" >> $GITHUB_ENV
          echo "Exported UV environment variables for current and subsequent steps"
        fi

        echo "Updating project dependencies via uv sync"
        uv sync --all-groups

        echo "Installing ad-hoc dependencies"
        uv pip install faiss-cpu

        # Install llama-stack-client-python based on the client-version input
        if [ "${{ inputs.client-version }}" = "latest" ]; then
          echo "Installing latest llama-stack-client-python from main branch"
          uv pip install git+https://github.com/llamastack/llama-stack-client-python.git@main
        elif [ "${{ inputs.client-version }}" = "published" ]; then
          echo "Installing published llama-stack-client-python from PyPI"
          uv pip install llama-stack-client
        else
          echo "Invalid client-version: ${{ inputs.client-version }}"
          exit 1
        # Install specific client version after sync if needed
        if [ "${{ steps.client-config.outputs.install-after-sync }}" = "true" ]; then
          echo "Installing llama-stack-client from: ${{ steps.client-config.outputs.install-source }}"
          uv pip install ${{ steps.client-config.outputs.install-source }}
        fi

        echo "Installed llama packages"
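The "Install dependencies" step above relies on exported variables living only for the current step, while anything appended to GITHUB_ENV is re-exported for later steps. A small sketch of that pattern, assuming a throwaway file stands in for GITHUB_ENV when run locally:

```bash
#!/usr/bin/env bash
# Local sketch of the GITHUB_ENV persistence pattern used in setup-runner.
GITHUB_ENV="${GITHUB_ENV:-/tmp/github_env_demo}"                      # stand-in outside Actions
UV_EXTRA_INDEX_URL="${UV_EXTRA_INDEX_URL:-https://test.pypi.org/simple/}"  # example value

if [ -n "$UV_EXTRA_INDEX_URL" ]; then
  export UV_INDEX_STRATEGY=unsafe-best-match                          # visible to this step only
  echo "UV_EXTRA_INDEX_URL=$UV_EXTRA_INDEX_URL" >> "$GITHUB_ENV"      # visible to later steps
  echo "UV_INDEX_STRATEGY=$UV_INDEX_STRATEGY" >> "$GITHUB_ENV"
fi
cat "$GITHUB_ENV"
```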
@@ -42,18 +42,7 @@ runs:
    - name: Build Llama Stack
      shell: bash
      run: |
        # Install llama-stack-client-python based on the client-version input
        if [ "${{ inputs.client-version }}" = "latest" ]; then
          echo "Installing latest llama-stack-client-python from main branch"
          export LLAMA_STACK_CLIENT_DIR=git+https://github.com/llamastack/llama-stack-client-python.git@main
        elif [ "${{ inputs.client-version }}" = "published" ]; then
          echo "Installing published llama-stack-client-python from PyPI"
          unset LLAMA_STACK_CLIENT_DIR
        else
          echo "Invalid client-version: ${{ inputs.client-version }}"
          exit 1
        fi

        # Client is already installed by setup-runner (handles both main and release branches)
        echo "Building Llama Stack"

        LLAMA_STACK_DIR=. \
.github/mergify.yml (vendored, new file, 23 lines)

@@ -0,0 +1,23 @@
pull_request_rules:
  - name: ping author on conflicts and add 'needs-rebase' label
    conditions:
      - conflict
      - -closed
    actions:
      label:
        add:
          - needs-rebase
      comment:
        message: >
          This pull request has merge conflicts that must be resolved before it
          can be merged. @{{author}} please rebase it.
          https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork

  - name: remove 'needs-rebase' label when conflict is resolved
    conditions:
      - -conflict
      - -closed
    actions:
      label:
        remove:
          - needs-rebase
.github/workflows/README.md (vendored, 2 lines changed)
@@ -4,6 +4,7 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a tabl

| Name | File | Purpose |
| ---- | ---- | ------- |
| Backward Compatibility Check | [backward-compat.yml](backward-compat.yml) | Check backward compatibility for run.yaml configs |
| Update Changelog | [changelog.yml](changelog.yml) | Creates PR for updating the CHANGELOG.md |
| API Conformance Tests | [conformance.yml](conformance.yml) | Run the API Conformance test suite on the changes. |
| Installer CI | [install-script-ci.yml](install-script-ci.yml) | Test the installation script |
@@ -12,7 +13,6 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a tabl
| Integration Tests (Replay) | [integration-tests.yml](integration-tests.yml) | Run the integration test suites from tests/integration in replay mode |
| Vector IO Integration Tests | [integration-vector-io-tests.yml](integration-vector-io-tests.yml) | Run the integration test suite with various VectorIO providers |
| Pre-commit | [pre-commit.yml](pre-commit.yml) | Run pre-commit checks |
| Pre-commit Bot | [precommit-trigger.yml](precommit-trigger.yml) | Pre-commit bot for PR |
| Test Llama Stack Build | [providers-build.yml](providers-build.yml) | Test llama stack build |
| Test llama stack list-deps | [providers-list-deps.yml](providers-list-deps.yml) | Test llama stack list-deps |
| Python Package Build Test | [python-build-test.yml](python-build-test.yml) | Test building the llama-stack PyPI project |
.github/workflows/backward-compat.yml (vendored, new file, 578 lines)

@@ -0,0 +1,578 @@
name: Backward Compatibility Check

run-name: Check backward compatibility for run.yaml configs

on:
  pull_request:
    branches:
      - main
      - 'release-[0-9]+.[0-9]+.[0-9]+.[0-9]+'
      - 'release-[0-9]+.[0-9]+.[0-9]+'
      - 'release-[0-9]+.[0-9]+'
    paths:
      - 'src/llama_stack/core/datatypes.py'
      - 'src/llama_stack/providers/datatypes.py'
      - 'src/llama_stack/distributions/**/run.yaml'
      - 'tests/backward_compat/**'
      - '.github/workflows/backward-compat.yml'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  check-main-compatibility:
    name: Check Compatibility with main
    runs-on: ubuntu-latest

    steps:
      - name: Checkout PR branch
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
        with:
          fetch-depth: 0 # Need full history to access main branch

      - name: Set up Python
        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
        with:
          python-version: '3.12'

      - name: Install uv
        uses: astral-sh/setup-uv@85856786d1ce8acfbcc2f13a5f3fbd6b938f9f41 # v7.1.2
        with:
          enable-cache: true

      - name: Install dependencies
        run: |
          uv sync --group dev

      - name: Extract run.yaml files from main branch
        id: extract_configs
        run: |
          # Get list of run.yaml paths from main
          git fetch origin main
          CONFIG_PATHS=$(git ls-tree -r --name-only origin/main | grep "src/llama_stack/distributions/.*/run.yaml$" || true)

          if [ -z "$CONFIG_PATHS" ]; then
            echo "No run.yaml files found in main branch"
            exit 1
          fi

          # Extract all configs to a temp directory
          mkdir -p /tmp/main_configs
          echo "Extracting configs from main branch:"

          while IFS= read -r config_path; do
            if [ -z "$config_path" ]; then
              continue
            fi

            # Extract filename for storage
            filename=$(basename $(dirname "$config_path"))
            echo " - $filename (from $config_path)"

            git show origin/main:"$config_path" > "/tmp/main_configs/${filename}.yaml"
          done <<< "$CONFIG_PATHS"

          echo ""
          echo "Extracted $(ls /tmp/main_configs/*.yaml | wc -l) config files"

      - name: Test all configs from main
        id: test_configs
        continue-on-error: true
        run: |
          # Run pytest once with all configs parameterized
          if COMPAT_TEST_CONFIGS_DIR=/tmp/main_configs uv run pytest tests/backward_compat/test_run_config.py -v; then
            echo "failed=false" >> $GITHUB_OUTPUT
          else
            echo "failed=true" >> $GITHUB_OUTPUT
            exit 1
          fi

      - name: Check for breaking change acknowledgment
        id: check_ack
        if: steps.test_configs.outputs.failed == 'true'
        run: |
          echo "Breaking changes detected. Checking for acknowledgment..."

          # Check PR title for '!:' marker (conventional commits)
          PR_TITLE="${{ github.event.pull_request.title }}"
          if [[ "$PR_TITLE" =~ ^[a-z]+\!: ]]; then
            echo "✓ Breaking change acknowledged in PR title"
            echo "acknowledged=true" >> $GITHUB_OUTPUT
            exit 0
          fi

          # Check commit messages for BREAKING CHANGE:
          if git log origin/main..HEAD --format=%B | grep -q "BREAKING CHANGE:"; then
            echo "✓ Breaking change acknowledged in commit message"
            echo "acknowledged=true" >> $GITHUB_OUTPUT
            exit 0
          fi

          echo "✗ Breaking change NOT acknowledged"
          echo "acknowledged=false" >> $GITHUB_OUTPUT
        env:
          GH_TOKEN: ${{ github.token }}

      - name: Evaluate results
        if: always()
        run: |
          FAILED="${{ steps.test_configs.outputs.failed }}"
          ACKNOWLEDGED="${{ steps.check_ack.outputs.acknowledged }}"

          if [[ "$FAILED" == "true" ]]; then
            if [[ "$ACKNOWLEDGED" == "true" ]]; then
              echo ""
              echo "⚠️ WARNING: Breaking changes detected but acknowledged"
              echo ""
              echo "This PR introduces backward-incompatible changes to run.yaml."
              echo "The changes have been properly acknowledged."
              echo ""
              exit 0 # Pass the check
            else
              echo ""
              echo "❌ ERROR: Breaking changes detected without acknowledgment"
              echo ""
              echo "This PR introduces backward-incompatible changes to run.yaml"
              echo "that will break existing user configurations."
              echo ""
              echo "To acknowledge this breaking change, do ONE of:"
              echo " 1. Add '!:' to your PR title (e.g., 'feat!: change xyz')"
              echo " 2. Add the 'breaking-change' label to this PR"
              echo " 3. Include 'BREAKING CHANGE:' in a commit message"
              echo ""
              exit 1 # Fail the check
            fi
          fi

  test-integration-main:
    name: Run Integration Tests with main Config
    runs-on: ubuntu-latest

    steps:
      - name: Checkout PR branch
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
        with:
          fetch-depth: 0

      - name: Extract ci-tests run.yaml from main
        run: |
          git fetch origin main
          git show origin/main:src/llama_stack/distributions/ci-tests/run.yaml > /tmp/main-ci-tests-run.yaml
          echo "Extracted ci-tests run.yaml from main branch"

      - name: Setup test environment
        uses: ./.github/actions/setup-test-environment
        with:
          python-version: '3.12'
          client-version: 'latest'
          setup: 'ollama'
          suite: 'base'
          inference-mode: 'replay'

      - name: Run integration tests with main config
        id: test_integration
        continue-on-error: true
        uses: ./.github/actions/run-and-record-tests
        with:
          stack-config: /tmp/main-ci-tests-run.yaml
          setup: 'ollama'
          inference-mode: 'replay'
          suite: 'base'

      - name: Check for breaking change acknowledgment
        id: check_ack
        if: steps.test_integration.outcome == 'failure'
        run: |
          echo "Integration tests failed. Checking for acknowledgment..."

          # Check PR title for '!:' marker (conventional commits)
          PR_TITLE="${{ github.event.pull_request.title }}"
          if [[ "$PR_TITLE" =~ ^[a-z]+\!: ]]; then
            echo "✓ Breaking change acknowledged in PR title"
            echo "acknowledged=true" >> $GITHUB_OUTPUT
            exit 0
          fi

          # Check commit messages for BREAKING CHANGE:
          if git log origin/main..HEAD --format=%B | grep -q "BREAKING CHANGE:"; then
            echo "✓ Breaking change acknowledged in commit message"
            echo "acknowledged=true" >> $GITHUB_OUTPUT
            exit 0
          fi

          echo "✗ Breaking change NOT acknowledged"
          echo "acknowledged=false" >> $GITHUB_OUTPUT
        env:
          GH_TOKEN: ${{ github.token }}

      - name: Evaluate integration test results
        if: always()
        run: |
          TEST_FAILED="${{ steps.test_integration.outcome == 'failure' }}"
          ACKNOWLEDGED="${{ steps.check_ack.outputs.acknowledged }}"

          if [[ "$TEST_FAILED" == "true" ]]; then
            if [[ "$ACKNOWLEDGED" == "true" ]]; then
              echo ""
              echo "⚠️ WARNING: Integration tests failed with main config but acknowledged"
              echo ""
              exit 0 # Pass the check
            else
              echo ""
              echo "❌ ERROR: Integration tests failed with main config without acknowledgment"
              echo ""
              echo "To acknowledge this breaking change, do ONE of:"
              echo " 1. Add '!:' to your PR title (e.g., 'feat!: change xyz')"
              echo " 2. Include 'BREAKING CHANGE:' in a commit message"
              echo ""
              exit 1 # Fail the check
            fi
          fi

  test-integration-release:
    name: Run Integration Tests with Latest Release (Informational)
    runs-on: ubuntu-latest

    steps:
      - name: Checkout PR branch
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
        with:
          fetch-depth: 0

      - name: Get latest release
        id: get_release
        run: |
          # Get the latest release from GitHub
          LATEST_TAG=$(gh release list --limit 1 --json tagName --jq '.[0].tagName' 2>/dev/null || echo "")

          if [ -z "$LATEST_TAG" ]; then
            echo "No releases found, skipping release compatibility check"
            echo "has_release=false" >> $GITHUB_OUTPUT
            exit 0
          fi

          echo "Latest release: $LATEST_TAG"
          echo "has_release=true" >> $GITHUB_OUTPUT
          echo "tag=$LATEST_TAG" >> $GITHUB_OUTPUT
        env:
          GH_TOKEN: ${{ github.token }}

      - name: Extract ci-tests run.yaml from release
        if: steps.get_release.outputs.has_release == 'true'
        id: extract_config
        run: |
          RELEASE_TAG="${{ steps.get_release.outputs.tag }}"

          # Try with src/ prefix first (newer releases), then without (older releases)
          if git show "$RELEASE_TAG:src/llama_stack/distributions/ci-tests/run.yaml" > /tmp/release-ci-tests-run.yaml 2>/dev/null; then
            echo "Extracted ci-tests run.yaml from release $RELEASE_TAG (src/ path)"
            echo "has_config=true" >> $GITHUB_OUTPUT
          elif git show "$RELEASE_TAG:llama_stack/distributions/ci-tests/run.yaml" > /tmp/release-ci-tests-run.yaml 2>/dev/null; then
            echo "Extracted ci-tests run.yaml from release $RELEASE_TAG (old path)"
            echo "has_config=true" >> $GITHUB_OUTPUT
          else
            echo "::warning::ci-tests/run.yaml not found in release $RELEASE_TAG"
            echo "has_config=false" >> $GITHUB_OUTPUT
          fi

      - name: Setup test environment
        if: steps.get_release.outputs.has_release == 'true' && steps.extract_config.outputs.has_config == 'true'
        uses: ./.github/actions/setup-test-environment
        with:
          python-version: '3.12'
          client-version: 'latest'
          setup: 'ollama'
          suite: 'base'
          inference-mode: 'replay'

      - name: Run integration tests with release config (PR branch)
        id: test_release_pr
        if: steps.get_release.outputs.has_release == 'true' && steps.extract_config.outputs.has_config == 'true'
        continue-on-error: true
        uses: ./.github/actions/run-and-record-tests
        with:
          stack-config: /tmp/release-ci-tests-run.yaml
          setup: 'ollama'
          inference-mode: 'replay'
          suite: 'base'

      - name: Checkout main branch to test baseline
        if: steps.get_release.outputs.has_release == 'true' && steps.extract_config.outputs.has_config == 'true'
        run: |
          git checkout origin/main

      - name: Setup test environment for main
        if: steps.get_release.outputs.has_release == 'true' && steps.extract_config.outputs.has_config == 'true'
        uses: ./.github/actions/setup-test-environment
        with:
          python-version: '3.12'
          client-version: 'latest'
          setup: 'ollama'
          suite: 'base'
          inference-mode: 'replay'

      - name: Run integration tests with release config (main branch)
        id: test_release_main
        if: steps.get_release.outputs.has_release == 'true' && steps.extract_config.outputs.has_config == 'true'
        continue-on-error: true
        uses: ./.github/actions/run-and-record-tests
        with:
          stack-config: /tmp/release-ci-tests-run.yaml
          setup: 'ollama'
          inference-mode: 'replay'
          suite: 'base'

      - name: Report results and post PR comment
        if: always() && steps.get_release.outputs.has_release == 'true' && steps.extract_config.outputs.has_config == 'true'
        run: |
          RELEASE_TAG="${{ steps.get_release.outputs.tag }}"
          PR_OUTCOME="${{ steps.test_release_pr.outcome }}"
          MAIN_OUTCOME="${{ steps.test_release_main.outcome }}"

          if [[ "$PR_OUTCOME" == "failure" && "$MAIN_OUTCOME" == "success" ]]; then
            # NEW breaking change - PR fails but main passes
            echo "::error::🚨 This PR introduces a NEW breaking change!"

            # Check if we already posted a comment (to avoid spam on every push)
            EXISTING_COMMENT=$(gh pr view ${{ github.event.pull_request.number }} --json comments --jq '.comments[] | select(.body | contains("🚨 New Breaking Change Detected") and contains("Integration tests")) | .id' | head -1)

            if [[ -z "$EXISTING_COMMENT" ]]; then
              gh pr comment ${{ github.event.pull_request.number }} --body "## 🚨 New Breaking Change Detected

              **Integration tests against release \`$RELEASE_TAG\` are now failing**

              ⚠️ This PR introduces a breaking change that affects compatibility with the latest release.

              - Users on release \`$RELEASE_TAG\` may not be able to upgrade
              - Existing configurations may break

              The tests pass on \`main\` but fail with this PR's changes.

              > **Note:** This is informational only and does not block merge.
              > Consider whether this breaking change is acceptable for users."
            else
              echo "Comment already exists, skipping to avoid spam"
            fi

            cat >> $GITHUB_STEP_SUMMARY <<EOF
            ## 🚨 NEW Breaking Change Detected

            **Integration tests against release \`$RELEASE_TAG\` FAILED**

            ⚠️ **This PR introduces a NEW breaking change**

            - Tests **PASS** on main branch ✅
            - Tests **FAIL** on PR branch ❌
            - Users on release \`$RELEASE_TAG\` may not be able to upgrade
            - Existing configurations may break

            > **Note:** This is informational only and does not block merge.
            > Consider whether this breaking change is acceptable for users.
            EOF

          elif [[ "$PR_OUTCOME" == "failure" ]]; then
            # Existing breaking change - both PR and main fail
            echo "::warning::Breaking change already exists in main branch"

            cat >> $GITHUB_STEP_SUMMARY <<EOF
            ## ⚠️ Release Compatibility Test Failed (Existing Issue)

            **Integration tests against release \`$RELEASE_TAG\` FAILED**

            - Tests **FAIL** on main branch ❌
            - Tests **FAIL** on PR branch ❌
            - This breaking change already exists in main (not introduced by this PR)

            > **Note:** This is informational only.
            EOF

          else
            # Success - tests pass
            cat >> $GITHUB_STEP_SUMMARY <<EOF
            ## ✅ Release Compatibility Test Passed

            Integration tests against release \`$RELEASE_TAG\` passed successfully.
            This PR maintains compatibility with the latest release.
            EOF
          fi
        env:
          GH_TOKEN: ${{ github.token }}

  check-schema-release-compatibility:
    name: Check Schema Compatibility with Latest Release (Informational)
    runs-on: ubuntu-latest

    steps:
      - name: Checkout PR branch
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
        with:
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
        with:
          python-version: '3.12'

      - name: Install uv
        uses: astral-sh/setup-uv@85856786d1ce8acfbcc2f13a5f3fbd6b938f9f41 # v7.1.2
        with:
          enable-cache: true

      - name: Install dependencies
        run: |
          uv sync --group dev

      - name: Get latest release
        id: get_release
        run: |
          # Get the latest release from GitHub
          LATEST_TAG=$(gh release list --limit 1 --json tagName --jq '.[0].tagName' 2>/dev/null || echo "")

          if [ -z "$LATEST_TAG" ]; then
            echo "No releases found, skipping release compatibility check"
            echo "has_release=false" >> $GITHUB_OUTPUT
            exit 0
          fi

          echo "Latest release: $LATEST_TAG"
          echo "has_release=true" >> $GITHUB_OUTPUT
          echo "tag=$LATEST_TAG" >> $GITHUB_OUTPUT
        env:
          GH_TOKEN: ${{ github.token }}

      - name: Extract configs from release
        if: steps.get_release.outputs.has_release == 'true'
        id: extract_release_configs
        run: |
          RELEASE_TAG="${{ steps.get_release.outputs.tag }}"

          # Get run.yaml files from the release (try both src/ and old path)
          CONFIG_PATHS=$(git ls-tree -r --name-only "$RELEASE_TAG" | grep "llama_stack/distributions/.*/run.yaml$" || true)

          if [ -z "$CONFIG_PATHS" ]; then
            echo "::warning::No run.yaml files found in release $RELEASE_TAG"
            echo "has_configs=false" >> $GITHUB_OUTPUT
            exit 0
          fi

          # Extract all configs to a temp directory
          mkdir -p /tmp/release_configs
          echo "Extracting configs from release $RELEASE_TAG:"

          while IFS= read -r config_path; do
            if [ -z "$config_path" ]; then
              continue
            fi

            filename=$(basename $(dirname "$config_path"))
            echo " - $filename (from $config_path)"

            git show "$RELEASE_TAG:$config_path" > "/tmp/release_configs/${filename}.yaml" 2>/dev/null || true
          done <<< "$CONFIG_PATHS"

          echo ""
          echo "Extracted $(ls /tmp/release_configs/*.yaml 2>/dev/null | wc -l) config files"
          echo "has_configs=true" >> $GITHUB_OUTPUT

      - name: Test against release configs (PR branch)
        id: test_schema_pr
        if: steps.get_release.outputs.has_release == 'true' && steps.extract_release_configs.outputs.has_configs == 'true'
        continue-on-error: true
        run: |
          RELEASE_TAG="${{ steps.get_release.outputs.tag }}"
          COMPAT_TEST_CONFIGS_DIR=/tmp/release_configs uv run pytest tests/backward_compat/test_run_config.py -v --tb=short

      - name: Checkout main branch to test baseline
        if: steps.get_release.outputs.has_release == 'true' && steps.extract_release_configs.outputs.has_configs == 'true'
        run: |
          git checkout origin/main

      - name: Install dependencies for main
        if: steps.get_release.outputs.has_release == 'true' && steps.extract_release_configs.outputs.has_configs == 'true'
        run: |
          uv sync --group dev

      - name: Test against release configs (main branch)
        id: test_schema_main
        if: steps.get_release.outputs.has_release == 'true' && steps.extract_release_configs.outputs.has_configs == 'true'
        continue-on-error: true
        run: |
          RELEASE_TAG="${{ steps.get_release.outputs.tag }}"
          COMPAT_TEST_CONFIGS_DIR=/tmp/release_configs uv run pytest tests/backward_compat/test_run_config.py -v --tb=short

      - name: Report results and post PR comment
        if: always() && steps.get_release.outputs.has_release == 'true' && steps.extract_release_configs.outputs.has_configs == 'true'
        run: |
          RELEASE_TAG="${{ steps.get_release.outputs.tag }}"
          PR_OUTCOME="${{ steps.test_schema_pr.outcome }}"
          MAIN_OUTCOME="${{ steps.test_schema_main.outcome }}"

          if [[ "$PR_OUTCOME" == "failure" && "$MAIN_OUTCOME" == "success" ]]; then
            # NEW breaking change - PR fails but main passes
            echo "::error::🚨 This PR introduces a NEW schema breaking change!"

            # Check if we already posted a comment (to avoid spam on every push)
            EXISTING_COMMENT=$(gh pr view ${{ github.event.pull_request.number }} --json comments --jq '.comments[] | select(.body | contains("🚨 New Schema Breaking Change Detected")) | .id' | head -1)

            if [[ -z "$EXISTING_COMMENT" ]]; then
              gh pr comment ${{ github.event.pull_request.number }} --body "## 🚨 New Schema Breaking Change Detected

              **Schema validation against release \`$RELEASE_TAG\` is now failing**

              ⚠️ This PR introduces a schema breaking change that affects compatibility with the latest release.

              - Users on release \`$RELEASE_TAG\` will not be able to upgrade
              - Existing run.yaml configurations will fail validation

              The tests pass on \`main\` but fail with this PR's changes.

              > **Note:** This is informational only and does not block merge.
              > Consider whether this breaking change is acceptable for users."
            else
              echo "Comment already exists, skipping to avoid spam"
            fi

            cat >> $GITHUB_STEP_SUMMARY <<EOF
            ## 🚨 NEW Schema Breaking Change Detected

            **Schema validation against release \`$RELEASE_TAG\` FAILED**

            ⚠️ **This PR introduces a NEW schema breaking change**

            - Tests **PASS** on main branch ✅
            - Tests **FAIL** on PR branch ❌
            - Users on release \`$RELEASE_TAG\` will not be able to upgrade
            - Existing run.yaml configurations will fail validation

            > **Note:** This is informational only and does not block merge.
            > Consider whether this breaking change is acceptable for users.
            EOF

          elif [[ "$PR_OUTCOME" == "failure" ]]; then
            # Existing breaking change - both PR and main fail
            echo "::warning::Schema breaking change already exists in main branch"

            cat >> $GITHUB_STEP_SUMMARY <<EOF
            ## ⚠️ Release Schema Compatibility Failed (Existing Issue)

            **Schema validation against release \`$RELEASE_TAG\` FAILED**

            - Tests **FAIL** on main branch ❌
            - Tests **FAIL** on PR branch ❌
            - This schema breaking change already exists in main (not introduced by this PR)

            > **Note:** This is informational only.
            EOF

          else
            # Success - tests pass
            cat >> $GITHUB_STEP_SUMMARY <<EOF
            ## ✅ Release Schema Compatibility Passed

            All run.yaml configs from release \`$RELEASE_TAG\` are compatible.
            This PR maintains backward compatibility with the latest release.
            EOF
          fi
        env:
          GH_TOKEN: ${{ github.token }}
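The jobs above all gate on the same acknowledgment rules. A standalone sketch of that check, assuming it runs in a clone that has an origin/main ref and using a hypothetical PR title (in CI the title comes from github.event.pull_request.title):

```bash
#!/usr/bin/env bash
# Sketch of the breaking-change acknowledgment check used by backward-compat.yml.
PR_TITLE="feat!: change run.yaml schema"   # hypothetical sample value

if [[ "$PR_TITLE" =~ ^[a-z]+\!: ]]; then
  echo "acknowledged: conventional-commit '!' marker in the PR title"
elif git log origin/main..HEAD --format=%B | grep -q "BREAKING CHANGE:"; then
  echo "acknowledged: BREAKING CHANGE footer found in a commit message"
else
  echo "NOT acknowledged: the check would fail"
fi
```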
.github/workflows/conformance.yml (vendored, 1 line changed)
@@ -22,7 +22,6 @@ on:
      - 'docs/static/stable-llama-stack-spec.yaml' # Stable APIs spec
      - 'docs/static/experimental-llama-stack-spec.yaml' # Experimental APIs spec
      - 'docs/static/deprecated-llama-stack-spec.yaml' # Deprecated APIs spec
      - 'docs/static/llama-stack-spec.html' # Legacy HTML spec
      - '.github/workflows/conformance.yml' # This workflow itself

concurrency:
.github/workflows/install-script-ci.yml (vendored, 10 lines changed)
@@ -30,10 +30,16 @@ jobs:

      - name: Build a single provider
        run: |
          BUILD_ARGS="--build-arg INSTALL_MODE=editable --build-arg DISTRO_NAME=starter"
          if [ -n "${UV_EXTRA_INDEX_URL:-}" ]; then
            BUILD_ARGS="$BUILD_ARGS --build-arg UV_EXTRA_INDEX_URL=$UV_EXTRA_INDEX_URL"
          fi
          if [ -n "${UV_INDEX_STRATEGY:-}" ]; then
            BUILD_ARGS="$BUILD_ARGS --build-arg UV_INDEX_STRATEGY=$UV_INDEX_STRATEGY"
          fi
          docker build . \
            -f containers/Containerfile \
            --build-arg INSTALL_MODE=editable \
            --build-arg DISTRO_NAME=starter \
            $BUILD_ARGS \
            --tag llama-stack:starter-ci

      - name: Run installer end-to-end
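The BUILD_ARGS pattern introduced here (and reused in providers-build.yml below) only forwards the uv index settings when they are present. A sketch that echoes the resulting command instead of invoking docker:

```bash
#!/usr/bin/env bash
# Sketch of the conditional --build-arg assembly; prints the command rather than building.
BUILD_ARGS="--build-arg INSTALL_MODE=editable --build-arg DISTRO_NAME=starter"
if [ -n "${UV_EXTRA_INDEX_URL:-}" ]; then
  BUILD_ARGS="$BUILD_ARGS --build-arg UV_EXTRA_INDEX_URL=$UV_EXTRA_INDEX_URL"
fi
if [ -n "${UV_INDEX_STRATEGY:-}" ]; then
  BUILD_ARGS="$BUILD_ARGS --build-arg UV_INDEX_STRATEGY=$UV_INDEX_STRATEGY"
fi
echo docker build . -f containers/Containerfile $BUILD_ARGS --tag llama-stack:starter-ci
```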
.github/workflows/integration-auth-tests.yml (vendored, 8 lines changed)
@@ -4,9 +4,13 @@ run-name: Run the integration test suite with Kubernetes authentication

on:
  push:
    branches: [ main ]
    branches:
      - main
      - 'release-[0-9]+.[0-9]+.x'
  pull_request:
    branches: [ main ]
    branches:
      - main
      - 'release-[0-9]+.[0-9]+.x'
    paths:
      - 'distributions/**'
      - 'src/llama_stack/**'
@@ -4,9 +4,13 @@ run-name: Run the integration test suite with SqlStore

on:
  push:
    branches: [ main ]
    branches:
      - main
      - 'release-[0-9]+.[0-9]+.x'
  pull_request:
    branches: [ main ]
    branches:
      - main
      - 'release-[0-9]+.[0-9]+.x'
    paths:
      - 'src/llama_stack/providers/utils/sqlstore/**'
      - 'tests/integration/sqlstore/**'
.github/workflows/integration-tests.yml (vendored, 11 lines changed)
@@ -4,9 +4,13 @@ run-name: Run the integration test suites from tests/integration in replay mode

on:
  push:
    branches: [ main ]
    branches:
      - main
      - 'release-[0-9]+.[0-9]+.x'
  pull_request:
    branches: [ main ]
    branches:
      - main
      - 'release-[0-9]+.[0-9]+.x'
    types: [opened, synchronize, reopened]
    paths:
      - 'src/llama_stack/**'

@@ -18,6 +22,7 @@ on:
      - '.github/actions/setup-ollama/action.yml'
      - '.github/actions/setup-test-environment/action.yml'
      - '.github/actions/run-and-record-tests/action.yml'
      - 'scripts/integration-tests.sh'
  schedule:
    # If changing the cron schedule, update the provider in the test-matrix job
    - cron: '0 0 * * *' # (test latest client) Daily at 12 AM UTC

@@ -47,7 +52,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
        client-type: [library, docker]
        client-type: [library, docker, server]
        # Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
        python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
        client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
@@ -4,9 +4,13 @@ run-name: Run the integration test suite with various VectorIO providers

on:
  push:
    branches: [ main ]
    branches:
      - main
      - 'release-[0-9]+.[0-9]+.x'
  pull_request:
    branches: [ main ]
    branches:
      - main
      - 'release-[0-9]+.[0-9]+.x'
    paths:
      - 'src/llama_stack/**'
      - '!src/llama_stack/ui/**'
.github/workflows/pre-commit.yml (vendored, 68 lines changed)
@@ -5,7 +5,9 @@ run-name: Run pre-commit checks
on:
  pull_request:
  push:
    branches: [main]
    branches:
      - main
      - 'release-[0-9]+.[0-9]+.x'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
@@ -43,23 +45,41 @@ jobs:
          cache: 'npm'
          cache-dependency-path: 'src/llama_stack/ui/'

      - name: Set up uv
        uses: astral-sh/setup-uv@85856786d1ce8acfbcc2f13a5f3fbd6b938f9f41 # v7.1.2

      - name: Install npm dependencies
        run: npm ci
        working-directory: src/llama_stack/ui

      - name: Install pre-commit
        run: python -m pip install pre-commit

      - name: Cache pre-commit
        uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4
        with:
          path: ~/.cache/pre-commit
          key: pre-commit-3|${{ env.pythonLocation }}|${{ hashFiles('.pre-commit-config.yaml') }}

      - name: Run pre-commit
        id: precommit
        uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
        continue-on-error: true
        run: |
          set +e
          pre-commit run --show-diff-on-failure --color=always --all-files 2>&1 | tee /tmp/precommit.log
          status=${PIPESTATUS[0]}
          echo "status=$status" >> $GITHUB_OUTPUT
          exit 0
        env:
          SKIP: no-commit-to-branch
          SKIP: no-commit-to-branch,mypy
          RUFF_OUTPUT_FORMAT: github

      - name: Check pre-commit results
        if: steps.precommit.outcome == 'failure'
        if: steps.precommit.outputs.status != '0'
        run: |
          echo "::error::Pre-commit hooks failed. Please run 'pre-commit run --all-files' locally and commit the fixes."
          echo "::warning::Some pre-commit hooks failed. Check the output above for details."
          echo ""
          echo "Failed hooks output:"
          cat /tmp/precommit.log
          exit 1

      - name: Debug
@@ -109,3 +129,39 @@ jobs:
          echo "$unstaged_files"
          exit 1
        fi

      - name: Configure client installation
        id: client-config
        uses: ./.github/actions/install-llama-stack-client

      - name: Sync dev + type_checking dependencies
        env:
          UV_EXTRA_INDEX_URL: ${{ steps.client-config.outputs.uv-extra-index-url }}
        run: |
          if [ -n "$UV_EXTRA_INDEX_URL" ]; then
            export UV_INDEX_STRATEGY="unsafe-best-match"
          fi

          uv sync --group dev --group type_checking

          # Install specific client version after sync if needed
          if [ "${{ steps.client-config.outputs.install-after-sync }}" = "true" ]; then
            echo "Installing llama-stack-client from: ${{ steps.client-config.outputs.install-source }}"
            uv pip install ${{ steps.client-config.outputs.install-source }}
          fi

      - name: Run mypy (full type_checking)
        env:
          UV_EXTRA_INDEX_URL: ${{ steps.client-config.outputs.uv-extra-index-url }}
        run: |
          if [ -n "$UV_EXTRA_INDEX_URL" ]; then
            export UV_INDEX_STRATEGY="unsafe-best-match"
          fi

          set +e
          uv run --group dev --group type_checking mypy
          status=$?
          if [ $status -ne 0 ]; then
            echo "::error::Full mypy failed. Reproduce locally with 'uv run pre-commit run mypy-full --hook-stage manual --all-files'."
          fi
          exit $status
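The reworked "Run pre-commit" step pipes its output through tee, so the hook exit code has to be rescued from PIPESTATUS before the step deliberately exits 0. A sketch of that capture, assuming pre-commit is installed and using a temp file in place of GITHUB_OUTPUT when run locally:

```bash
#!/usr/bin/env bash
# Sketch of the exit-status capture used in pre-commit.yml.
GITHUB_OUTPUT="${GITHUB_OUTPUT:-/tmp/github_output_demo}"
set +e
pre-commit run --show-diff-on-failure --color=always --all-files 2>&1 | tee /tmp/precommit.log
status=${PIPESTATUS[0]}                    # exit code of pre-commit, not of tee
echo "status=$status" >> "$GITHUB_OUTPUT"  # a later step fails the job if status != 0
exit 0
```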
.github/workflows/precommit-trigger.yml (vendored, deleted file, 227 lines)
@@ -1,227 +0,0 @@
name: Pre-commit Bot

run-name: Pre-commit bot for PR #${{ github.event.issue.number }}

on:
  issue_comment:
    types: [created]

jobs:
  pre-commit:
    # Only run on pull request comments
    if: github.event.issue.pull_request && contains(github.event.comment.body, '@github-actions run precommit')
    runs-on: ubuntu-latest
    permissions:
      contents: write
      pull-requests: write

    steps:
      - name: Check comment author and get PR details
        id: check_author
        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          script: |
            // Get PR details
            const pr = await github.rest.pulls.get({
              owner: context.repo.owner,
              repo: context.repo.repo,
              pull_number: context.issue.number
            });

            // Check if commenter has write access or is the PR author
            const commenter = context.payload.comment.user.login;
            const prAuthor = pr.data.user.login;

            let hasPermission = false;

            // Check if commenter is PR author
            if (commenter === prAuthor) {
              hasPermission = true;
              console.log(`Comment author ${commenter} is the PR author`);
            } else {
              // Check if commenter has write/admin access
              try {
                const permission = await github.rest.repos.getCollaboratorPermissionLevel({
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  username: commenter
                });

                const level = permission.data.permission;
                hasPermission = ['write', 'admin', 'maintain'].includes(level);
                console.log(`Comment author ${commenter} has permission: ${level}`);
              } catch (error) {
                console.log(`Could not check permissions for ${commenter}: ${error.message}`);
              }
            }

            if (!hasPermission) {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: context.issue.number,
                body: `❌ @${commenter} You don't have permission to trigger pre-commit. Only PR authors or repository collaborators can run this command.`
              });
              core.setFailed(`User ${commenter} does not have permission`);
              return;
            }

            // Save PR info for later steps
            core.setOutput('pr_number', context.issue.number);
            core.setOutput('pr_head_ref', pr.data.head.ref);
            core.setOutput('pr_head_sha', pr.data.head.sha);
            core.setOutput('pr_head_repo', pr.data.head.repo.full_name);
            core.setOutput('pr_base_ref', pr.data.base.ref);
            core.setOutput('is_fork', pr.data.head.repo.full_name !== context.payload.repository.full_name);
            core.setOutput('authorized', 'true');

      - name: React to comment
        if: steps.check_author.outputs.authorized == 'true'
        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          script: |
            await github.rest.reactions.createForIssueComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              comment_id: context.payload.comment.id,
              content: 'rocket'
            });

      - name: Comment starting
        if: steps.check_author.outputs.authorized == 'true'
        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          script: |
            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: ${{ steps.check_author.outputs.pr_number }},
              body: `⏳ Running [pre-commit hooks](https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}) on PR #${{ steps.check_author.outputs.pr_number }}...`
            });

      - name: Checkout PR branch (same-repo)
        if: steps.check_author.outputs.authorized == 'true' && steps.check_author.outputs.is_fork == 'false'
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
        with:
          ref: ${{ steps.check_author.outputs.pr_head_ref }}
          fetch-depth: 0
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Checkout PR branch (fork)
        if: steps.check_author.outputs.authorized == 'true' && steps.check_author.outputs.is_fork == 'true'
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
        with:
          repository: ${{ steps.check_author.outputs.pr_head_repo }}
          ref: ${{ steps.check_author.outputs.pr_head_ref }}
          fetch-depth: 0
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Verify checkout
        if: steps.check_author.outputs.authorized == 'true'
        run: |
          echo "Current SHA: $(git rev-parse HEAD)"
          echo "Expected SHA: ${{ steps.check_author.outputs.pr_head_sha }}"
          if [[ "$(git rev-parse HEAD)" != "${{ steps.check_author.outputs.pr_head_sha }}" ]]; then
            echo "::error::Checked out SHA does not match expected SHA"
            exit 1
          fi

      - name: Set up Python
        if: steps.check_author.outputs.authorized == 'true'
        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
        with:
          python-version: '3.12'
          cache: pip
          cache-dependency-path: |
            **/requirements*.txt
            .pre-commit-config.yaml

      - name: Set up Node.js
        if: steps.check_author.outputs.authorized == 'true'
        uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
        with:
          node-version: '20'
          cache: 'npm'
          cache-dependency-path: 'src/llama_stack/ui/'

      - name: Install npm dependencies
        if: steps.check_author.outputs.authorized == 'true'
        run: npm ci
        working-directory: src/llama_stack/ui

      - name: Run pre-commit
        if: steps.check_author.outputs.authorized == 'true'
        id: precommit
        uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
        continue-on-error: true
        env:
          SKIP: no-commit-to-branch
          RUFF_OUTPUT_FORMAT: github

      - name: Check for changes
        if: steps.check_author.outputs.authorized == 'true'
        id: changes
        run: |
          if ! git diff --exit-code || [ -n "$(git ls-files --others --exclude-standard)" ]; then
            echo "has_changes=true" >> $GITHUB_OUTPUT
            echo "Changes detected after pre-commit"
          else
            echo "has_changes=false" >> $GITHUB_OUTPUT
            echo "No changes after pre-commit"
          fi

      - name: Commit and push changes
        if: steps.check_author.outputs.authorized == 'true' && steps.changes.outputs.has_changes == 'true'
        run: |
          git config --local user.email "github-actions[bot]@users.noreply.github.com"
          git config --local user.name "github-actions[bot]"

          git add -A
          git commit -m "style: apply pre-commit fixes

          🤖 Applied by @github-actions bot via pre-commit workflow"

          # Push changes
          git push origin HEAD:${{ steps.check_author.outputs.pr_head_ref }}

      - name: Comment success with changes
        if: steps.check_author.outputs.authorized == 'true' && steps.changes.outputs.has_changes == 'true'
        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          script: |
            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: ${{ steps.check_author.outputs.pr_number }},
              body: `✅ Pre-commit hooks completed successfully!\n\n🔧 Changes have been committed and pushed to the PR branch.`
            });

      - name: Comment success without changes
        if: steps.check_author.outputs.authorized == 'true' && steps.changes.outputs.has_changes == 'false' && steps.precommit.outcome == 'success'
        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          script: |
            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: ${{ steps.check_author.outputs.pr_number }},
              body: `✅ Pre-commit hooks passed!\n\n✨ No changes needed - your code is already formatted correctly.`
            });

      - name: Comment failure
        if: failure()
        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          script: |
            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: ${{ steps.check_author.outputs.pr_number }},
              body: `❌ Pre-commit workflow failed!\n\nPlease check the [workflow logs](https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}) for details.`
            });
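The deleted bot authorized commenters through the collaborator-permission API via actions/github-script. The same lookup can be done from the GitHub REST API with the gh CLI; this is a hedged sketch with hypothetical owner/repo/user values, not part of the workflow above:

```bash
#!/usr/bin/env bash
# Sketch: check whether a user could have triggered the bot (values are placeholders).
OWNER="llamastack"; REPO="llama-stack"; USER="some-contributor"
LEVEL=$(gh api "repos/$OWNER/$REPO/collaborators/$USER/permission" --jq .permission)
case "$LEVEL" in
  admin|write) echo "$USER has $LEVEL access and could trigger the bot" ;;
  *)           echo "$USER has only '$LEVEL' access" ;;
esac
```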
.github/workflows/providers-build.yml (vendored, 38 lines changed)
@@ -72,10 +72,16 @@ jobs:
      - name: Build container image
        if: matrix.image-type == 'container'
        run: |
          BUILD_ARGS="--build-arg INSTALL_MODE=editable --build-arg DISTRO_NAME=${{ matrix.distro }}"
          if [ -n "${UV_EXTRA_INDEX_URL:-}" ]; then
            BUILD_ARGS="$BUILD_ARGS --build-arg UV_EXTRA_INDEX_URL=$UV_EXTRA_INDEX_URL"
          fi
          if [ -n "${UV_INDEX_STRATEGY:-}" ]; then
            BUILD_ARGS="$BUILD_ARGS --build-arg UV_INDEX_STRATEGY=$UV_INDEX_STRATEGY"
          fi
          docker build . \
            -f containers/Containerfile \
            --build-arg INSTALL_MODE=editable \
            --build-arg DISTRO_NAME=${{ matrix.distro }} \
            $BUILD_ARGS \
            --tag llama-stack:${{ matrix.distro }}-ci

      - name: Print dependencies in the image

@@ -108,12 +114,18 @@ jobs:
      - name: Build container image
        run: |
          BASE_IMAGE=$(yq -r '.distribution_spec.container_image // "python:3.12-slim"' src/llama_stack/distributions/ci-tests/build.yaml)
          BUILD_ARGS="--build-arg INSTALL_MODE=editable --build-arg DISTRO_NAME=ci-tests"
          BUILD_ARGS="$BUILD_ARGS --build-arg BASE_IMAGE=$BASE_IMAGE"
          BUILD_ARGS="$BUILD_ARGS --build-arg RUN_CONFIG_PATH=/workspace/src/llama_stack/distributions/ci-tests/run.yaml"
          if [ -n "${UV_EXTRA_INDEX_URL:-}" ]; then
            BUILD_ARGS="$BUILD_ARGS --build-arg UV_EXTRA_INDEX_URL=$UV_EXTRA_INDEX_URL"
          fi
          if [ -n "${UV_INDEX_STRATEGY:-}" ]; then
            BUILD_ARGS="$BUILD_ARGS --build-arg UV_INDEX_STRATEGY=$UV_INDEX_STRATEGY"
          fi
          docker build . \
            -f containers/Containerfile \
            --build-arg INSTALL_MODE=editable \
            --build-arg DISTRO_NAME=ci-tests \
            --build-arg BASE_IMAGE="$BASE_IMAGE" \
            --build-arg RUN_CONFIG_PATH=/workspace/src/llama_stack/distributions/ci-tests/run.yaml \
            $BUILD_ARGS \
            -t llama-stack:ci-tests

      - name: Inspect the container image entrypoint

@@ -148,12 +160,18 @@ jobs:
      - name: Build UBI9 container image
        run: |
          BASE_IMAGE=$(yq -r '.distribution_spec.container_image // "registry.access.redhat.com/ubi9:latest"' src/llama_stack/distributions/ci-tests/build.yaml)
          BUILD_ARGS="--build-arg INSTALL_MODE=editable --build-arg DISTRO_NAME=ci-tests"
          BUILD_ARGS="$BUILD_ARGS --build-arg BASE_IMAGE=$BASE_IMAGE"
          BUILD_ARGS="$BUILD_ARGS --build-arg RUN_CONFIG_PATH=/workspace/src/llama_stack/distributions/ci-tests/run.yaml"
          if [ -n "${UV_EXTRA_INDEX_URL:-}" ]; then
            BUILD_ARGS="$BUILD_ARGS --build-arg UV_EXTRA_INDEX_URL=$UV_EXTRA_INDEX_URL"
          fi
          if [ -n "${UV_INDEX_STRATEGY:-}" ]; then
            BUILD_ARGS="$BUILD_ARGS --build-arg UV_INDEX_STRATEGY=$UV_INDEX_STRATEGY"
          fi
          docker build . \
            -f containers/Containerfile \
            --build-arg INSTALL_MODE=editable \
            --build-arg DISTRO_NAME=ci-tests \
            --build-arg BASE_IMAGE="$BASE_IMAGE" \
            --build-arg RUN_CONFIG_PATH=/workspace/src/llama_stack/distributions/ci-tests/run.yaml \
            $BUILD_ARGS \
            -t llama-stack:ci-tests-ubi9

      - name: Inspect UBI9 image
.github/workflows/python-build-test.yml (vendored, 2 lines changed)
@@ -24,7 +24,7 @@ jobs:
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

      - name: Install uv
        uses: astral-sh/setup-uv@2ddd2b9cb38ad8efd50337e8ab201519a34c9f24 # v7.1.1
        uses: astral-sh/setup-uv@85856786d1ce8acfbcc2f13a5f3fbd6b938f9f41 # v7.1.2
        with:
          python-version: ${{ matrix.python-version }}
          activate-environment: true
.github/workflows/unit-tests.yml (vendored, 8 lines changed)
@@ -4,9 +4,13 @@ run-name: Run the unit test suite

on:
  push:
    branches: [ main ]
    branches:
      - main
      - 'release-[0-9]+.[0-9]+.x'
  pull_request:
    branches: [ main ]
    branches:
      - main
      - 'release-[0-9]+.[0-9]+.x'
    paths:
      - 'src/llama_stack/**'
      - '!src/llama_stack/ui/**'
.gitignore (vendored, 3 lines changed)
@@ -32,3 +32,6 @@ CLAUDE.md
docs/.docusaurus/
docs/node_modules/
docs/static/imported-files/
docs/docs/api-deprecated/
docs/docs/api-experimental/
docs/docs/api/
@@ -52,13 +52,9 @@ repos:
        additional_dependencies:
          - black==24.3.0

  - repo: https://github.com/astral-sh/uv-pre-commit
    rev: 0.7.20
    hooks:
      - id: uv-lock

  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.16.1
    rev: v1.18.2
    hooks:
      - id: mypy
        additional_dependencies:

@@ -78,11 +74,26 @@ repos:

  - repo: local
    hooks:
      - id: uv-lock
        name: uv-lock
        additional_dependencies:
          - uv==0.7.20
        entry: ./scripts/uv-run-with-index.sh lock
        language: python
        pass_filenames: false
        require_serial: true
        files: ^(pyproject\.toml|uv\.lock)$
      - id: mypy-full
        name: mypy (full type_checking)
        entry: ./scripts/uv-run-with-index.sh run --group dev --group type_checking mypy
        language: system
        pass_filenames: false
        stages: [manual]
      - id: distro-codegen
        name: Distribution Template Codegen
        additional_dependencies:
          - uv==0.7.8
        entry: uv run --group codegen ./scripts/distro_codegen.py
        entry: ./scripts/uv-run-with-index.sh run --group codegen ./scripts/distro_codegen.py
        language: python
        pass_filenames: false
        require_serial: true

@@ -91,7 +102,7 @@ repos:
        name: Provider Codegen
        additional_dependencies:
          - uv==0.7.8
        entry: uv run --group codegen ./scripts/provider_codegen.py
        entry: ./scripts/uv-run-with-index.sh run --group codegen ./scripts/provider_codegen.py
        language: python
        pass_filenames: false
        require_serial: true

@@ -100,7 +111,7 @@ repos:
        name: API Spec Codegen
        additional_dependencies:
          - uv==0.7.8
        entry: sh -c 'uv run ./docs/openapi_generator/run_openapi_generator.sh > /dev/null'
        entry: sh -c './scripts/uv-run-with-index.sh run ./docs/openapi_generator/run_openapi_generator.sh > /dev/null'
        language: python
        pass_filenames: false
        require_serial: true

@@ -141,7 +152,7 @@ repos:
        name: Generate CI documentation
        additional_dependencies:
          - uv==0.7.8
        entry: uv run ./scripts/gen-ci-docs.py
        entry: ./scripts/uv-run-with-index.sh run ./scripts/gen-ci-docs.py
        language: python
        pass_filenames: false
        require_serial: true

@@ -172,6 +183,23 @@ repos:
            exit 1
          fi
          exit 0
      - id: fips-compliance
        name: Ensure llama-stack remains FIPS compliant
        entry: bash
        language: system
        types: [python]
        pass_filenames: true
        exclude: '^tests/.*$' # Exclude test dir as some safety tests used MD5
        args:
          - -c
          - |
            grep -EnH '^[^#]*\b(md5|sha1|uuid3|uuid5)\b' "$@" && {
              echo;
              echo "❌ Do not use any of the following functions: hashlib.md5, hashlib.sha1, uuid.uuid3, uuid.uuid5"
              echo "  These functions are not FIPS-compliant"
              echo;
              exit 1;
            } || true

ci:
  autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks
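The fips-compliance hook added above is a plain grep over the staged Python files, so it can be reproduced locally. A sketch with a hypothetical file list (pre-commit normally supplies the filenames):

```bash
#!/usr/bin/env bash
# Sketch of the FIPS check: flag non-comment lines that call md5/sha1/uuid3/uuid5.
FILES="src/llama_stack/example.py"   # placeholder; pre-commit passes the real file list
grep -EnH '^[^#]*\b(md5|sha1|uuid3|uuid5)\b' $FILES && {
  echo
  echo "❌ hashlib.md5, hashlib.sha1, uuid.uuid3 and uuid.uuid5 are not FIPS-compliant"
  exit 1
} || true
```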
@@ -61,6 +61,18 @@ uv run pre-commit run --all-files -v

The `-v` (verbose) parameter is optional but often helpful for getting more information about any issues with that the pre-commit checks identify.

To run the expanded mypy configuration that CI enforces, use:

```bash
uv run pre-commit run mypy-full --hook-stage manual --all-files
```

or invoke mypy directly with all optional dependencies:

```bash
uv run --group dev --group type_checking mypy
```

```{caution}
Before pushing your changes, make sure that the pre-commit hooks have passed successfully.
```
@ -1,610 +0,0 @@
|
|||
# yaml-language-server: $schema=https://app.stainlessapi.com/config-internal.schema.json
|
||||
|
||||
organization:
|
||||
# Name of your organization or company, used to determine the name of the client
|
||||
# and headings.
|
||||
name: llama-stack-client
|
||||
docs: https://llama-stack.readthedocs.io/en/latest/
|
||||
contact: llamastack@meta.com
|
||||
security:
|
||||
- {}
|
||||
- BearerAuth: []
|
||||
security_schemes:
|
||||
BearerAuth:
|
||||
type: http
|
||||
scheme: bearer
|
||||
# `targets` define the output targets and their customization options, such as
|
||||
# whether to emit the Node SDK and what its package name should be.
|
||||
targets:
|
||||
node:
|
||||
package_name: llama-stack-client
|
||||
production_repo: llamastack/llama-stack-client-typescript
|
||||
publish:
|
||||
npm: false
|
||||
python:
|
||||
package_name: llama_stack_client
|
||||
production_repo: llamastack/llama-stack-client-python
|
||||
options:
|
||||
use_uv: true
|
||||
publish:
|
||||
pypi: true
|
||||
project_name: llama_stack_client
|
||||
kotlin:
|
||||
reverse_domain: com.llama_stack_client.api
|
||||
production_repo: null
|
||||
publish:
|
||||
maven: false
|
||||
go:
|
||||
package_name: llama-stack-client
|
||||
production_repo: llamastack/llama-stack-client-go
|
||||
options:
|
||||
enable_v2: true
|
||||
back_compat_use_shared_package: false
|
||||
|
||||
# `client_settings` define settings for the API client, such as extra constructor
|
||||
# arguments (used for authentication), retry behavior, idempotency, etc.
|
||||
client_settings:
|
||||
default_env_prefix: LLAMA_STACK_CLIENT
|
||||
opts:
|
||||
api_key:
|
||||
type: string
|
||||
read_env: LLAMA_STACK_CLIENT_API_KEY
|
||||
auth: { security_scheme: BearerAuth }
|
||||
nullable: true
|
||||
|
||||
# `environments` are a map of the name of the environment (e.g. "sandbox",
|
||||
# "production") to the corresponding url to use.
|
||||
environments:
|
||||
production: http://any-hosted-llama-stack.com
|
||||
|
||||
# `pagination` defines [pagination schemes] which provide a template to match
|
||||
# endpoints and generate next-page and auto-pagination helpers in the SDKs.
|
||||
pagination:
|
||||
- name: datasets_iterrows
|
||||
type: offset
|
||||
request:
|
||||
dataset_id:
|
||||
type: string
|
||||
start_index:
|
||||
type: integer
|
||||
x-stainless-pagination-property:
|
||||
purpose: offset_count_param
|
||||
limit:
|
||||
type: integer
|
||||
response:
|
||||
data:
|
||||
type: array
|
||||
items:
|
||||
type: object
|
||||
next_index:
|
||||
type: integer
|
||||
x-stainless-pagination-property:
|
||||
purpose: offset_count_start_field
|
||||
- name: openai_cursor_page
|
||||
type: cursor
|
||||
request:
|
||||
limit:
|
||||
type: integer
|
||||
after:
|
||||
type: string
|
||||
x-stainless-pagination-property:
|
||||
purpose: next_cursor_param
|
||||
response:
|
||||
data:
|
||||
type: array
|
||||
items: {}
|
||||
has_more:
|
||||
type: boolean
|
||||
last_id:
|
||||
type: string
|
||||
x-stainless-pagination-property:
|
||||
purpose: next_cursor_field
|
||||
# `resources` define the structure and organization for your API, such as how
|
||||
# methods and models are grouped together and accessed. See the [configuration
|
||||
# guide] for more information.
|
||||
#
|
||||
# [configuration guide]:
|
||||
# https://app.stainlessapi.com/docs/guides/configure#resources
|
||||
resources:
|
||||
$shared:
|
||||
models:
|
||||
agent_config: AgentConfig
|
||||
interleaved_content_item: InterleavedContentItem
|
||||
interleaved_content: InterleavedContent
|
||||
param_type: ParamType
|
||||
safety_violation: SafetyViolation
|
||||
sampling_params: SamplingParams
|
||||
scoring_result: ScoringResult
|
||||
message: Message
|
||||
user_message: UserMessage
|
||||
completion_message: CompletionMessage
|
||||
tool_response_message: ToolResponseMessage
|
||||
system_message: SystemMessage
|
||||
tool_call: ToolCall
|
||||
query_result: RAGQueryResult
|
||||
document: RAGDocument
|
||||
query_config: RAGQueryConfig
|
||||
response_format: ResponseFormat
|
||||
toolgroups:
|
||||
models:
|
||||
tool_group: ToolGroup
|
||||
list_tool_groups_response: ListToolGroupsResponse
|
||||
methods:
|
||||
register: post /v1/toolgroups
|
||||
get: get /v1/toolgroups/{toolgroup_id}
|
||||
list: get /v1/toolgroups
|
||||
unregister: delete /v1/toolgroups/{toolgroup_id}
|
||||
tools:
|
||||
methods:
|
||||
get: get /v1/tools/{tool_name}
|
||||
list:
|
||||
endpoint: get /v1/tools
|
||||
paginated: false
|
||||
|
||||
tool_runtime:
|
||||
models:
|
||||
tool_def: ToolDef
|
||||
tool_invocation_result: ToolInvocationResult
|
||||
methods:
|
||||
list_tools:
|
||||
endpoint: get /v1/tool-runtime/list-tools
|
||||
paginated: false
|
||||
invoke_tool: post /v1/tool-runtime/invoke
|
||||
subresources:
|
||||
rag_tool:
|
||||
methods:
|
||||
insert: post /v1/tool-runtime/rag-tool/insert
|
||||
query: post /v1/tool-runtime/rag-tool/query
|
||||
|
||||
responses:
|
||||
models:
|
||||
response_object_stream: OpenAIResponseObjectStream
|
||||
response_object: OpenAIResponseObject
|
||||
methods:
|
||||
create:
|
||||
type: http
|
||||
endpoint: post /v1/responses
|
||||
streaming:
|
||||
stream_event_model: responses.response_object_stream
|
||||
param_discriminator: stream
|
||||
retrieve: get /v1/responses/{response_id}
|
||||
list:
|
||||
type: http
|
||||
endpoint: get /v1/responses
|
||||
delete:
|
||||
type: http
|
||||
endpoint: delete /v1/responses/{response_id}
|
||||
subresources:
|
||||
input_items:
|
||||
methods:
|
||||
list:
|
||||
type: http
|
||||
endpoint: get /v1/responses/{response_id}/input_items
|
||||
|
||||
conversations:
|
||||
models:
|
||||
conversation_object: Conversation
|
||||
methods:
|
||||
create:
|
||||
type: http
|
||||
endpoint: post /v1/conversations
|
||||
retrieve: get /v1/conversations/{conversation_id}
|
||||
update:
|
||||
type: http
|
||||
endpoint: post /v1/conversations/{conversation_id}
|
||||
delete:
|
||||
type: http
|
||||
endpoint: delete /v1/conversations/{conversation_id}
|
||||
subresources:
|
||||
items:
|
||||
methods:
|
||||
get:
|
||||
type: http
|
||||
endpoint: get /v1/conversations/{conversation_id}/items/{item_id}
|
||||
list:
|
||||
type: http
|
||||
endpoint: get /v1/conversations/{conversation_id}/items
|
||||
create:
|
||||
type: http
|
||||
endpoint: post /v1/conversations/{conversation_id}/items
|
||||
|
||||
inspect:
|
||||
models:
|
||||
healthInfo: HealthInfo
|
||||
providerInfo: ProviderInfo
|
||||
routeInfo: RouteInfo
|
||||
versionInfo: VersionInfo
|
||||
methods:
|
||||
health: get /v1/health
|
||||
version: get /v1/version
|
||||
|
||||
embeddings:
|
||||
models:
|
||||
create_embeddings_response: OpenAIEmbeddingsResponse
|
||||
methods:
|
||||
create: post /v1/embeddings
|
||||
|
||||
chat:
|
||||
models:
|
||||
chat_completion_chunk: OpenAIChatCompletionChunk
|
||||
subresources:
|
||||
completions:
|
||||
methods:
|
||||
create:
|
||||
type: http
|
||||
endpoint: post /v1/chat/completions
|
||||
streaming:
|
||||
stream_event_model: chat.chat_completion_chunk
|
||||
param_discriminator: stream
|
||||
list:
|
||||
type: http
|
||||
endpoint: get /v1/chat/completions
|
||||
retrieve:
|
||||
type: http
|
||||
endpoint: get /v1/chat/completions/{completion_id}
|
||||
completions:
|
||||
methods:
|
||||
create:
|
||||
type: http
|
||||
endpoint: post /v1/completions
|
||||
streaming:
|
||||
param_discriminator: stream
|
||||
|
||||
vector_io:
|
||||
models:
|
||||
queryChunksResponse: QueryChunksResponse
|
||||
methods:
|
||||
insert: post /v1/vector-io/insert
|
||||
query: post /v1/vector-io/query
|
||||
|
||||
vector_stores:
|
||||
models:
|
||||
vector_store: VectorStoreObject
|
||||
list_vector_stores_response: VectorStoreListResponse
|
||||
vector_store_delete_response: VectorStoreDeleteResponse
|
||||
vector_store_search_response: VectorStoreSearchResponsePage
|
||||
methods:
|
||||
create: post /v1/vector_stores
|
||||
list:
|
||||
endpoint: get /v1/vector_stores
|
||||
retrieve: get /v1/vector_stores/{vector_store_id}
|
||||
update: post /v1/vector_stores/{vector_store_id}
|
||||
delete: delete /v1/vector_stores/{vector_store_id}
|
||||
search: post /v1/vector_stores/{vector_store_id}/search
|
||||
subresources:
|
||||
files:
|
||||
models:
|
||||
vector_store_file: VectorStoreFileObject
|
||||
methods:
|
||||
list: get /v1/vector_stores/{vector_store_id}/files
|
||||
retrieve: get /v1/vector_stores/{vector_store_id}/files/{file_id}
|
||||
update: post /v1/vector_stores/{vector_store_id}/files/{file_id}
|
||||
delete: delete /v1/vector_stores/{vector_store_id}/files/{file_id}
|
||||
create: post /v1/vector_stores/{vector_store_id}/files
|
||||
content: get /v1/vector_stores/{vector_store_id}/files/{file_id}/content
|
||||
file_batches:
|
||||
models:
|
||||
vector_store_file_batches: VectorStoreFileBatchObject
|
||||
list_vector_store_files_in_batch_response: VectorStoreFilesListInBatchResponse
|
||||
methods:
|
||||
create: post /v1/vector_stores/{vector_store_id}/file_batches
|
||||
retrieve: get /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}
|
||||
list_files: get /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/files
|
||||
cancel: post /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/cancel
|
||||
|
||||
models:
|
||||
models:
|
||||
model: Model
|
||||
list_models_response: ListModelsResponse
|
||||
methods:
|
||||
retrieve: get /v1/models/{model_id}
|
||||
list:
|
||||
endpoint: get /v1/models
|
||||
paginated: false
|
||||
register: post /v1/models
|
||||
unregister: delete /v1/models/{model_id}
|
||||
subresources:
|
||||
openai:
|
||||
methods:
|
||||
list:
|
||||
endpoint: get /v1/models
|
||||
paginated: false
|
||||
|
||||
providers:
|
||||
models:
|
||||
list_providers_response: ListProvidersResponse
|
||||
methods:
|
||||
list:
|
||||
endpoint: get /v1/providers
|
||||
paginated: false
|
||||
retrieve: get /v1/providers/{provider_id}
|
||||
|
||||
routes:
|
||||
models:
|
||||
list_routes_response: ListRoutesResponse
|
||||
methods:
|
||||
list:
|
||||
endpoint: get /v1/inspect/routes
|
||||
paginated: false
|
||||
|
||||
|
||||
moderations:
|
||||
models:
|
||||
create_response: ModerationObject
|
||||
methods:
|
||||
create: post /v1/moderations
|
||||
|
||||
|
||||
safety:
|
||||
models:
|
||||
run_shield_response: RunShieldResponse
|
||||
methods:
|
||||
run_shield: post /v1/safety/run-shield
|
||||
|
||||
|
||||
shields:
|
||||
models:
|
||||
shield: Shield
|
||||
list_shields_response: ListShieldsResponse
|
||||
methods:
|
||||
retrieve: get /v1/shields/{identifier}
|
||||
list:
|
||||
endpoint: get /v1/shields
|
||||
paginated: false
|
||||
register: post /v1/shields
|
||||
delete: delete /v1/shields/{identifier}
|
||||
|
||||
synthetic_data_generation:
|
||||
models:
|
||||
syntheticDataGenerationResponse: SyntheticDataGenerationResponse
|
||||
methods:
|
||||
generate: post /v1/synthetic-data-generation/generate
|
||||
|
||||
telemetry:
|
||||
models:
|
||||
span_with_status: SpanWithStatus
|
||||
trace: Trace
|
||||
query_spans_response: QuerySpansResponse
|
||||
event: Event
|
||||
query_condition: QueryCondition
|
||||
methods:
|
||||
query_traces:
|
||||
endpoint: post /v1alpha/telemetry/traces
|
||||
skip_test_reason: 'unsupported query params in java / kotlin'
|
||||
get_span_tree: post /v1alpha/telemetry/spans/{span_id}/tree
|
||||
query_spans:
|
||||
endpoint: post /v1alpha/telemetry/spans
|
||||
skip_test_reason: 'unsupported query params in java / kotlin'
|
||||
query_metrics:
|
||||
endpoint: post /v1alpha/telemetry/metrics/{metric_name}
|
||||
skip_test_reason: 'unsupported query params in java / kotlin'
|
||||
# log_event: post /v1alpha/telemetry/events
|
||||
save_spans_to_dataset: post /v1alpha/telemetry/spans/export
|
||||
get_span: get /v1alpha/telemetry/traces/{trace_id}/spans/{span_id}
|
||||
get_trace: get /v1alpha/telemetry/traces/{trace_id}
|
||||
|
||||
scoring:
|
||||
methods:
|
||||
score: post /v1/scoring/score
|
||||
score_batch: post /v1/scoring/score-batch
|
||||
scoring_functions:
|
||||
methods:
|
||||
retrieve: get /v1/scoring-functions/{scoring_fn_id}
|
||||
list:
|
||||
endpoint: get /v1/scoring-functions
|
||||
paginated: false
|
||||
register: post /v1/scoring-functions
|
||||
models:
|
||||
scoring_fn: ScoringFn
|
||||
scoring_fn_params: ScoringFnParams
|
||||
list_scoring_functions_response: ListScoringFunctionsResponse
|
||||
|
||||
benchmarks:
|
||||
methods:
|
||||
retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}
|
||||
list:
|
||||
endpoint: get /v1alpha/eval/benchmarks
|
||||
paginated: false
|
||||
register: post /v1alpha/eval/benchmarks
|
||||
models:
|
||||
benchmark: Benchmark
|
||||
list_benchmarks_response: ListBenchmarksResponse
|
||||
|
||||
files:
|
||||
methods:
|
||||
create: post /v1/files
|
||||
list: get /v1/files
|
||||
retrieve: get /v1/files/{file_id}
|
||||
delete: delete /v1/files/{file_id}
|
||||
content: get /v1/files/{file_id}/content
|
||||
models:
|
||||
file: OpenAIFileObject
|
||||
list_files_response: ListOpenAIFileResponse
|
||||
delete_file_response: OpenAIFileDeleteResponse
|
||||
|
||||
alpha:
|
||||
subresources:
|
||||
inference:
|
||||
methods:
|
||||
rerank: post /v1alpha/inference/rerank
|
||||
|
||||
post_training:
|
||||
models:
|
||||
algorithm_config: AlgorithmConfig
|
||||
post_training_job: PostTrainingJob
|
||||
list_post_training_jobs_response: ListPostTrainingJobsResponse
|
||||
methods:
|
||||
preference_optimize: post /v1alpha/post-training/preference-optimize
|
||||
supervised_fine_tune: post /v1alpha/post-training/supervised-fine-tune
|
||||
subresources:
|
||||
job:
|
||||
methods:
|
||||
artifacts: get /v1alpha/post-training/job/artifacts
|
||||
cancel: post /v1alpha/post-training/job/cancel
|
||||
status: get /v1alpha/post-training/job/status
|
||||
list:
|
||||
endpoint: get /v1alpha/post-training/jobs
|
||||
paginated: false
|
||||
|
||||
eval:
|
||||
methods:
|
||||
evaluate_rows: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations
|
||||
run_eval: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs
|
||||
evaluate_rows_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations
|
||||
run_eval_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs
|
||||
|
||||
subresources:
|
||||
jobs:
|
||||
methods:
|
||||
cancel: delete /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}
|
||||
status: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}
|
||||
retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result
|
||||
models:
|
||||
evaluate_response: EvaluateResponse
|
||||
benchmark_config: BenchmarkConfig
|
||||
job: Job
|
||||
|
||||
agents:
|
||||
methods:
|
||||
create: post /v1alpha/agents
|
||||
list: get /v1alpha/agents
|
||||
retrieve: get /v1alpha/agents/{agent_id}
|
||||
delete: delete /v1alpha/agents/{agent_id}
|
||||
models:
|
||||
inference_step: InferenceStep
|
||||
tool_execution_step: ToolExecutionStep
|
||||
tool_response: ToolResponse
|
||||
shield_call_step: ShieldCallStep
|
||||
memory_retrieval_step: MemoryRetrievalStep
|
||||
subresources:
|
||||
session:
|
||||
models:
|
||||
session: Session
|
||||
methods:
|
||||
list: get /v1alpha/agents/{agent_id}/sessions
|
||||
create: post /v1alpha/agents/{agent_id}/session
|
||||
delete: delete /v1alpha/agents/{agent_id}/session/{session_id}
|
||||
retrieve: get /v1alpha/agents/{agent_id}/session/{session_id}
|
||||
steps:
|
||||
methods:
|
||||
retrieve: get /v1alpha/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}
|
||||
turn:
|
||||
models:
|
||||
turn: Turn
|
||||
turn_response_event: AgentTurnResponseEvent
|
||||
agent_turn_response_stream_chunk: AgentTurnResponseStreamChunk
|
||||
methods:
|
||||
create:
|
||||
type: http
|
||||
endpoint: post /v1alpha/agents/{agent_id}/session/{session_id}/turn
|
||||
streaming:
|
||||
stream_event_model: alpha.agents.turn.agent_turn_response_stream_chunk
|
||||
param_discriminator: stream
|
||||
retrieve: get /v1alpha/agents/{agent_id}/session/{session_id}/turn/{turn_id}
|
||||
resume:
|
||||
type: http
|
||||
endpoint: post /v1alpha/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume
|
||||
streaming:
|
||||
stream_event_model: alpha.agents.turn.agent_turn_response_stream_chunk
|
||||
param_discriminator: stream
|
||||
|
||||
beta:
|
||||
subresources:
|
||||
datasets:
|
||||
models:
|
||||
list_datasets_response: ListDatasetsResponse
|
||||
methods:
|
||||
register: post /v1beta/datasets
|
||||
retrieve: get /v1beta/datasets/{dataset_id}
|
||||
list:
|
||||
endpoint: get /v1beta/datasets
|
||||
paginated: false
|
||||
unregister: delete /v1beta/datasets/{dataset_id}
|
||||
iterrows: get /v1beta/datasetio/iterrows/{dataset_id}
|
||||
appendrows: post /v1beta/datasetio/append-rows/{dataset_id}
|
||||
|
||||
|
||||
settings:
|
||||
license: MIT
|
||||
unwrap_response_fields: [ data ]
|
||||
|
||||
openapi:
|
||||
transformations:
|
||||
- command: renameValue
|
||||
reason: pydantic reserved name
|
||||
args:
|
||||
filter:
|
||||
only:
|
||||
- '$.components.schemas.InferenceStep.properties.model_response'
|
||||
rename:
|
||||
python:
|
||||
property_name: 'inference_model_response'
|
||||
|
||||
# - command: renameValue
|
||||
# reason: pydantic reserved name
|
||||
# args:
|
||||
# filter:
|
||||
# only:
|
||||
# - '$.components.schemas.Model.properties.model_type'
|
||||
# rename:
|
||||
# python:
|
||||
# property_name: 'type'
|
||||
- command: mergeObject
|
||||
reason: Better return_type using enum
|
||||
args:
|
||||
target:
|
||||
- '$.components.schemas'
|
||||
object:
|
||||
ReturnType:
|
||||
additionalProperties: false
|
||||
properties:
|
||||
type:
|
||||
enum:
|
||||
- string
|
||||
- number
|
||||
- boolean
|
||||
- array
|
||||
- object
|
||||
- json
|
||||
- union
|
||||
- chat_completion_input
|
||||
- completion_input
|
||||
- agent_turn_input
|
||||
required:
|
||||
- type
|
||||
type: object
|
||||
- command: replaceProperties
|
||||
reason: Replace return type properties with better model (see above)
|
||||
args:
|
||||
filter:
|
||||
only:
|
||||
- '$.components.schemas.ScoringFn.properties.return_type'
|
||||
- '$.components.schemas.RegisterScoringFunctionRequest.properties.return_type'
|
||||
value:
|
||||
$ref: '#/components/schemas/ReturnType'
|
||||
- command: oneOfToAnyOf
|
||||
reason: Prism (mock server) doesn't like one of our requests as it technically matches multiple variants
|
||||
- reason: For better names
|
||||
command: extractToRefs
|
||||
args:
|
||||
ref:
|
||||
target: '$.components.schemas.ToolCallDelta.properties.tool_call'
|
||||
name: '#/components/schemas/ToolCallOrString'
|
||||
|
||||
# `readme` is used to configure the code snippets that will be rendered in the
|
||||
# README.md of various SDKs. In particular, you can change the `headline`
|
||||
# snippet's endpoint and the arguments to call it with.
|
||||
readme:
|
||||
example_requests:
|
||||
default:
|
||||
type: request
|
||||
endpoint: post /v1/chat/completions
|
||||
params: &ref_0 {}
|
||||
headline:
|
||||
type: request
|
||||
endpoint: post /v1/models
|
||||
params: *ref_0
|
||||
pagination:
|
||||
type: request
|
||||
endpoint: post /v1/chat/completions
|
||||
params: {}
|
||||
File diff suppressed because it is too large
|
|
@ -19,6 +19,8 @@ ARG KEEP_WORKSPACE=""
|
|||
ARG DISTRO_NAME="starter"
|
||||
ARG RUN_CONFIG_PATH=""
|
||||
ARG UV_HTTP_TIMEOUT=500
|
||||
ARG UV_EXTRA_INDEX_URL=""
|
||||
ARG UV_INDEX_STRATEGY=""
|
||||
ENV UV_HTTP_TIMEOUT=${UV_HTTP_TIMEOUT}
|
||||
ENV PYTHONDONTWRITEBYTECODE=1
|
||||
ENV PIP_DISABLE_PIP_VERSION_CHECK=1
|
||||
|
|
@ -45,7 +47,7 @@ RUN set -eux; \
|
|||
exit 1; \
|
||||
fi
|
||||
|
||||
RUN pip install --no-cache uv
|
||||
RUN pip install --no-cache-dir uv
|
||||
ENV UV_SYSTEM_PYTHON=1
|
||||
|
||||
ENV INSTALL_MODE=${INSTALL_MODE}
|
||||
|
|
@ -62,47 +64,60 @@ COPY . /workspace
|
|||
|
||||
# Install the client package if it is provided
|
||||
# NOTE: this is installed before llama-stack since llama-stack depends on llama-stack-client-python
|
||||
# Unset UV index env vars to ensure we only use PyPI for the client
|
||||
RUN set -eux; \
|
||||
unset UV_EXTRA_INDEX_URL UV_INDEX_STRATEGY; \
|
||||
if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then \
|
||||
if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ]; then \
|
||||
echo "LLAMA_STACK_CLIENT_DIR is set but $LLAMA_STACK_CLIENT_DIR does not exist" >&2; \
|
||||
exit 1; \
|
||||
fi; \
|
||||
uv pip install --no-cache -e "$LLAMA_STACK_CLIENT_DIR"; \
|
||||
uv pip install --no-cache-dir -e "$LLAMA_STACK_CLIENT_DIR"; \
|
||||
fi;
|
||||
|
||||
# Install llama-stack
|
||||
# Use UV_EXTRA_INDEX_URL inline only for editable install with RC dependencies
|
||||
RUN set -eux; \
|
||||
SAVED_UV_EXTRA_INDEX_URL="${UV_EXTRA_INDEX_URL:-}"; \
|
||||
SAVED_UV_INDEX_STRATEGY="${UV_INDEX_STRATEGY:-}"; \
|
||||
unset UV_EXTRA_INDEX_URL UV_INDEX_STRATEGY; \
|
||||
if [ "$INSTALL_MODE" = "editable" ]; then \
|
||||
if [ ! -d "$LLAMA_STACK_DIR" ]; then \
|
||||
echo "INSTALL_MODE=editable requires LLAMA_STACK_DIR to point to a directory inside the build context" >&2; \
|
||||
exit 1; \
|
||||
fi; \
|
||||
uv pip install --no-cache -e "$LLAMA_STACK_DIR"; \
|
||||
elif [ "$INSTALL_MODE" = "test-pypi" ]; then \
|
||||
uv pip install --no-cache fastapi libcst; \
|
||||
if [ -n "$TEST_PYPI_VERSION" ]; then \
|
||||
uv pip install --no-cache --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match "llama-stack==$TEST_PYPI_VERSION"; \
|
||||
if [ -n "$SAVED_UV_EXTRA_INDEX_URL" ] && [ -n "$SAVED_UV_INDEX_STRATEGY" ]; then \
|
||||
UV_EXTRA_INDEX_URL="$SAVED_UV_EXTRA_INDEX_URL" UV_INDEX_STRATEGY="$SAVED_UV_INDEX_STRATEGY" \
|
||||
uv pip install --no-cache-dir -e "$LLAMA_STACK_DIR"; \
|
||||
else \
|
||||
uv pip install --no-cache --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match llama-stack; \
|
||||
uv pip install --no-cache-dir -e "$LLAMA_STACK_DIR"; \
|
||||
fi; \
|
||||
elif [ "$INSTALL_MODE" = "test-pypi" ]; then \
|
||||
uv pip install --no-cache-dir fastapi libcst; \
|
||||
if [ -n "$TEST_PYPI_VERSION" ]; then \
|
||||
uv pip install --no-cache-dir --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match "llama-stack==$TEST_PYPI_VERSION"; \
|
||||
else \
|
||||
uv pip install --no-cache-dir --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match llama-stack; \
|
||||
fi; \
|
||||
else \
|
||||
if [ -n "$PYPI_VERSION" ]; then \
|
||||
uv pip install --no-cache "llama-stack==$PYPI_VERSION"; \
|
||||
uv pip install --no-cache-dir "llama-stack==$PYPI_VERSION"; \
|
||||
else \
|
||||
uv pip install --no-cache llama-stack; \
|
||||
uv pip install --no-cache-dir llama-stack; \
|
||||
fi; \
|
||||
fi;
|
||||
|
||||
# Install the dependencies for the distribution
|
||||
# Explicitly unset UV index env vars to ensure we only use PyPI for distribution deps
|
||||
RUN set -eux; \
|
||||
unset UV_EXTRA_INDEX_URL UV_INDEX_STRATEGY; \
|
||||
if [ -z "$DISTRO_NAME" ]; then \
|
||||
echo "DISTRO_NAME must be provided" >&2; \
|
||||
exit 1; \
|
||||
fi; \
|
||||
deps="$(llama stack list-deps "$DISTRO_NAME")"; \
|
||||
if [ -n "$deps" ]; then \
|
||||
printf '%s\n' "$deps" | xargs -L1 uv pip install --no-cache; \
|
||||
printf '%s\n' "$deps" | xargs -L1 uv pip install --no-cache-dir; \
|
||||
fi
|
||||
|
||||
# Cleanup
|
||||
|
|
|
|||
|
|
@ -23,5 +23,4 @@ A Llama Stack API is described as a collection of REST endpoints. We currently s
|
|||
We are working on adding a few more APIs to complete the application lifecycle. These will include:
|
||||
- **Batch Inference**: run inference on a dataset of inputs
|
||||
- **Batch Agents**: run agents on a dataset of inputs
|
||||
- **Synthetic Data Generation**: generate synthetic data for model development
|
||||
- **Batches**: OpenAI-compatible batch management for inference
|
||||
|
|
|
|||
|
|
@ -79,6 +79,33 @@ docker run \
|
|||
--port $LLAMA_STACK_PORT
|
||||
```
|
||||
|
||||
### Via Docker with Custom Run Configuration
|
||||
|
||||
You can also run the Docker container with a custom run configuration file by mounting it into the container:
|
||||
|
||||
```bash
|
||||
# Set the path to your custom run.yaml file
|
||||
CUSTOM_RUN_CONFIG=/path/to/your/custom-run.yaml
|
||||
LLAMA_STACK_PORT=8321
|
||||
|
||||
docker run \
|
||||
-it \
|
||||
--pull always \
|
||||
--gpus all \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ~/.llama:/root/.llama \
|
||||
-v $CUSTOM_RUN_CONFIG:/app/custom-run.yaml \
|
||||
-e RUN_CONFIG_PATH=/app/custom-run.yaml \
|
||||
llamastack/distribution-meta-reference-gpu \
|
||||
--port $LLAMA_STACK_PORT
|
||||
```
|
||||
|
||||
**Note**: The run configuration must be mounted into the container before it can be used. The `-v` flag mounts your local file into the container, and the `RUN_CONFIG_PATH` environment variable tells the entrypoint script which configuration to use.
|
||||
|
||||
Available run configurations for this distribution:
|
||||
- `run.yaml`
|
||||
- `run-with-safety.yaml`
|
||||
|
||||
### Via venv
|
||||
|
||||
Make sure you have the Llama Stack CLI available.
|
||||
|
|
|
|||
|
|
@ -127,13 +127,39 @@ docker run \
|
|||
-it \
|
||||
--pull always \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ./run.yaml:/root/my-run.yaml \
|
||||
-v ~/.llama:/root/.llama \
|
||||
-e NVIDIA_API_KEY=$NVIDIA_API_KEY \
|
||||
llamastack/distribution-nvidia \
|
||||
--config /root/my-run.yaml \
|
||||
--port $LLAMA_STACK_PORT
|
||||
```
|
||||
|
||||
### Via Docker with Custom Run Configuration
|
||||
|
||||
You can also run the Docker container with a custom run configuration file by mounting it into the container:
|
||||
|
||||
```bash
|
||||
# Set the path to your custom run.yaml file
|
||||
CUSTOM_RUN_CONFIG=/path/to/your/custom-run.yaml
|
||||
LLAMA_STACK_PORT=8321
|
||||
|
||||
docker run \
|
||||
-it \
|
||||
--pull always \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ~/.llama:/root/.llama \
|
||||
-v $CUSTOM_RUN_CONFIG:/app/custom-run.yaml \
|
||||
-e RUN_CONFIG_PATH=/app/custom-run.yaml \
|
||||
-e NVIDIA_API_KEY=$NVIDIA_API_KEY \
|
||||
llamastack/distribution-nvidia \
|
||||
--port $LLAMA_STACK_PORT
|
||||
```
|
||||
|
||||
**Note**: The run configuration must be mounted into the container before it can be used. The `-v` flag mounts your local file into the container, and the `RUN_CONFIG_PATH` environment variable tells the entrypoint script which configuration to use.
|
||||
|
||||
Available run configurations for this distribution:
|
||||
- `run.yaml`
|
||||
- `run-with-safety.yaml`
|
||||
|
||||
### Via venv
|
||||
|
||||
If you've set up your local development environment, you can also install the distribution dependencies using your local virtual environment.
|
||||
|
|
|
|||
|
|
@ -239,8 +239,13 @@ client = LlamaStackClient(base_url="http://localhost:8321")
|
|||
models = client.models.list()
|
||||
|
||||
# Select the first LLM
|
||||
llm = next(m for m in models if m.model_type == "llm" and m.provider_id == "ollama")
|
||||
model_id = llm.identifier
|
||||
llm = next(
|
||||
m for m in models
|
||||
if m.custom_metadata
|
||||
and m.custom_metadata.get("model_type") == "llm"
|
||||
and m.custom_metadata.get("provider_id") == "ollama"
|
||||
)
|
||||
model_id = llm.id
|
||||
|
||||
print("Model:", model_id)
|
||||
|
||||
|
|
@ -279,8 +284,13 @@ import uuid
|
|||
client = LlamaStackClient(base_url=f"http://localhost:8321")
|
||||
|
||||
models = client.models.list()
|
||||
llm = next(m for m in models if m.model_type == "llm" and m.provider_id == "ollama")
|
||||
model_id = llm.identifier
|
||||
llm = next(
|
||||
m for m in models
|
||||
if m.custom_metadata
|
||||
and m.custom_metadata.get("model_type") == "llm"
|
||||
and m.custom_metadata.get("provider_id") == "ollama"
|
||||
)
|
||||
model_id = llm.id
|
||||
|
||||
agent = Agent(client, model=model_id, instructions="You are a helpful assistant.")
|
||||
|
||||
|
|
@ -450,8 +460,11 @@ import uuid
|
|||
client = LlamaStackClient(base_url="http://localhost:8321")
|
||||
|
||||
# Create a vector database instance
|
||||
embed_lm = next(m for m in client.models.list() if m.model_type == "embedding")
|
||||
embedding_model = embed_lm.identifier
|
||||
embed_lm = next(
|
||||
m for m in client.models.list()
|
||||
if m.custom_metadata and m.custom_metadata.get("model_type") == "embedding"
|
||||
)
|
||||
embedding_model = embed_lm.id
|
||||
vector_db_id = f"v{uuid.uuid4().hex}"
|
||||
# The VectorDB API is deprecated; the server now returns its own authoritative ID.
|
||||
# We capture the correct ID from the response's .identifier attribute.
|
||||
|
|
@ -489,9 +502,11 @@ client.tool_runtime.rag_tool.insert(
|
|||
llm = next(
|
||||
m
|
||||
for m in client.models.list()
|
||||
if m.model_type == "llm" and m.provider_id == "ollama"
|
||||
if m.custom_metadata
|
||||
and m.custom_metadata.get("model_type") == "llm"
|
||||
and m.custom_metadata.get("provider_id") == "ollama"
|
||||
)
|
||||
model = llm.identifier
|
||||
model = llm.id
|
||||
|
||||
# Create the RAG agent
|
||||
rag_agent = Agent(
|
||||
|
|
|
|||
27
docs/docs/providers/files/remote_openai.mdx
Normal file
|
|
@@ -0,0 +1,27 @@
---
description: "OpenAI Files API provider for managing files through OpenAI's native file storage service."
sidebar_label: Remote - Openai
title: remote::openai
---

# remote::openai

## Description

OpenAI Files API provider for managing files through OpenAI's native file storage service.

## Configuration

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `api_key` | `<class 'str'>` | No | | OpenAI API key for authentication |
| `metadata_store` | `<class 'llama_stack.core.storage.datatypes.SqlStoreReference'>` | No | | SQL store configuration for file metadata |

## Sample Configuration

```yaml
api_key: ${env.OPENAI_API_KEY}
metadata_store:
  table_name: openai_files_metadata
  backend: sql_default
```
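For illustration, a minimal sketch of exercising this provider through the OpenAI-compatible Files API of a running stack. The client class follows the `llama_stack_client` Python SDK used elsewhere in these docs; the file name, the `purpose` value, and the port are assumptions, and exact parameter names may differ in your SDK version.

```python
# Hypothetical usage sketch: upload and list files against a stack
# that routes the files API to the remote::openai provider.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed local server

# Upload a local file (OpenAI-compatible: POST /v1/files)
with open("notes.txt", "rb") as f:  # assumed example file
    uploaded = client.files.create(file=f, purpose="assistants")
print("uploaded:", uploaded.id)

# List stored files (GET /v1/files) and fetch one back
for item in client.files.list():
    print(item.id)
retrieved = client.files.retrieve(uploaded.id)
print("retrieved:", retrieved.id)
```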
|
||||
|
|
@ -20,6 +20,7 @@ NVIDIA inference provider for accessing NVIDIA NIM models and AI services.
|
|||
| `url` | `<class 'str'>` | No | https://integrate.api.nvidia.com | A base url for accessing the NVIDIA NIM |
|
||||
| `timeout` | `<class 'int'>` | No | 60 | Timeout for the HTTP requests |
|
||||
| `append_api_version` | `<class 'bool'>` | No | True | When set to false, the API version will not be appended to the base_url. By default, it is true. |
|
||||
| `rerank_model_to_url` | `dict[str, str]` | No | `{'nv-rerank-qa-mistral-4b:1': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/reranking', 'nvidia/nv-rerankqa-mistral-4b-v3': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/nv-rerankqa-mistral-4b-v3/reranking', 'nvidia/llama-3.2-nv-rerankqa-1b-v2': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking'}` | Mapping of rerank model identifiers to their API endpoints. |
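As a hedged sketch, this is how the mapping above could be overridden in a distribution's run configuration; the provider-entry shape mirrors the run.yaml examples elsewhere in this repository, the endpoint URL is one of the defaults listed in the table, and the surrounding keys are assumptions:

```yaml
# Hypothetical run.yaml fragment: point one rerank model at a custom endpoint
inference:
  - provider_id: nvidia
    provider_type: remote::nvidia
    config:
      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
      api_key: ${env.NVIDIA_API_KEY:=}
      rerank_model_to_url:
        nvidia/llama-3.2-nv-rerankqa-1b-v2: https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking
```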
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
|
|
|
|||
1036
docs/notebooks/llamastack_agents_getting_started_examples.ipynb
Normal file
File diff suppressed because it is too large
|
|
@ -84,7 +84,6 @@ def generate_spec(output_dir: Path, stability_filter: str = None, main_spec: boo
|
|||
)
|
||||
|
||||
yaml_filename = f"{filename_prefix}llama-stack-spec.yaml"
|
||||
html_filename = f"{filename_prefix}llama-stack-spec.html"
|
||||
|
||||
with open(output_dir / yaml_filename, "w", encoding="utf-8") as fp:
|
||||
y = yaml.YAML()
|
||||
|
|
@ -102,11 +101,6 @@ def generate_spec(output_dir: Path, stability_filter: str = None, main_spec: boo
|
|||
fp,
|
||||
)
|
||||
|
||||
with open(output_dir / html_filename, "w") as fp:
|
||||
spec.write_html(fp, pretty_print=True)
|
||||
|
||||
print(f"Generated {yaml_filename} and {html_filename}")
|
||||
|
||||
def main(output_dir: str):
|
||||
output_dir = Path(output_dir)
|
||||
if not output_dir.exists():
|
||||
|
|
|
|||
|
|
@ -242,15 +242,6 @@ const sidebars: SidebarsConfig = {
|
|||
'providers/eval/remote_nvidia'
|
||||
],
|
||||
},
|
||||
{
|
||||
type: 'category',
|
||||
label: 'Telemetry',
|
||||
collapsed: true,
|
||||
items: [
|
||||
'providers/telemetry/index',
|
||||
'providers/telemetry/inline_meta-reference'
|
||||
],
|
||||
},
|
||||
{
|
||||
type: 'category',
|
||||
label: 'Batches',
|
||||
|
|
|
|||
13503
docs/static/deprecated-llama-stack-spec.html
vendored
File diff suppressed because it is too large
10155
docs/static/deprecated-llama-stack-spec.yaml
vendored
File diff suppressed because it is too large
5552
docs/static/experimental-llama-stack-spec.html
vendored
File diff suppressed because it is too large
1156
docs/static/llama-stack-spec.html
vendored
File diff suppressed because it is too large
1020
docs/static/llama-stack-spec.yaml
vendored
File diff suppressed because it is too large
18012
docs/static/stainless-llama-stack-spec.html
vendored
File diff suppressed because it is too large
1153
docs/static/stainless-llama-stack-spec.yaml
vendored
File diff suppressed because it is too large
|
|
@ -1,59 +0,0 @@
|
|||
version: 2
|
||||
distribution_spec:
|
||||
description: CI tests for Llama Stack
|
||||
providers:
|
||||
inference:
|
||||
- provider_type: remote::cerebras
|
||||
- provider_type: remote::ollama
|
||||
- provider_type: remote::vllm
|
||||
- provider_type: remote::tgi
|
||||
- provider_type: remote::fireworks
|
||||
- provider_type: remote::together
|
||||
- provider_type: remote::bedrock
|
||||
- provider_type: remote::nvidia
|
||||
- provider_type: remote::openai
|
||||
- provider_type: remote::anthropic
|
||||
- provider_type: remote::gemini
|
||||
- provider_type: remote::vertexai
|
||||
- provider_type: remote::groq
|
||||
- provider_type: remote::sambanova
|
||||
- provider_type: remote::azure
|
||||
- provider_type: inline::sentence-transformers
|
||||
vector_io:
|
||||
- provider_type: inline::faiss
|
||||
- provider_type: inline::sqlite-vec
|
||||
- provider_type: inline::milvus
|
||||
- provider_type: remote::chromadb
|
||||
- provider_type: remote::pgvector
|
||||
- provider_type: remote::qdrant
|
||||
- provider_type: remote::weaviate
|
||||
files:
|
||||
- provider_type: inline::localfs
|
||||
safety:
|
||||
- provider_type: inline::llama-guard
|
||||
- provider_type: inline::code-scanner
|
||||
agents:
|
||||
- provider_type: inline::meta-reference
|
||||
post_training:
|
||||
- provider_type: inline::torchtune-cpu
|
||||
eval:
|
||||
- provider_type: inline::meta-reference
|
||||
datasetio:
|
||||
- provider_type: remote::huggingface
|
||||
- provider_type: inline::localfs
|
||||
scoring:
|
||||
- provider_type: inline::basic
|
||||
- provider_type: inline::llm-as-judge
|
||||
- provider_type: inline::braintrust
|
||||
tool_runtime:
|
||||
- provider_type: remote::brave-search
|
||||
- provider_type: remote::tavily-search
|
||||
- provider_type: inline::rag-runtime
|
||||
- provider_type: remote::model-context-protocol
|
||||
batches:
|
||||
- provider_type: inline::reference
|
||||
image_type: venv
|
||||
additional_pip_packages:
|
||||
- aiosqlite
|
||||
- asyncpg
|
||||
- sqlalchemy[asyncio]
|
||||
|
|
@ -1,281 +0,0 @@
|
|||
version: 2
|
||||
image_name: ci-tests
|
||||
apis:
|
||||
- agents
|
||||
- batches
|
||||
- datasetio
|
||||
- eval
|
||||
- files
|
||||
- inference
|
||||
- post_training
|
||||
- safety
|
||||
- scoring
|
||||
- tool_runtime
|
||||
- vector_io
|
||||
providers:
|
||||
inference:
|
||||
- provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
|
||||
provider_type: remote::cerebras
|
||||
config:
|
||||
base_url: https://api.cerebras.ai
|
||||
api_key: ${env.CEREBRAS_API_KEY:=}
|
||||
- provider_id: ${env.OLLAMA_URL:+ollama}
|
||||
provider_type: remote::ollama
|
||||
config:
|
||||
url: ${env.OLLAMA_URL:=http://localhost:11434}
|
||||
- provider_id: ${env.VLLM_URL:+vllm}
|
||||
provider_type: remote::vllm
|
||||
config:
|
||||
url: ${env.VLLM_URL:=}
|
||||
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
|
||||
api_token: ${env.VLLM_API_TOKEN:=fake}
|
||||
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
|
||||
- provider_id: ${env.TGI_URL:+tgi}
|
||||
provider_type: remote::tgi
|
||||
config:
|
||||
url: ${env.TGI_URL:=}
|
||||
- provider_id: fireworks
|
||||
provider_type: remote::fireworks
|
||||
config:
|
||||
url: https://api.fireworks.ai/inference/v1
|
||||
api_key: ${env.FIREWORKS_API_KEY:=}
|
||||
- provider_id: together
|
||||
provider_type: remote::together
|
||||
config:
|
||||
url: https://api.together.xyz/v1
|
||||
api_key: ${env.TOGETHER_API_KEY:=}
|
||||
- provider_id: bedrock
|
||||
provider_type: remote::bedrock
|
||||
- provider_id: ${env.NVIDIA_API_KEY:+nvidia}
|
||||
provider_type: remote::nvidia
|
||||
config:
|
||||
url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
|
||||
api_key: ${env.NVIDIA_API_KEY:=}
|
||||
append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
|
||||
- provider_id: openai
|
||||
provider_type: remote::openai
|
||||
config:
|
||||
api_key: ${env.OPENAI_API_KEY:=}
|
||||
base_url: ${env.OPENAI_BASE_URL:=https://api.openai.com/v1}
|
||||
- provider_id: anthropic
|
||||
provider_type: remote::anthropic
|
||||
config:
|
||||
api_key: ${env.ANTHROPIC_API_KEY:=}
|
||||
- provider_id: gemini
|
||||
provider_type: remote::gemini
|
||||
config:
|
||||
api_key: ${env.GEMINI_API_KEY:=}
|
||||
- provider_id: ${env.VERTEX_AI_PROJECT:+vertexai}
|
||||
provider_type: remote::vertexai
|
||||
config:
|
||||
project: ${env.VERTEX_AI_PROJECT:=}
|
||||
location: ${env.VERTEX_AI_LOCATION:=us-central1}
|
||||
- provider_id: groq
|
||||
provider_type: remote::groq
|
||||
config:
|
||||
url: https://api.groq.com
|
||||
api_key: ${env.GROQ_API_KEY:=}
|
||||
- provider_id: sambanova
|
||||
provider_type: remote::sambanova
|
||||
config:
|
||||
url: https://api.sambanova.ai/v1
|
||||
api_key: ${env.SAMBANOVA_API_KEY:=}
|
||||
- provider_id: ${env.AZURE_API_KEY:+azure}
|
||||
provider_type: remote::azure
|
||||
config:
|
||||
api_key: ${env.AZURE_API_KEY:=}
|
||||
api_base: ${env.AZURE_API_BASE:=}
|
||||
api_version: ${env.AZURE_API_VERSION:=}
|
||||
api_type: ${env.AZURE_API_TYPE:=}
|
||||
- provider_id: sentence-transformers
|
||||
provider_type: inline::sentence-transformers
|
||||
vector_io:
|
||||
- provider_id: faiss
|
||||
provider_type: inline::faiss
|
||||
config:
|
||||
persistence:
|
||||
namespace: vector_io::faiss
|
||||
backend: kv_default
|
||||
- provider_id: sqlite-vec
|
||||
provider_type: inline::sqlite-vec
|
||||
config:
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/sqlite_vec.db
|
||||
persistence:
|
||||
namespace: vector_io::sqlite_vec
|
||||
backend: kv_default
|
||||
- provider_id: ${env.MILVUS_URL:+milvus}
|
||||
provider_type: inline::milvus
|
||||
config:
|
||||
db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/ci-tests}/milvus.db
|
||||
persistence:
|
||||
namespace: vector_io::milvus
|
||||
backend: kv_default
|
||||
- provider_id: ${env.CHROMADB_URL:+chromadb}
|
||||
provider_type: remote::chromadb
|
||||
config:
|
||||
url: ${env.CHROMADB_URL:=}
|
||||
persistence:
|
||||
namespace: vector_io::chroma_remote
|
||||
backend: kv_default
|
||||
- provider_id: ${env.PGVECTOR_DB:+pgvector}
|
||||
provider_type: remote::pgvector
|
||||
config:
|
||||
host: ${env.PGVECTOR_HOST:=localhost}
|
||||
port: ${env.PGVECTOR_PORT:=5432}
|
||||
db: ${env.PGVECTOR_DB:=}
|
||||
user: ${env.PGVECTOR_USER:=}
|
||||
password: ${env.PGVECTOR_PASSWORD:=}
|
||||
persistence:
|
||||
namespace: vector_io::pgvector
|
||||
backend: kv_default
|
||||
- provider_id: ${env.QDRANT_URL:+qdrant}
|
||||
provider_type: remote::qdrant
|
||||
config:
|
||||
api_key: ${env.QDRANT_API_KEY:=}
|
||||
persistence:
|
||||
namespace: vector_io::qdrant_remote
|
||||
backend: kv_default
|
||||
- provider_id: ${env.WEAVIATE_CLUSTER_URL:+weaviate}
|
||||
provider_type: remote::weaviate
|
||||
config:
|
||||
weaviate_api_key: null
|
||||
weaviate_cluster_url: ${env.WEAVIATE_CLUSTER_URL:=localhost:8080}
|
||||
persistence:
|
||||
namespace: vector_io::weaviate
|
||||
backend: kv_default
|
||||
files:
|
||||
- provider_id: meta-reference-files
|
||||
provider_type: inline::localfs
|
||||
config:
|
||||
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/ci-tests/files}
|
||||
metadata_store:
|
||||
table_name: files_metadata
|
||||
backend: sql_default
|
||||
safety:
|
||||
- provider_id: llama-guard
|
||||
provider_type: inline::llama-guard
|
||||
config:
|
||||
excluded_categories: []
|
||||
- provider_id: code-scanner
|
||||
provider_type: inline::code-scanner
|
||||
agents:
|
||||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
persistence:
|
||||
agent_state:
|
||||
namespace: agents
|
||||
backend: kv_default
|
||||
responses:
|
||||
table_name: responses
|
||||
backend: sql_default
|
||||
max_write_queue_size: 10000
|
||||
num_writers: 4
|
||||
post_training:
|
||||
- provider_id: torchtune-cpu
|
||||
provider_type: inline::torchtune-cpu
|
||||
config:
|
||||
checkpoint_format: meta
|
||||
eval:
|
||||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
kvstore:
|
||||
namespace: eval
|
||||
backend: kv_default
|
||||
datasetio:
|
||||
- provider_id: huggingface
|
||||
provider_type: remote::huggingface
|
||||
config:
|
||||
kvstore:
|
||||
namespace: datasetio::huggingface
|
||||
backend: kv_default
|
||||
- provider_id: localfs
|
||||
provider_type: inline::localfs
|
||||
config:
|
||||
kvstore:
|
||||
namespace: datasetio::localfs
|
||||
backend: kv_default
|
||||
scoring:
|
||||
- provider_id: basic
|
||||
provider_type: inline::basic
|
||||
- provider_id: llm-as-judge
|
||||
provider_type: inline::llm-as-judge
|
||||
- provider_id: braintrust
|
||||
provider_type: inline::braintrust
|
||||
config:
|
||||
openai_api_key: ${env.OPENAI_API_KEY:=}
|
||||
tool_runtime:
|
||||
- provider_id: brave-search
|
||||
provider_type: remote::brave-search
|
||||
config:
|
||||
api_key: ${env.BRAVE_SEARCH_API_KEY:=}
|
||||
max_results: 3
|
||||
- provider_id: tavily-search
|
||||
provider_type: remote::tavily-search
|
||||
config:
|
||||
api_key: ${env.TAVILY_SEARCH_API_KEY:=}
|
||||
max_results: 3
|
||||
- provider_id: rag-runtime
|
||||
provider_type: inline::rag-runtime
|
||||
- provider_id: model-context-protocol
|
||||
provider_type: remote::model-context-protocol
|
||||
batches:
|
||||
- provider_id: reference
|
||||
provider_type: inline::reference
|
||||
config:
|
||||
kvstore:
|
||||
namespace: batches
|
||||
backend: kv_default
|
||||
storage:
|
||||
backends:
|
||||
kv_default:
|
||||
type: kv_sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/kvstore.db
|
||||
sql_default:
|
||||
type: sql_sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/sql_store.db
|
||||
stores:
|
||||
metadata:
|
||||
namespace: registry
|
||||
backend: kv_default
|
||||
inference:
|
||||
table_name: inference_store
|
||||
backend: sql_default
|
||||
max_write_queue_size: 10000
|
||||
num_writers: 4
|
||||
conversations:
|
||||
table_name: openai_conversations
|
||||
backend: sql_default
|
||||
prompts:
|
||||
namespace: prompts
|
||||
backend: kv_default
|
||||
registered_resources:
|
||||
models: []
|
||||
shields:
|
||||
- shield_id: llama-guard
|
||||
provider_id: ${env.SAFETY_MODEL:+llama-guard}
|
||||
provider_shield_id: ${env.SAFETY_MODEL:=}
|
||||
- shield_id: code-scanner
|
||||
provider_id: ${env.CODE_SCANNER_MODEL:+code-scanner}
|
||||
provider_shield_id: ${env.CODE_SCANNER_MODEL:=}
|
||||
vector_dbs: []
|
||||
datasets: []
|
||||
scoring_fns: []
|
||||
benchmarks: []
|
||||
tool_groups:
|
||||
- toolgroup_id: builtin::websearch
|
||||
provider_id: tavily-search
|
||||
- toolgroup_id: builtin::rag
|
||||
provider_id: rag-runtime
|
||||
server:
|
||||
port: 8321
|
||||
telemetry:
|
||||
enabled: true
|
||||
vector_stores:
|
||||
default_provider_id: faiss
|
||||
default_embedding_model:
|
||||
provider_id: sentence-transformers
|
||||
model_id: nomic-ai/nomic-embed-text-v1.5
|
||||
safety:
|
||||
default_shield_id: llama-guard
|
||||
|
|
@ -1,33 +0,0 @@
|
|||
version: 2
|
||||
distribution_spec:
|
||||
description: Dell's distribution of Llama Stack. TGI inference via Dell's custom
|
||||
container
|
||||
providers:
|
||||
inference:
|
||||
- provider_type: remote::tgi
|
||||
- provider_type: inline::sentence-transformers
|
||||
vector_io:
|
||||
- provider_type: inline::faiss
|
||||
- provider_type: remote::chromadb
|
||||
- provider_type: remote::pgvector
|
||||
safety:
|
||||
- provider_type: inline::llama-guard
|
||||
agents:
|
||||
- provider_type: inline::meta-reference
|
||||
eval:
|
||||
- provider_type: inline::meta-reference
|
||||
datasetio:
|
||||
- provider_type: remote::huggingface
|
||||
- provider_type: inline::localfs
|
||||
scoring:
|
||||
- provider_type: inline::basic
|
||||
- provider_type: inline::llm-as-judge
|
||||
- provider_type: inline::braintrust
|
||||
tool_runtime:
|
||||
- provider_type: remote::brave-search
|
||||
- provider_type: remote::tavily-search
|
||||
- provider_type: inline::rag-runtime
|
||||
image_type: venv
|
||||
additional_pip_packages:
|
||||
- aiosqlite
|
||||
- sqlalchemy[asyncio]
|
||||
|
|
@ -1,144 +0,0 @@
|
|||
version: 2
|
||||
image_name: dell
|
||||
apis:
|
||||
- agents
|
||||
- datasetio
|
||||
- eval
|
||||
- inference
|
||||
- safety
|
||||
- scoring
|
||||
- tool_runtime
|
||||
- vector_io
|
||||
providers:
|
||||
inference:
|
||||
- provider_id: tgi0
|
||||
provider_type: remote::tgi
|
||||
config:
|
||||
url: ${env.DEH_URL}
|
||||
- provider_id: tgi1
|
||||
provider_type: remote::tgi
|
||||
config:
|
||||
url: ${env.DEH_SAFETY_URL}
|
||||
- provider_id: sentence-transformers
|
||||
provider_type: inline::sentence-transformers
|
||||
vector_io:
|
||||
- provider_id: chromadb
|
||||
provider_type: remote::chromadb
|
||||
config:
|
||||
url: ${env.CHROMADB_URL:=}
|
||||
persistence:
|
||||
namespace: vector_io::chroma_remote
|
||||
backend: kv_default
|
||||
safety:
|
||||
- provider_id: llama-guard
|
||||
provider_type: inline::llama-guard
|
||||
config:
|
||||
excluded_categories: []
|
||||
agents:
|
||||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
persistence:
|
||||
agent_state:
|
||||
namespace: agents
|
||||
backend: kv_default
|
||||
responses:
|
||||
table_name: responses
|
||||
backend: sql_default
|
||||
max_write_queue_size: 10000
|
||||
num_writers: 4
|
||||
eval:
|
||||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
kvstore:
|
||||
namespace: eval
|
||||
backend: kv_default
|
||||
datasetio:
|
||||
- provider_id: huggingface
|
||||
provider_type: remote::huggingface
|
||||
config:
|
||||
kvstore:
|
||||
namespace: datasetio::huggingface
|
||||
backend: kv_default
|
||||
- provider_id: localfs
|
||||
provider_type: inline::localfs
|
||||
config:
|
||||
kvstore:
|
||||
namespace: datasetio::localfs
|
||||
backend: kv_default
|
||||
scoring:
|
||||
- provider_id: basic
|
||||
provider_type: inline::basic
|
||||
- provider_id: llm-as-judge
|
||||
provider_type: inline::llm-as-judge
|
||||
- provider_id: braintrust
|
||||
provider_type: inline::braintrust
|
||||
config:
|
||||
openai_api_key: ${env.OPENAI_API_KEY:=}
|
||||
tool_runtime:
|
||||
- provider_id: brave-search
|
||||
provider_type: remote::brave-search
|
||||
config:
|
||||
api_key: ${env.BRAVE_SEARCH_API_KEY:=}
|
||||
max_results: 3
|
||||
- provider_id: tavily-search
|
||||
provider_type: remote::tavily-search
|
||||
config:
|
||||
api_key: ${env.TAVILY_SEARCH_API_KEY:=}
|
||||
max_results: 3
|
||||
- provider_id: rag-runtime
|
||||
provider_type: inline::rag-runtime
|
||||
storage:
|
||||
backends:
|
||||
kv_default:
|
||||
type: kv_sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/kvstore.db
|
||||
sql_default:
|
||||
type: sql_sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/sql_store.db
|
||||
stores:
|
||||
metadata:
|
||||
namespace: registry
|
||||
backend: kv_default
|
||||
inference:
|
||||
table_name: inference_store
|
||||
backend: sql_default
|
||||
max_write_queue_size: 10000
|
||||
num_writers: 4
|
||||
conversations:
|
||||
table_name: openai_conversations
|
||||
backend: sql_default
|
||||
prompts:
|
||||
namespace: prompts
|
||||
backend: kv_default
|
||||
registered_resources:
|
||||
models:
|
||||
- metadata: {}
|
||||
model_id: ${env.INFERENCE_MODEL}
|
||||
provider_id: tgi0
|
||||
model_type: llm
|
||||
- metadata: {}
|
||||
model_id: ${env.SAFETY_MODEL}
|
||||
provider_id: tgi1
|
||||
model_type: llm
|
||||
- metadata:
|
||||
embedding_dimension: 768
|
||||
model_id: nomic-embed-text-v1.5
|
||||
provider_id: sentence-transformers
|
||||
model_type: embedding
|
||||
shields:
|
||||
- shield_id: ${env.SAFETY_MODEL}
|
||||
vector_dbs: []
|
||||
datasets: []
|
||||
scoring_fns: []
|
||||
benchmarks: []
|
||||
tool_groups:
|
||||
- toolgroup_id: builtin::websearch
|
||||
provider_id: brave-search
|
||||
- toolgroup_id: builtin::rag
|
||||
provider_id: rag-runtime
|
||||
server:
|
||||
port: 8321
|
||||
telemetry:
|
||||
enabled: true
|
||||
|
|
@ -1,135 +0,0 @@
|
|||
version: 2
|
||||
image_name: dell
|
||||
apis:
|
||||
- agents
|
||||
- datasetio
|
||||
- eval
|
||||
- inference
|
||||
- safety
|
||||
- scoring
|
||||
- tool_runtime
|
||||
- vector_io
|
||||
providers:
|
||||
inference:
|
||||
- provider_id: tgi0
|
||||
provider_type: remote::tgi
|
||||
config:
|
||||
url: ${env.DEH_URL}
|
||||
- provider_id: sentence-transformers
|
||||
provider_type: inline::sentence-transformers
|
||||
vector_io:
|
||||
- provider_id: chromadb
|
||||
provider_type: remote::chromadb
|
||||
config:
|
||||
url: ${env.CHROMADB_URL:=}
|
||||
persistence:
|
||||
namespace: vector_io::chroma_remote
|
||||
backend: kv_default
|
||||
safety:
|
||||
- provider_id: llama-guard
|
||||
provider_type: inline::llama-guard
|
||||
config:
|
||||
excluded_categories: []
|
||||
agents:
|
||||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
persistence:
|
||||
agent_state:
|
||||
namespace: agents
|
||||
backend: kv_default
|
||||
responses:
|
||||
table_name: responses
|
||||
backend: sql_default
|
||||
max_write_queue_size: 10000
|
||||
num_writers: 4
|
||||
eval:
|
||||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
kvstore:
|
||||
namespace: eval
|
||||
backend: kv_default
|
||||
datasetio:
|
||||
- provider_id: huggingface
|
||||
provider_type: remote::huggingface
|
||||
config:
|
||||
kvstore:
|
||||
namespace: datasetio::huggingface
|
||||
backend: kv_default
|
||||
- provider_id: localfs
|
||||
provider_type: inline::localfs
|
||||
config:
|
||||
kvstore:
|
||||
namespace: datasetio::localfs
|
||||
backend: kv_default
|
||||
scoring:
|
||||
- provider_id: basic
|
||||
provider_type: inline::basic
|
||||
- provider_id: llm-as-judge
|
||||
provider_type: inline::llm-as-judge
|
||||
- provider_id: braintrust
|
||||
provider_type: inline::braintrust
|
||||
config:
|
||||
openai_api_key: ${env.OPENAI_API_KEY:=}
|
||||
tool_runtime:
|
||||
- provider_id: brave-search
|
||||
provider_type: remote::brave-search
|
||||
config:
|
||||
api_key: ${env.BRAVE_SEARCH_API_KEY:=}
|
||||
max_results: 3
|
||||
- provider_id: tavily-search
|
||||
provider_type: remote::tavily-search
|
||||
config:
|
||||
api_key: ${env.TAVILY_SEARCH_API_KEY:=}
|
||||
max_results: 3
|
||||
- provider_id: rag-runtime
|
||||
provider_type: inline::rag-runtime
|
||||
storage:
|
||||
backends:
|
||||
kv_default:
|
||||
type: kv_sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/kvstore.db
|
||||
sql_default:
|
||||
type: sql_sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/sql_store.db
|
||||
stores:
|
||||
metadata:
|
||||
namespace: registry
|
||||
backend: kv_default
|
||||
inference:
|
||||
table_name: inference_store
|
||||
backend: sql_default
|
||||
max_write_queue_size: 10000
|
||||
num_writers: 4
|
||||
conversations:
|
||||
table_name: openai_conversations
|
||||
backend: sql_default
|
||||
prompts:
|
||||
namespace: prompts
|
||||
backend: kv_default
|
||||
registered_resources:
|
||||
models:
|
||||
- metadata: {}
|
||||
model_id: ${env.INFERENCE_MODEL}
|
||||
provider_id: tgi0
|
||||
model_type: llm
|
||||
- metadata:
|
||||
embedding_dimension: 768
|
||||
model_id: nomic-embed-text-v1.5
|
||||
provider_id: sentence-transformers
|
||||
model_type: embedding
|
||||
shields: []
|
||||
vector_dbs: []
|
||||
datasets: []
|
||||
scoring_fns: []
|
||||
benchmarks: []
|
||||
tool_groups:
|
||||
- toolgroup_id: builtin::websearch
|
||||
provider_id: brave-search
|
||||
- toolgroup_id: builtin::rag
|
||||
provider_id: rag-runtime
|
||||
server:
|
||||
port: 8321
|
||||
telemetry:
|
||||
enabled: true
|
||||
|
|
@ -1,32 +0,0 @@
|
|||
version: 2
|
||||
distribution_spec:
|
||||
description: Use Meta Reference for running LLM inference
|
||||
providers:
|
||||
inference:
|
||||
- provider_type: inline::meta-reference
|
||||
vector_io:
|
||||
- provider_type: inline::faiss
|
||||
- provider_type: remote::chromadb
|
||||
- provider_type: remote::pgvector
|
||||
safety:
|
||||
- provider_type: inline::llama-guard
|
||||
agents:
|
||||
- provider_type: inline::meta-reference
|
||||
eval:
|
||||
- provider_type: inline::meta-reference
|
||||
datasetio:
|
||||
- provider_type: remote::huggingface
|
||||
- provider_type: inline::localfs
|
||||
scoring:
|
||||
- provider_type: inline::basic
|
||||
- provider_type: inline::llm-as-judge
|
||||
- provider_type: inline::braintrust
|
||||
tool_runtime:
|
||||
- provider_type: remote::brave-search
|
||||
- provider_type: remote::tavily-search
|
||||
- provider_type: inline::rag-runtime
|
||||
- provider_type: remote::model-context-protocol
|
||||
image_type: venv
|
||||
additional_pip_packages:
|
||||
- aiosqlite
|
||||
- sqlalchemy[asyncio]
|
||||
|
|
@ -1,157 +0,0 @@
version: 2
image_name: meta-reference-gpu
apis:
- agents
- datasetio
- eval
- inference
- safety
- scoring
- tool_runtime
- vector_io
providers:
  inference:
  - provider_id: meta-reference-inference
    provider_type: inline::meta-reference
    config:
      model: ${env.INFERENCE_MODEL}
      checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:=null}
      quantization:
        type: ${env.QUANTIZATION_TYPE:=bf16}
      model_parallel_size: ${env.MODEL_PARALLEL_SIZE:=0}
      max_batch_size: ${env.MAX_BATCH_SIZE:=1}
      max_seq_len: ${env.MAX_SEQ_LEN:=4096}
  - provider_id: sentence-transformers
    provider_type: inline::sentence-transformers
  - provider_id: meta-reference-safety
    provider_type: inline::meta-reference
    config:
      model: ${env.SAFETY_MODEL}
      checkpoint_dir: ${env.SAFETY_CHECKPOINT_DIR:=null}
      quantization:
        type: ${env.QUANTIZATION_TYPE:=bf16}
      model_parallel_size: ${env.MODEL_PARALLEL_SIZE:=0}
      max_batch_size: ${env.MAX_BATCH_SIZE:=1}
      max_seq_len: ${env.MAX_SEQ_LEN:=4096}
  vector_io:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      persistence:
        namespace: vector_io::faiss
        backend: kv_default
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config:
      excluded_categories: []
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence:
        agent_state:
          namespace: agents
          backend: kv_default
        responses:
          table_name: responses
          backend: sql_default
          max_write_queue_size: 10000
          num_writers: 4
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      kvstore:
        namespace: eval
        backend: kv_default
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config:
      kvstore:
        namespace: datasetio::huggingface
        backend: kv_default
  - provider_id: localfs
    provider_type: inline::localfs
    config:
      kvstore:
        namespace: datasetio::localfs
        backend: kv_default
  scoring:
  - provider_id: basic
    provider_type: inline::basic
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
  - provider_id: braintrust
    provider_type: inline::braintrust
    config:
      openai_api_key: ${env.OPENAI_API_KEY:=}
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search
    config:
      api_key: ${env.BRAVE_SEARCH_API_KEY:=}
      max_results: 3
  - provider_id: tavily-search
    provider_type: remote::tavily-search
    config:
      api_key: ${env.TAVILY_SEARCH_API_KEY:=}
      max_results: 3
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
  - provider_id: model-context-protocol
    provider_type: remote::model-context-protocol
storage:
  backends:
    kv_default:
      type: kv_sqlite
      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/meta-reference-gpu}/kvstore.db
    sql_default:
      type: sql_sqlite
      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/meta-reference-gpu}/sql_store.db
  stores:
    metadata:
      namespace: registry
      backend: kv_default
    inference:
      table_name: inference_store
      backend: sql_default
      max_write_queue_size: 10000
      num_writers: 4
    conversations:
      table_name: openai_conversations
      backend: sql_default
    prompts:
      namespace: prompts
      backend: kv_default
registered_resources:
  models:
  - metadata: {}
    model_id: ${env.INFERENCE_MODEL}
    provider_id: meta-reference-inference
    model_type: llm
  - metadata: {}
    model_id: ${env.SAFETY_MODEL}
    provider_id: meta-reference-safety
    model_type: llm
  - metadata:
      embedding_dimension: 768
    model_id: nomic-embed-text-v1.5
    provider_id: sentence-transformers
    model_type: embedding
  shields:
  - shield_id: ${env.SAFETY_MODEL}
  vector_dbs: []
  datasets: []
  scoring_fns: []
  benchmarks: []
  tool_groups:
  - toolgroup_id: builtin::websearch
    provider_id: tavily-search
  - toolgroup_id: builtin::rag
    provider_id: rag-runtime
server:
  port: 8321
telemetry:
  enabled: true
@ -1,142 +0,0 @@
version: 2
image_name: meta-reference-gpu
apis:
- agents
- datasetio
- eval
- inference
- safety
- scoring
- tool_runtime
- vector_io
providers:
  inference:
  - provider_id: meta-reference-inference
    provider_type: inline::meta-reference
    config:
      model: ${env.INFERENCE_MODEL}
      checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:=null}
      quantization:
        type: ${env.QUANTIZATION_TYPE:=bf16}
      model_parallel_size: ${env.MODEL_PARALLEL_SIZE:=0}
      max_batch_size: ${env.MAX_BATCH_SIZE:=1}
      max_seq_len: ${env.MAX_SEQ_LEN:=4096}
  - provider_id: sentence-transformers
    provider_type: inline::sentence-transformers
  vector_io:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      persistence:
        namespace: vector_io::faiss
        backend: kv_default
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config:
      excluded_categories: []
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence:
        agent_state:
          namespace: agents
          backend: kv_default
        responses:
          table_name: responses
          backend: sql_default
          max_write_queue_size: 10000
          num_writers: 4
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      kvstore:
        namespace: eval
        backend: kv_default
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config:
      kvstore:
        namespace: datasetio::huggingface
        backend: kv_default
  - provider_id: localfs
    provider_type: inline::localfs
    config:
      kvstore:
        namespace: datasetio::localfs
        backend: kv_default
  scoring:
  - provider_id: basic
    provider_type: inline::basic
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
  - provider_id: braintrust
    provider_type: inline::braintrust
    config:
      openai_api_key: ${env.OPENAI_API_KEY:=}
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search
    config:
      api_key: ${env.BRAVE_SEARCH_API_KEY:=}
      max_results: 3
  - provider_id: tavily-search
    provider_type: remote::tavily-search
    config:
      api_key: ${env.TAVILY_SEARCH_API_KEY:=}
      max_results: 3
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
  - provider_id: model-context-protocol
    provider_type: remote::model-context-protocol
storage:
  backends:
    kv_default:
      type: kv_sqlite
      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/meta-reference-gpu}/kvstore.db
    sql_default:
      type: sql_sqlite
      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/meta-reference-gpu}/sql_store.db
  stores:
    metadata:
      namespace: registry
      backend: kv_default
    inference:
      table_name: inference_store
      backend: sql_default
      max_write_queue_size: 10000
      num_writers: 4
    conversations:
      table_name: openai_conversations
      backend: sql_default
    prompts:
      namespace: prompts
      backend: kv_default
registered_resources:
  models:
  - metadata: {}
    model_id: ${env.INFERENCE_MODEL}
    provider_id: meta-reference-inference
    model_type: llm
  - metadata:
      embedding_dimension: 768
    model_id: nomic-embed-text-v1.5
    provider_id: sentence-transformers
    model_type: embedding
  shields: []
  vector_dbs: []
  datasets: []
  scoring_fns: []
  benchmarks: []
  tool_groups:
  - toolgroup_id: builtin::websearch
    provider_id: tavily-search
  - toolgroup_id: builtin::rag
    provider_id: rag-runtime
server:
  port: 8321
telemetry:
  enabled: true
@ -1,29 +0,0 @@
version: 2
distribution_spec:
  description: Use NVIDIA NIM for running LLM inference, evaluation and safety
  providers:
    inference:
    - provider_type: remote::nvidia
    vector_io:
    - provider_type: inline::faiss
    safety:
    - provider_type: remote::nvidia
    agents:
    - provider_type: inline::meta-reference
    eval:
    - provider_type: remote::nvidia
    post_training:
    - provider_type: remote::nvidia
    datasetio:
    - provider_type: inline::localfs
    - provider_type: remote::nvidia
    scoring:
    - provider_type: inline::basic
    tool_runtime:
    - provider_type: inline::rag-runtime
    files:
    - provider_type: inline::localfs
image_type: venv
additional_pip_packages:
- aiosqlite
- sqlalchemy[asyncio]
@ -1,140 +0,0 @@
version: 2
image_name: nvidia
apis:
- agents
- datasetio
- eval
- files
- inference
- post_training
- safety
- scoring
- tool_runtime
- vector_io
providers:
  inference:
  - provider_id: nvidia
    provider_type: remote::nvidia
    config:
      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
      api_key: ${env.NVIDIA_API_KEY:=}
      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
  - provider_id: nvidia
    provider_type: remote::nvidia
    config:
      guardrails_service_url: ${env.GUARDRAILS_SERVICE_URL:=http://localhost:7331}
      config_id: ${env.NVIDIA_GUARDRAILS_CONFIG_ID:=self-check}
  vector_io:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      persistence:
        namespace: vector_io::faiss
        backend: kv_default
  safety:
  - provider_id: nvidia
    provider_type: remote::nvidia
    config:
      guardrails_service_url: ${env.GUARDRAILS_SERVICE_URL:=http://localhost:7331}
      config_id: ${env.NVIDIA_GUARDRAILS_CONFIG_ID:=self-check}
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence:
        agent_state:
          namespace: agents
          backend: kv_default
        responses:
          table_name: responses
          backend: sql_default
          max_write_queue_size: 10000
          num_writers: 4
  eval:
  - provider_id: nvidia
    provider_type: remote::nvidia
    config:
      evaluator_url: ${env.NVIDIA_EVALUATOR_URL:=http://localhost:7331}
  post_training:
  - provider_id: nvidia
    provider_type: remote::nvidia
    config:
      api_key: ${env.NVIDIA_API_KEY:=}
      dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:=default}
      project_id: ${env.NVIDIA_PROJECT_ID:=test-project}
      customizer_url: ${env.NVIDIA_CUSTOMIZER_URL:=http://nemo.test}
  datasetio:
  - provider_id: localfs
    provider_type: inline::localfs
    config:
      kvstore:
        namespace: datasetio::localfs
        backend: kv_default
  - provider_id: nvidia
    provider_type: remote::nvidia
    config:
      api_key: ${env.NVIDIA_API_KEY:=}
      dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:=default}
      project_id: ${env.NVIDIA_PROJECT_ID:=test-project}
      datasets_url: ${env.NVIDIA_DATASETS_URL:=http://nemo.test}
  scoring:
  - provider_id: basic
    provider_type: inline::basic
  tool_runtime:
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
  files:
  - provider_id: meta-reference-files
    provider_type: inline::localfs
    config:
      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/nvidia/files}
      metadata_store:
        table_name: files_metadata
        backend: sql_default
storage:
  backends:
    kv_default:
      type: kv_sqlite
      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/kvstore.db
    sql_default:
      type: sql_sqlite
      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/sql_store.db
  stores:
    metadata:
      namespace: registry
      backend: kv_default
    inference:
      table_name: inference_store
      backend: sql_default
      max_write_queue_size: 10000
      num_writers: 4
    conversations:
      table_name: openai_conversations
      backend: sql_default
    prompts:
      namespace: prompts
      backend: kv_default
registered_resources:
  models:
  - metadata: {}
    model_id: ${env.INFERENCE_MODEL}
    provider_id: nvidia
    model_type: llm
  - metadata: {}
    model_id: ${env.SAFETY_MODEL}
    provider_id: nvidia
    model_type: llm
  shields:
  - shield_id: ${env.SAFETY_MODEL}
    provider_id: nvidia
  vector_dbs: []
  datasets: []
  scoring_fns: []
  benchmarks: []
  tool_groups:
  - toolgroup_id: builtin::rag
    provider_id: rag-runtime
server:
  port: 8321
telemetry:
  enabled: true
@ -1,119 +0,0 @@
version: 2
image_name: nvidia
apis:
- agents
- datasetio
- eval
- files
- inference
- post_training
- safety
- scoring
- tool_runtime
- vector_io
providers:
  inference:
  - provider_id: nvidia
    provider_type: remote::nvidia
    config:
      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
      api_key: ${env.NVIDIA_API_KEY:=}
      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
  vector_io:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      persistence:
        namespace: vector_io::faiss
        backend: kv_default
  safety:
  - provider_id: nvidia
    provider_type: remote::nvidia
    config:
      guardrails_service_url: ${env.GUARDRAILS_SERVICE_URL:=http://localhost:7331}
      config_id: ${env.NVIDIA_GUARDRAILS_CONFIG_ID:=self-check}
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence:
        agent_state:
          namespace: agents
          backend: kv_default
        responses:
          table_name: responses
          backend: sql_default
          max_write_queue_size: 10000
          num_writers: 4
  eval:
  - provider_id: nvidia
    provider_type: remote::nvidia
    config:
      evaluator_url: ${env.NVIDIA_EVALUATOR_URL:=http://localhost:7331}
  post_training:
  - provider_id: nvidia
    provider_type: remote::nvidia
    config:
      api_key: ${env.NVIDIA_API_KEY:=}
      dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:=default}
      project_id: ${env.NVIDIA_PROJECT_ID:=test-project}
      customizer_url: ${env.NVIDIA_CUSTOMIZER_URL:=http://nemo.test}
  datasetio:
  - provider_id: nvidia
    provider_type: remote::nvidia
    config:
      api_key: ${env.NVIDIA_API_KEY:=}
      dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:=default}
      project_id: ${env.NVIDIA_PROJECT_ID:=test-project}
      datasets_url: ${env.NVIDIA_DATASETS_URL:=http://nemo.test}
  scoring:
  - provider_id: basic
    provider_type: inline::basic
  tool_runtime:
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
  files:
  - provider_id: meta-reference-files
    provider_type: inline::localfs
    config:
      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/nvidia/files}
      metadata_store:
        table_name: files_metadata
        backend: sql_default
storage:
  backends:
    kv_default:
      type: kv_sqlite
      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/kvstore.db
    sql_default:
      type: sql_sqlite
      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/sql_store.db
  stores:
    metadata:
      namespace: registry
      backend: kv_default
    inference:
      table_name: inference_store
      backend: sql_default
      max_write_queue_size: 10000
      num_writers: 4
    conversations:
      table_name: openai_conversations
      backend: sql_default
    prompts:
      namespace: prompts
      backend: kv_default
registered_resources:
  models: []
  shields: []
  vector_dbs: []
  datasets: []
  scoring_fns: []
  benchmarks: []
  tool_groups:
  - toolgroup_id: builtin::rag
    provider_id: rag-runtime
server:
  port: 8321
telemetry:
  enabled: true
@ -1,36 +0,0 @@
version: 2
distribution_spec:
  description: Distribution for running open benchmarks
  providers:
    inference:
    - provider_type: remote::openai
    - provider_type: remote::anthropic
    - provider_type: remote::gemini
    - provider_type: remote::groq
    - provider_type: remote::together
    vector_io:
    - provider_type: inline::sqlite-vec
    - provider_type: remote::chromadb
    - provider_type: remote::pgvector
    safety:
    - provider_type: inline::llama-guard
    agents:
    - provider_type: inline::meta-reference
    eval:
    - provider_type: inline::meta-reference
    datasetio:
    - provider_type: remote::huggingface
    - provider_type: inline::localfs
    scoring:
    - provider_type: inline::basic
    - provider_type: inline::llm-as-judge
    - provider_type: inline::braintrust
    tool_runtime:
    - provider_type: remote::brave-search
    - provider_type: remote::tavily-search
    - provider_type: inline::rag-runtime
    - provider_type: remote::model-context-protocol
image_type: venv
additional_pip_packages:
- aiosqlite
- sqlalchemy[asyncio]
@ -1,255 +0,0 @@
version: 2
image_name: open-benchmark
apis:
- agents
- datasetio
- eval
- inference
- safety
- scoring
- tool_runtime
- vector_io
providers:
  inference:
  - provider_id: openai
    provider_type: remote::openai
    config:
      api_key: ${env.OPENAI_API_KEY:=}
      base_url: ${env.OPENAI_BASE_URL:=https://api.openai.com/v1}
  - provider_id: anthropic
    provider_type: remote::anthropic
    config:
      api_key: ${env.ANTHROPIC_API_KEY:=}
  - provider_id: gemini
    provider_type: remote::gemini
    config:
      api_key: ${env.GEMINI_API_KEY:=}
  - provider_id: groq
    provider_type: remote::groq
    config:
      url: https://api.groq.com
      api_key: ${env.GROQ_API_KEY:=}
  - provider_id: together
    provider_type: remote::together
    config:
      url: https://api.together.xyz/v1
      api_key: ${env.TOGETHER_API_KEY:=}
  vector_io:
  - provider_id: sqlite-vec
    provider_type: inline::sqlite-vec
    config:
      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/open-benchmark}/sqlite_vec.db
      persistence:
        namespace: vector_io::sqlite_vec
        backend: kv_default
  - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
    provider_type: remote::chromadb
    config:
      url: ${env.CHROMADB_URL:=}
      persistence:
        namespace: vector_io::chroma_remote
        backend: kv_default
  - provider_id: ${env.ENABLE_PGVECTOR:+pgvector}
    provider_type: remote::pgvector
    config:
      host: ${env.PGVECTOR_HOST:=localhost}
      port: ${env.PGVECTOR_PORT:=5432}
      db: ${env.PGVECTOR_DB:=}
      user: ${env.PGVECTOR_USER:=}
      password: ${env.PGVECTOR_PASSWORD:=}
      persistence:
        namespace: vector_io::pgvector
        backend: kv_default
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config:
      excluded_categories: []
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence:
        agent_state:
          namespace: agents
          backend: kv_default
        responses:
          table_name: responses
          backend: sql_default
          max_write_queue_size: 10000
          num_writers: 4
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      kvstore:
        namespace: eval
        backend: kv_default
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config:
      kvstore:
        namespace: datasetio::huggingface
        backend: kv_default
  - provider_id: localfs
    provider_type: inline::localfs
    config:
      kvstore:
        namespace: datasetio::localfs
        backend: kv_default
  scoring:
  - provider_id: basic
    provider_type: inline::basic
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
  - provider_id: braintrust
    provider_type: inline::braintrust
    config:
      openai_api_key: ${env.OPENAI_API_KEY:=}
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search
    config:
      api_key: ${env.BRAVE_SEARCH_API_KEY:=}
      max_results: 3
  - provider_id: tavily-search
    provider_type: remote::tavily-search
    config:
      api_key: ${env.TAVILY_SEARCH_API_KEY:=}
      max_results: 3
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
  - provider_id: model-context-protocol
    provider_type: remote::model-context-protocol
storage:
  backends:
    kv_default:
      type: kv_sqlite
      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/open-benchmark}/kvstore.db
    sql_default:
      type: sql_sqlite
      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/open-benchmark}/sql_store.db
  stores:
    metadata:
      namespace: registry
      backend: kv_default
    inference:
      table_name: inference_store
      backend: sql_default
      max_write_queue_size: 10000
      num_writers: 4
    conversations:
      table_name: openai_conversations
      backend: sql_default
    prompts:
      namespace: prompts
      backend: kv_default
registered_resources:
  models:
  - metadata: {}
    model_id: gpt-4o
    provider_id: openai
    provider_model_id: gpt-4o
    model_type: llm
  - metadata: {}
    model_id: claude-3-5-sonnet-latest
    provider_id: anthropic
    provider_model_id: claude-3-5-sonnet-latest
    model_type: llm
  - metadata: {}
    model_id: gemini/gemini-1.5-flash
    provider_id: gemini
    provider_model_id: gemini/gemini-1.5-flash
    model_type: llm
  - metadata: {}
    model_id: meta-llama/Llama-3.3-70B-Instruct
    provider_id: groq
    provider_model_id: groq/llama-3.3-70b-versatile
    model_type: llm
  - metadata: {}
    model_id: meta-llama/Llama-3.1-405B-Instruct
    provider_id: together
    provider_model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo
    model_type: llm
  shields:
  - shield_id: meta-llama/Llama-Guard-3-8B
  vector_dbs: []
  datasets:
  - purpose: eval/messages-answer
    source:
      type: uri
      uri: huggingface://datasets/llamastack/simpleqa?split=train
    metadata: {}
    dataset_id: simpleqa
  - purpose: eval/messages-answer
    source:
      type: uri
      uri: huggingface://datasets/llamastack/mmlu_cot?split=test&name=all
    metadata: {}
    dataset_id: mmlu_cot
  - purpose: eval/messages-answer
    source:
      type: uri
      uri: huggingface://datasets/llamastack/gpqa_0shot_cot?split=test&name=gpqa_main
    metadata: {}
    dataset_id: gpqa_cot
  - purpose: eval/messages-answer
    source:
      type: uri
      uri: huggingface://datasets/llamastack/math_500?split=test
    metadata: {}
    dataset_id: math_500
  - purpose: eval/messages-answer
    source:
      type: uri
      uri: huggingface://datasets/llamastack/IfEval?split=train
    metadata: {}
    dataset_id: ifeval
  - purpose: eval/messages-answer
    source:
      type: uri
      uri: huggingface://datasets/llamastack/docvqa?split=val
    metadata: {}
    dataset_id: docvqa
  scoring_fns: []
  benchmarks:
  - dataset_id: simpleqa
    scoring_functions:
    - llm-as-judge::405b-simpleqa
    metadata: {}
    benchmark_id: meta-reference-simpleqa
  - dataset_id: mmlu_cot
    scoring_functions:
    - basic::regex_parser_multiple_choice_answer
    metadata: {}
    benchmark_id: meta-reference-mmlu-cot
  - dataset_id: gpqa_cot
    scoring_functions:
    - basic::regex_parser_multiple_choice_answer
    metadata: {}
    benchmark_id: meta-reference-gpqa-cot
  - dataset_id: math_500
    scoring_functions:
    - basic::regex_parser_math_response
    metadata: {}
    benchmark_id: meta-reference-math-500
  - dataset_id: ifeval
    scoring_functions:
    - basic::ifeval
    metadata: {}
    benchmark_id: meta-reference-ifeval
  - dataset_id: docvqa
    scoring_functions:
    - basic::docvqa
    metadata: {}
    benchmark_id: meta-reference-docvqa
  tool_groups:
  - toolgroup_id: builtin::websearch
    provider_id: tavily-search
  - toolgroup_id: builtin::rag
    provider_id: rag-runtime
server:
  port: 8321
telemetry:
  enabled: true
@ -1,23 +0,0 @@
version: 2
distribution_spec:
  description: Quick start template for running Llama Stack with several popular providers
  providers:
    inference:
    - provider_type: remote::vllm
    - provider_type: inline::sentence-transformers
    vector_io:
    - provider_type: remote::chromadb
    safety:
    - provider_type: inline::llama-guard
    agents:
    - provider_type: inline::meta-reference
    tool_runtime:
    - provider_type: remote::brave-search
    - provider_type: remote::tavily-search
    - provider_type: inline::rag-runtime
    - provider_type: remote::model-context-protocol
image_type: venv
additional_pip_packages:
- asyncpg
- psycopg2-binary
- sqlalchemy[asyncio]
@ -1,118 +0,0 @@
version: 2
image_name: postgres-demo
apis:
- agents
- inference
- safety
- tool_runtime
- vector_io
providers:
  inference:
  - provider_id: vllm-inference
    provider_type: remote::vllm
    config:
      url: ${env.VLLM_URL:=http://localhost:8000/v1}
      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
      api_token: ${env.VLLM_API_TOKEN:=fake}
      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
  - provider_id: sentence-transformers
    provider_type: inline::sentence-transformers
  vector_io:
  - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
    provider_type: remote::chromadb
    config:
      url: ${env.CHROMADB_URL:=}
      persistence:
        namespace: vector_io::chroma_remote
        backend: kv_default
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config:
      excluded_categories: []
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence:
        agent_state:
          namespace: agents
          backend: kv_default
        responses:
          table_name: responses
          backend: sql_default
          max_write_queue_size: 10000
          num_writers: 4
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search
    config:
      api_key: ${env.BRAVE_SEARCH_API_KEY:=}
      max_results: 3
  - provider_id: tavily-search
    provider_type: remote::tavily-search
    config:
      api_key: ${env.TAVILY_SEARCH_API_KEY:=}
      max_results: 3
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
  - provider_id: model-context-protocol
    provider_type: remote::model-context-protocol
storage:
  backends:
    kv_default:
      type: kv_postgres
      host: ${env.POSTGRES_HOST:=localhost}
      port: ${env.POSTGRES_PORT:=5432}
      db: ${env.POSTGRES_DB:=llamastack}
      user: ${env.POSTGRES_USER:=llamastack}
      password: ${env.POSTGRES_PASSWORD:=llamastack}
      table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
    sql_default:
      type: sql_postgres
      host: ${env.POSTGRES_HOST:=localhost}
      port: ${env.POSTGRES_PORT:=5432}
      db: ${env.POSTGRES_DB:=llamastack}
      user: ${env.POSTGRES_USER:=llamastack}
      password: ${env.POSTGRES_PASSWORD:=llamastack}
  stores:
    metadata:
      namespace: registry
      backend: kv_default
    inference:
      table_name: inference_store
      backend: sql_default
      max_write_queue_size: 10000
      num_writers: 4
    conversations:
      table_name: openai_conversations
      backend: sql_default
    prompts:
      namespace: prompts
      backend: kv_default
registered_resources:
  models:
  - metadata: {}
    model_id: ${env.INFERENCE_MODEL}
    provider_id: vllm-inference
    model_type: llm
  - metadata:
      embedding_dimension: 768
    model_id: nomic-embed-text-v1.5
    provider_id: sentence-transformers
    model_type: embedding
  shields:
  - shield_id: meta-llama/Llama-Guard-3-8B
  vector_dbs: []
  datasets: []
  scoring_fns: []
  benchmarks: []
  tool_groups:
  - toolgroup_id: builtin::websearch
    provider_id: tavily-search
  - toolgroup_id: builtin::rag
    provider_id: rag-runtime
server:
  port: 8321
telemetry:
  enabled: true
@ -1,60 +0,0 @@
version: 2
distribution_spec:
  description: Quick start template for running Llama Stack with several popular providers.
    This distribution is intended for GPU-enabled environments.
  providers:
    inference:
    - provider_type: remote::cerebras
    - provider_type: remote::ollama
    - provider_type: remote::vllm
    - provider_type: remote::tgi
    - provider_type: remote::fireworks
    - provider_type: remote::together
    - provider_type: remote::bedrock
    - provider_type: remote::nvidia
    - provider_type: remote::openai
    - provider_type: remote::anthropic
    - provider_type: remote::gemini
    - provider_type: remote::vertexai
    - provider_type: remote::groq
    - provider_type: remote::sambanova
    - provider_type: remote::azure
    - provider_type: inline::sentence-transformers
    vector_io:
    - provider_type: inline::faiss
    - provider_type: inline::sqlite-vec
    - provider_type: inline::milvus
    - provider_type: remote::chromadb
    - provider_type: remote::pgvector
    - provider_type: remote::qdrant
    - provider_type: remote::weaviate
    files:
    - provider_type: inline::localfs
    safety:
    - provider_type: inline::llama-guard
    - provider_type: inline::code-scanner
    agents:
    - provider_type: inline::meta-reference
    post_training:
    - provider_type: inline::huggingface-gpu
    eval:
    - provider_type: inline::meta-reference
    datasetio:
    - provider_type: remote::huggingface
    - provider_type: inline::localfs
    scoring:
    - provider_type: inline::basic
    - provider_type: inline::llm-as-judge
    - provider_type: inline::braintrust
    tool_runtime:
    - provider_type: remote::brave-search
    - provider_type: remote::tavily-search
    - provider_type: inline::rag-runtime
    - provider_type: remote::model-context-protocol
    batches:
    - provider_type: inline::reference
image_type: venv
additional_pip_packages:
- aiosqlite
- asyncpg
- sqlalchemy[asyncio]
@ -1,284 +0,0 @@
version: 2
image_name: starter-gpu
apis:
- agents
- batches
- datasetio
- eval
- files
- inference
- post_training
- safety
- scoring
- tool_runtime
- vector_io
providers:
  inference:
  - provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
    provider_type: remote::cerebras
    config:
      base_url: https://api.cerebras.ai
      api_key: ${env.CEREBRAS_API_KEY:=}
  - provider_id: ${env.OLLAMA_URL:+ollama}
    provider_type: remote::ollama
    config:
      url: ${env.OLLAMA_URL:=http://localhost:11434}
  - provider_id: ${env.VLLM_URL:+vllm}
    provider_type: remote::vllm
    config:
      url: ${env.VLLM_URL:=}
      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
      api_token: ${env.VLLM_API_TOKEN:=fake}
      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
  - provider_id: ${env.TGI_URL:+tgi}
    provider_type: remote::tgi
    config:
      url: ${env.TGI_URL:=}
  - provider_id: fireworks
    provider_type: remote::fireworks
    config:
      url: https://api.fireworks.ai/inference/v1
      api_key: ${env.FIREWORKS_API_KEY:=}
  - provider_id: together
    provider_type: remote::together
    config:
      url: https://api.together.xyz/v1
      api_key: ${env.TOGETHER_API_KEY:=}
  - provider_id: bedrock
    provider_type: remote::bedrock
  - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
    provider_type: remote::nvidia
    config:
      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
      api_key: ${env.NVIDIA_API_KEY:=}
      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
  - provider_id: openai
    provider_type: remote::openai
    config:
      api_key: ${env.OPENAI_API_KEY:=}
      base_url: ${env.OPENAI_BASE_URL:=https://api.openai.com/v1}
  - provider_id: anthropic
    provider_type: remote::anthropic
    config:
      api_key: ${env.ANTHROPIC_API_KEY:=}
  - provider_id: gemini
    provider_type: remote::gemini
    config:
      api_key: ${env.GEMINI_API_KEY:=}
  - provider_id: ${env.VERTEX_AI_PROJECT:+vertexai}
    provider_type: remote::vertexai
    config:
      project: ${env.VERTEX_AI_PROJECT:=}
      location: ${env.VERTEX_AI_LOCATION:=us-central1}
  - provider_id: groq
    provider_type: remote::groq
    config:
      url: https://api.groq.com
      api_key: ${env.GROQ_API_KEY:=}
  - provider_id: sambanova
    provider_type: remote::sambanova
    config:
      url: https://api.sambanova.ai/v1
      api_key: ${env.SAMBANOVA_API_KEY:=}
  - provider_id: ${env.AZURE_API_KEY:+azure}
    provider_type: remote::azure
    config:
      api_key: ${env.AZURE_API_KEY:=}
      api_base: ${env.AZURE_API_BASE:=}
      api_version: ${env.AZURE_API_VERSION:=}
      api_type: ${env.AZURE_API_TYPE:=}
  - provider_id: sentence-transformers
    provider_type: inline::sentence-transformers
  vector_io:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      persistence:
        namespace: vector_io::faiss
        backend: kv_default
  - provider_id: sqlite-vec
    provider_type: inline::sqlite-vec
    config:
      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/sqlite_vec.db
      persistence:
        namespace: vector_io::sqlite_vec
        backend: kv_default
  - provider_id: ${env.MILVUS_URL:+milvus}
    provider_type: inline::milvus
    config:
      db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter-gpu}/milvus.db
      persistence:
        namespace: vector_io::milvus
        backend: kv_default
  - provider_id: ${env.CHROMADB_URL:+chromadb}
    provider_type: remote::chromadb
    config:
      url: ${env.CHROMADB_URL:=}
      persistence:
        namespace: vector_io::chroma_remote
        backend: kv_default
  - provider_id: ${env.PGVECTOR_DB:+pgvector}
    provider_type: remote::pgvector
    config:
      host: ${env.PGVECTOR_HOST:=localhost}
      port: ${env.PGVECTOR_PORT:=5432}
      db: ${env.PGVECTOR_DB:=}
      user: ${env.PGVECTOR_USER:=}
      password: ${env.PGVECTOR_PASSWORD:=}
      persistence:
        namespace: vector_io::pgvector
        backend: kv_default
  - provider_id: ${env.QDRANT_URL:+qdrant}
    provider_type: remote::qdrant
    config:
      api_key: ${env.QDRANT_API_KEY:=}
      persistence:
        namespace: vector_io::qdrant_remote
        backend: kv_default
  - provider_id: ${env.WEAVIATE_CLUSTER_URL:+weaviate}
    provider_type: remote::weaviate
    config:
      weaviate_api_key: null
      weaviate_cluster_url: ${env.WEAVIATE_CLUSTER_URL:=localhost:8080}
      persistence:
        namespace: vector_io::weaviate
        backend: kv_default
  files:
  - provider_id: meta-reference-files
    provider_type: inline::localfs
    config:
      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter-gpu/files}
      metadata_store:
        table_name: files_metadata
        backend: sql_default
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config:
      excluded_categories: []
  - provider_id: code-scanner
    provider_type: inline::code-scanner
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence:
        agent_state:
          namespace: agents
          backend: kv_default
        responses:
          table_name: responses
          backend: sql_default
          max_write_queue_size: 10000
          num_writers: 4
  post_training:
  - provider_id: huggingface-gpu
    provider_type: inline::huggingface-gpu
    config:
      checkpoint_format: huggingface
      distributed_backend: null
      device: cpu
      dpo_output_dir: ~/.llama/distributions/starter-gpu/dpo_output
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      kvstore:
        namespace: eval
        backend: kv_default
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config:
      kvstore:
        namespace: datasetio::huggingface
        backend: kv_default
  - provider_id: localfs
    provider_type: inline::localfs
    config:
      kvstore:
        namespace: datasetio::localfs
        backend: kv_default
  scoring:
  - provider_id: basic
    provider_type: inline::basic
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
  - provider_id: braintrust
    provider_type: inline::braintrust
    config:
      openai_api_key: ${env.OPENAI_API_KEY:=}
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search
    config:
      api_key: ${env.BRAVE_SEARCH_API_KEY:=}
      max_results: 3
  - provider_id: tavily-search
    provider_type: remote::tavily-search
    config:
      api_key: ${env.TAVILY_SEARCH_API_KEY:=}
      max_results: 3
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
  - provider_id: model-context-protocol
    provider_type: remote::model-context-protocol
  batches:
  - provider_id: reference
    provider_type: inline::reference
    config:
      kvstore:
        namespace: batches
        backend: kv_default
storage:
  backends:
    kv_default:
      type: kv_sqlite
      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/kvstore.db
    sql_default:
      type: sql_sqlite
      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/sql_store.db
  stores:
    metadata:
      namespace: registry
      backend: kv_default
    inference:
      table_name: inference_store
      backend: sql_default
      max_write_queue_size: 10000
      num_writers: 4
    conversations:
      table_name: openai_conversations
      backend: sql_default
    prompts:
      namespace: prompts
      backend: kv_default
registered_resources:
  models: []
  shields:
  - shield_id: llama-guard
    provider_id: ${env.SAFETY_MODEL:+llama-guard}
    provider_shield_id: ${env.SAFETY_MODEL:=}
  - shield_id: code-scanner
    provider_id: ${env.CODE_SCANNER_MODEL:+code-scanner}
    provider_shield_id: ${env.CODE_SCANNER_MODEL:=}
  vector_dbs: []
  datasets: []
  scoring_fns: []
  benchmarks: []
  tool_groups:
  - toolgroup_id: builtin::websearch
    provider_id: tavily-search
  - toolgroup_id: builtin::rag
    provider_id: rag-runtime
server:
  port: 8321
telemetry:
  enabled: true
vector_stores:
  default_provider_id: faiss
  default_embedding_model:
    provider_id: sentence-transformers
    model_id: nomic-ai/nomic-embed-text-v1.5
safety:
  default_shield_id: llama-guard
@ -1,60 +0,0 @@
version: 2
distribution_spec:
  description: Quick start template for running Llama Stack with several popular providers.
    This distribution is intended for CPU-only environments.
  providers:
    inference:
    - provider_type: remote::cerebras
    - provider_type: remote::ollama
    - provider_type: remote::vllm
    - provider_type: remote::tgi
    - provider_type: remote::fireworks
    - provider_type: remote::together
    - provider_type: remote::bedrock
    - provider_type: remote::nvidia
    - provider_type: remote::openai
    - provider_type: remote::anthropic
    - provider_type: remote::gemini
    - provider_type: remote::vertexai
    - provider_type: remote::groq
    - provider_type: remote::sambanova
    - provider_type: remote::azure
    - provider_type: inline::sentence-transformers
    vector_io:
    - provider_type: inline::faiss
    - provider_type: inline::sqlite-vec
    - provider_type: inline::milvus
    - provider_type: remote::chromadb
    - provider_type: remote::pgvector
    - provider_type: remote::qdrant
    - provider_type: remote::weaviate
    files:
    - provider_type: inline::localfs
    safety:
    - provider_type: inline::llama-guard
    - provider_type: inline::code-scanner
    agents:
    - provider_type: inline::meta-reference
    post_training:
    - provider_type: inline::torchtune-cpu
    eval:
    - provider_type: inline::meta-reference
    datasetio:
    - provider_type: remote::huggingface
    - provider_type: inline::localfs
    scoring:
    - provider_type: inline::basic
    - provider_type: inline::llm-as-judge
    - provider_type: inline::braintrust
    tool_runtime:
    - provider_type: remote::brave-search
    - provider_type: remote::tavily-search
    - provider_type: inline::rag-runtime
    - provider_type: remote::model-context-protocol
    batches:
    - provider_type: inline::reference
image_type: venv
additional_pip_packages:
- aiosqlite
- asyncpg
- sqlalchemy[asyncio]
@ -1,281 +0,0 @@
version: 2
image_name: starter
apis:
- agents
- batches
- datasetio
- eval
- files
- inference
- post_training
- safety
- scoring
- tool_runtime
- vector_io
providers:
  inference:
  - provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
    provider_type: remote::cerebras
    config:
      base_url: https://api.cerebras.ai
      api_key: ${env.CEREBRAS_API_KEY:=}
  - provider_id: ${env.OLLAMA_URL:+ollama}
    provider_type: remote::ollama
    config:
      url: ${env.OLLAMA_URL:=http://localhost:11434}
  - provider_id: ${env.VLLM_URL:+vllm}
    provider_type: remote::vllm
    config:
      url: ${env.VLLM_URL:=}
      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
      api_token: ${env.VLLM_API_TOKEN:=fake}
      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
  - provider_id: ${env.TGI_URL:+tgi}
    provider_type: remote::tgi
    config:
      url: ${env.TGI_URL:=}
  - provider_id: fireworks
    provider_type: remote::fireworks
    config:
      url: https://api.fireworks.ai/inference/v1
      api_key: ${env.FIREWORKS_API_KEY:=}
  - provider_id: together
    provider_type: remote::together
    config:
      url: https://api.together.xyz/v1
      api_key: ${env.TOGETHER_API_KEY:=}
  - provider_id: bedrock
    provider_type: remote::bedrock
  - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
    provider_type: remote::nvidia
    config:
      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
      api_key: ${env.NVIDIA_API_KEY:=}
      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
  - provider_id: openai
    provider_type: remote::openai
    config:
      api_key: ${env.OPENAI_API_KEY:=}
      base_url: ${env.OPENAI_BASE_URL:=https://api.openai.com/v1}
  - provider_id: anthropic
    provider_type: remote::anthropic
    config:
      api_key: ${env.ANTHROPIC_API_KEY:=}
  - provider_id: gemini
    provider_type: remote::gemini
    config:
      api_key: ${env.GEMINI_API_KEY:=}
  - provider_id: ${env.VERTEX_AI_PROJECT:+vertexai}
    provider_type: remote::vertexai
    config:
      project: ${env.VERTEX_AI_PROJECT:=}
      location: ${env.VERTEX_AI_LOCATION:=us-central1}
  - provider_id: groq
    provider_type: remote::groq
    config:
      url: https://api.groq.com
      api_key: ${env.GROQ_API_KEY:=}
  - provider_id: sambanova
    provider_type: remote::sambanova
    config:
      url: https://api.sambanova.ai/v1
      api_key: ${env.SAMBANOVA_API_KEY:=}
  - provider_id: ${env.AZURE_API_KEY:+azure}
    provider_type: remote::azure
    config:
      api_key: ${env.AZURE_API_KEY:=}
      api_base: ${env.AZURE_API_BASE:=}
      api_version: ${env.AZURE_API_VERSION:=}
      api_type: ${env.AZURE_API_TYPE:=}
  - provider_id: sentence-transformers
    provider_type: inline::sentence-transformers
  vector_io:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      persistence:
        namespace: vector_io::faiss
        backend: kv_default
  - provider_id: sqlite-vec
    provider_type: inline::sqlite-vec
    config:
      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec.db
      persistence:
        namespace: vector_io::sqlite_vec
        backend: kv_default
  - provider_id: ${env.MILVUS_URL:+milvus}
    provider_type: inline::milvus
    config:
      db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter}/milvus.db
      persistence:
        namespace: vector_io::milvus
        backend: kv_default
  - provider_id: ${env.CHROMADB_URL:+chromadb}
    provider_type: remote::chromadb
    config:
      url: ${env.CHROMADB_URL:=}
      persistence:
        namespace: vector_io::chroma_remote
        backend: kv_default
  - provider_id: ${env.PGVECTOR_DB:+pgvector}
    provider_type: remote::pgvector
    config:
      host: ${env.PGVECTOR_HOST:=localhost}
      port: ${env.PGVECTOR_PORT:=5432}
      db: ${env.PGVECTOR_DB:=}
      user: ${env.PGVECTOR_USER:=}
      password: ${env.PGVECTOR_PASSWORD:=}
      persistence:
        namespace: vector_io::pgvector
        backend: kv_default
  - provider_id: ${env.QDRANT_URL:+qdrant}
    provider_type: remote::qdrant
    config:
      api_key: ${env.QDRANT_API_KEY:=}
      persistence:
        namespace: vector_io::qdrant_remote
        backend: kv_default
  - provider_id: ${env.WEAVIATE_CLUSTER_URL:+weaviate}
    provider_type: remote::weaviate
    config:
      weaviate_api_key: null
      weaviate_cluster_url: ${env.WEAVIATE_CLUSTER_URL:=localhost:8080}
      persistence:
        namespace: vector_io::weaviate
        backend: kv_default
  files:
  - provider_id: meta-reference-files
    provider_type: inline::localfs
    config:
      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
      metadata_store:
        table_name: files_metadata
        backend: sql_default
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config:
      excluded_categories: []
  - provider_id: code-scanner
    provider_type: inline::code-scanner
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence:
        agent_state:
          namespace: agents
          backend: kv_default
        responses:
          table_name: responses
          backend: sql_default
          max_write_queue_size: 10000
          num_writers: 4
  post_training:
  - provider_id: torchtune-cpu
    provider_type: inline::torchtune-cpu
    config:
      checkpoint_format: meta
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      kvstore:
        namespace: eval
        backend: kv_default
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config:
      kvstore:
        namespace: datasetio::huggingface
        backend: kv_default
  - provider_id: localfs
    provider_type: inline::localfs
    config:
      kvstore:
        namespace: datasetio::localfs
        backend: kv_default
  scoring:
  - provider_id: basic
    provider_type: inline::basic
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
  - provider_id: braintrust
    provider_type: inline::braintrust
    config:
      openai_api_key: ${env.OPENAI_API_KEY:=}
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search
    config:
      api_key: ${env.BRAVE_SEARCH_API_KEY:=}
      max_results: 3
  - provider_id: tavily-search
    provider_type: remote::tavily-search
    config:
      api_key: ${env.TAVILY_SEARCH_API_KEY:=}
      max_results: 3
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
  - provider_id: model-context-protocol
    provider_type: remote::model-context-protocol
  batches:
  - provider_id: reference
    provider_type: inline::reference
    config:
      kvstore:
        namespace: batches
        backend: kv_default
storage:
  backends:
    kv_default:
      type: kv_sqlite
      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/kvstore.db
    sql_default:
      type: sql_sqlite
      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sql_store.db
  stores:
    metadata:
      namespace: registry
      backend: kv_default
    inference:
      table_name: inference_store
      backend: sql_default
      max_write_queue_size: 10000
      num_writers: 4
    conversations:
      table_name: openai_conversations
      backend: sql_default
    prompts:
      namespace: prompts
      backend: kv_default
registered_resources:
  models: []
  shields:
  - shield_id: llama-guard
    provider_id: ${env.SAFETY_MODEL:+llama-guard}
    provider_shield_id: ${env.SAFETY_MODEL:=}
  - shield_id: code-scanner
    provider_id: ${env.CODE_SCANNER_MODEL:+code-scanner}
    provider_shield_id: ${env.CODE_SCANNER_MODEL:=}
  vector_dbs: []
  datasets: []
  scoring_fns: []
  benchmarks: []
  tool_groups:
  - toolgroup_id: builtin::websearch
    provider_id: tavily-search
  - toolgroup_id: builtin::rag
    provider_id: rag-runtime
server:
  port: 8321
telemetry:
  enabled: true
vector_stores:
  default_provider_id: faiss
  default_embedding_model:
    provider_id: sentence-transformers
    model_id: nomic-ai/nomic-embed-text-v1.5
safety:
  default_shield_id: llama-guard
@ -1,33 +0,0 @@
version: 2
distribution_spec:
  description: Use watsonx for running LLM inference
  providers:
    inference:
    - provider_type: remote::watsonx
    - provider_type: inline::sentence-transformers
    vector_io:
    - provider_type: inline::faiss
    safety:
    - provider_type: inline::llama-guard
    agents:
    - provider_type: inline::meta-reference
    eval:
    - provider_type: inline::meta-reference
    datasetio:
    - provider_type: remote::huggingface
    - provider_type: inline::localfs
    scoring:
    - provider_type: inline::basic
    - provider_type: inline::llm-as-judge
    - provider_type: inline::braintrust
    tool_runtime:
    - provider_type: remote::brave-search
    - provider_type: remote::tavily-search
    - provider_type: inline::rag-runtime
    - provider_type: remote::model-context-protocol
    files:
    - provider_type: inline::localfs
image_type: venv
additional_pip_packages:
- aiosqlite
- sqlalchemy[asyncio]
@ -1,136 +0,0 @@
version: 2
image_name: watsonx
apis:
- agents
- datasetio
- eval
- files
- inference
- safety
- scoring
- tool_runtime
- vector_io
providers:
  inference:
  - provider_id: watsonx
    provider_type: remote::watsonx
    config:
      url: ${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}
      api_key: ${env.WATSONX_API_KEY:=}
      project_id: ${env.WATSONX_PROJECT_ID:=}
  vector_io:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      persistence:
        namespace: vector_io::faiss
        backend: kv_default
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config:
      excluded_categories: []
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence:
        agent_state:
          namespace: agents
          backend: kv_default
        responses:
          table_name: responses
          backend: sql_default
          max_write_queue_size: 10000
          num_writers: 4
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      kvstore:
        namespace: eval
        backend: kv_default
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config:
      kvstore:
        namespace: datasetio::huggingface
        backend: kv_default
  - provider_id: localfs
    provider_type: inline::localfs
    config:
      kvstore:
        namespace: datasetio::localfs
        backend: kv_default
  scoring:
  - provider_id: basic
    provider_type: inline::basic
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
  - provider_id: braintrust
    provider_type: inline::braintrust
    config:
      openai_api_key: ${env.OPENAI_API_KEY:=}
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search
    config:
      api_key: ${env.BRAVE_SEARCH_API_KEY:=}
      max_results: 3
  - provider_id: tavily-search
    provider_type: remote::tavily-search
    config:
      api_key: ${env.TAVILY_SEARCH_API_KEY:=}
      max_results: 3
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
  - provider_id: model-context-protocol
    provider_type: remote::model-context-protocol
  files:
  - provider_id: meta-reference-files
    provider_type: inline::localfs
    config:
      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/watsonx/files}
      metadata_store:
        table_name: files_metadata
        backend: sql_default
storage:
  backends:
    kv_default:
      type: kv_sqlite
      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/watsonx}/kvstore.db
    sql_default:
      type: sql_sqlite
      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/watsonx}/sql_store.db
  stores:
    metadata:
      namespace: registry
      backend: kv_default
    inference:
      table_name: inference_store
      backend: sql_default
      max_write_queue_size: 10000
      num_writers: 4
    conversations:
      table_name: openai_conversations
      backend: sql_default
    prompts:
      namespace: prompts
      backend: kv_default
registered_resources:
  models: []
  shields: []
  vector_dbs: []
  datasets: []
  scoring_fns: []
  benchmarks: []
  tool_groups:
  - toolgroup_id: builtin::websearch
    provider_id: tavily-search
  - toolgroup_id: builtin::rag
    provider_id: rag-runtime
server:
  port: 8321
telemetry:
  enabled: true
@ -7,7 +7,7 @@ required-version = ">=0.7.0"

[project]
name = "llama_stack"
version = "0.3.0"
version = "0.4.0.dev0"
authors = [{ name = "Meta Llama", email = "llama-oss@meta.com" }]
description = "Llama Stack"
readme = "README.md"
@ -71,11 +71,40 @@ dev = [
|
|||
"nbval", # For notebook testing
|
||||
"black",
|
||||
"ruff",
|
||||
"types-requests",
|
||||
"types-setuptools",
|
||||
"mypy",
|
||||
"pre-commit",
|
||||
"ruamel.yaml", # needed for openapi generator
|
||||
]
|
||||
# Type checking dependencies - includes type stubs and optional runtime dependencies
|
||||
# needed for complete mypy coverage across all optional features
|
||||
type_checking = [
|
||||
"types-requests",
|
||||
"types-setuptools",
|
||||
"types-jsonschema",
|
||||
"pandas-stubs",
|
||||
"types-psutil",
|
||||
"types-tqdm",
|
||||
"boto3-stubs[s3]",
|
||||
"streamlit",
|
||||
"streamlit-option-menu",
|
||||
"pandas",
|
||||
"anthropic",
|
||||
"databricks-sdk",
|
||||
"fairscale",
|
||||
"torchtune",
|
||||
"trl",
|
||||
"peft",
|
||||
"datasets",
|
||||
"together",
|
||||
"nest-asyncio",
|
||||
"pymongo",
|
||||
"torchvision",
|
||||
"sqlite-vec",
|
||||
"faiss-cpu",
|
||||
"lm-format-enforcer",
|
||||
"mcp",
|
||||
"ollama",
|
||||
]
|
||||
# These are the dependencies required for running unit tests.
|
||||
unit = [
|
||||
"anthropic",
|
||||
|
|
@ -255,7 +284,6 @@ exclude = [
|
|||
"^src/llama_stack/models/llama/llama3/interface\\.py$",
|
||||
"^src/llama_stack/models/llama/llama3/tokenizer\\.py$",
|
||||
"^src/llama_stack/models/llama/llama3/tool_utils\\.py$",
|
||||
"^src/llama_stack/providers/inline/agents/meta_reference/",
|
||||
"^src/llama_stack/providers/inline/datasetio/localfs/",
|
||||
"^src/llama_stack/providers/inline/eval/meta_reference/eval\\.py$",
|
||||
"^src/llama_stack/providers/inline/inference/meta_reference/inference\\.py$",
|
||||
|
|
@ -316,7 +344,17 @@ exclude = [
|
|||
|
||||
[[tool.mypy.overrides]]
|
||||
# packages that lack typing annotations, do not have stubs, or are unavailable.
|
||||
module = ["yaml", "fire"]
|
||||
module = [
|
||||
"yaml",
|
||||
"fire",
|
||||
"torchtune.*",
|
||||
"fairscale.*",
|
||||
"torchvision.*",
|
||||
"datasets",
|
||||
"nest_asyncio",
|
||||
"streamlit_option_menu",
|
||||
"lmformatenforcer.*",
|
||||
]
|
||||
ignore_missing_imports = true
|
||||
|
||||
[tool.pydantic-mypy]
|
||||
|
|
|
|||
|
|
@ -55,7 +55,7 @@ def process_distro(distro_dir: Path, progress, change_tracker: ChangedPathTracke
|
|||
if template_func := getattr(module, "get_distribution_template", None):
|
||||
distro = template_func()
|
||||
|
||||
yaml_output_dir = REPO_ROOT / "llama_stack" / "distributions" / distro.name
|
||||
yaml_output_dir = REPO_ROOT / "src" / "llama_stack" / "distributions" / distro.name
|
||||
doc_output_dir = REPO_ROOT / "docs/docs/distributions" / f"{distro.distro_type}_distro"
|
||||
change_tracker.add_paths(yaml_output_dir, doc_output_dir)
|
||||
distro.save_distribution(
|
||||
|
|
|
|||
|
|
@ -215,6 +215,16 @@ build_image() {
|
|||
--build-arg "LLAMA_STACK_DIR=/workspace"
|
||||
)
|
||||
|
||||
# Pass UV index configuration for release branches
|
||||
if [[ -n "${UV_EXTRA_INDEX_URL:-}" ]]; then
|
||||
echo "Adding UV_EXTRA_INDEX_URL to docker build: $UV_EXTRA_INDEX_URL"
|
||||
build_cmd+=(--build-arg "UV_EXTRA_INDEX_URL=$UV_EXTRA_INDEX_URL")
|
||||
fi
|
||||
if [[ -n "${UV_INDEX_STRATEGY:-}" ]]; then
|
||||
echo "Adding UV_INDEX_STRATEGY to docker build: $UV_INDEX_STRATEGY"
|
||||
build_cmd+=(--build-arg "UV_INDEX_STRATEGY=$UV_INDEX_STRATEGY")
|
||||
fi
|
||||
|
||||
if ! "${build_cmd[@]}"; then
|
||||
echo "❌ Failed to build Docker image"
|
||||
exit 1
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@ COLLECT_ONLY=false
|
|||
|
||||
# Function to display usage
|
||||
usage() {
|
||||
cat << EOF
|
||||
cat <<EOF
|
||||
Usage: $0 [OPTIONS]
|
||||
|
||||
Options:
|
||||
|
|
@ -102,7 +102,6 @@ while [[ $# -gt 0 ]]; do
|
|||
esac
|
||||
done
|
||||
|
||||
|
||||
# Validate required parameters
|
||||
if [[ -z "$STACK_CONFIG" && "$COLLECT_ONLY" == false ]]; then
|
||||
echo "Error: --stack-config is required"
|
||||
|
|
@ -177,21 +176,45 @@ cd $ROOT_DIR
|
|||
# check if "llama" and "pytest" are available. this script does not use `uv run` given
|
||||
# it can be used in a pre-release environment where we have not been able to tell
|
||||
# uv about pre-release dependencies properly (yet).
|
||||
if [[ "$COLLECT_ONLY" == false ]] && ! command -v llama &> /dev/null; then
|
||||
if [[ "$COLLECT_ONLY" == false ]] && ! command -v llama &>/dev/null; then
|
||||
echo "llama could not be found, ensure llama-stack is installed"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if ! command -v pytest &> /dev/null; then
|
||||
if ! command -v pytest &>/dev/null; then
|
||||
echo "pytest could not be found, ensure pytest is installed"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Helper function to find next available port
|
||||
find_available_port() {
|
||||
local start_port=$1
|
||||
local port=$start_port
|
||||
for ((i=0; i<100; i++)); do
|
||||
if ! lsof -Pi :$port -sTCP:LISTEN -t >/dev/null 2>&1; then
|
||||
echo $port
|
||||
return 0
|
||||
fi
|
||||
((port++))
|
||||
done
|
||||
echo "Failed to find available port starting from $start_port" >&2
|
||||
return 1
|
||||
}
|
||||
|
||||
# Start Llama Stack Server if needed
|
||||
if [[ "$STACK_CONFIG" == *"server:"* && "$COLLECT_ONLY" == false ]]; then
|
||||
# Find an available port for the server
|
||||
LLAMA_STACK_PORT=$(find_available_port 8321)
|
||||
if [[ $? -ne 0 ]]; then
|
||||
echo "Error: $LLAMA_STACK_PORT"
|
||||
exit 1
|
||||
fi
|
||||
export LLAMA_STACK_PORT
|
||||
echo "Will use port: $LLAMA_STACK_PORT"
|
||||
|
||||
stop_server() {
|
||||
echo "Stopping Llama Stack Server..."
|
||||
pids=$(lsof -i :8321 | awk 'NR>1 {print $2}')
|
||||
pids=$(lsof -i :$LLAMA_STACK_PORT | awk 'NR>1 {print $2}')
|
||||
if [[ -n "$pids" ]]; then
|
||||
echo "Killing Llama Stack Server processes: $pids"
|
||||
kill -9 $pids
|
||||
|
|
@ -201,20 +224,25 @@ if [[ "$STACK_CONFIG" == *"server:"* && "$COLLECT_ONLY" == false ]]; then
|
|||
echo "Llama Stack Server stopped"
|
||||
}
|
||||
|
||||
# check if server is already running
|
||||
if curl -s http://localhost:8321/v1/health 2>/dev/null | grep -q "OK"; then
|
||||
echo "Llama Stack Server is already running, skipping start"
|
||||
else
|
||||
echo "=== Starting Llama Stack Server ==="
|
||||
export LLAMA_STACK_LOG_WIDTH=120
|
||||
|
||||
# Configure telemetry collector for server mode
|
||||
# Use a fixed port for the OTEL collector so the server can connect to it
|
||||
COLLECTOR_PORT=4317
|
||||
export LLAMA_STACK_TEST_COLLECTOR_PORT="${COLLECTOR_PORT}"
|
||||
export OTEL_EXPORTER_OTLP_ENDPOINT="http://127.0.0.1:${COLLECTOR_PORT}"
|
||||
export OTEL_EXPORTER_OTLP_PROTOCOL="http/protobuf"
|
||||
export OTEL_BSP_SCHEDULE_DELAY="200"
|
||||
export OTEL_BSP_EXPORT_TIMEOUT="2000"
|
||||
|
||||
# remove "server:" from STACK_CONFIG
|
||||
stack_config=$(echo "$STACK_CONFIG" | sed 's/^server://')
|
||||
nohup llama stack run $stack_config > server.log 2>&1 &
|
||||
nohup llama stack run $stack_config >server.log 2>&1 &
|
||||
|
||||
echo "Waiting for Llama Stack Server to start..."
|
||||
echo "Waiting for Llama Stack Server to start on port $LLAMA_STACK_PORT..."
|
||||
for i in {1..30}; do
|
||||
if curl -s http://localhost:8321/v1/health 2>/dev/null | grep -q "OK"; then
|
||||
if curl -s http://localhost:$LLAMA_STACK_PORT/v1/health 2>/dev/null | grep -q "OK"; then
|
||||
echo "✅ Llama Stack Server started successfully"
|
||||
break
|
||||
fi
|
||||
|
|
@ -227,7 +255,6 @@ if [[ "$STACK_CONFIG" == *"server:"* && "$COLLECT_ONLY" == false ]]; then
|
|||
sleep 1
|
||||
done
|
||||
echo ""
|
||||
fi
|
||||
|
||||
trap stop_server EXIT ERR INT TERM
|
||||
fi
|
||||
|
|
@ -239,7 +266,7 @@ if [[ "$STACK_CONFIG" == *"docker:"* && "$COLLECT_ONLY" == false ]]; then
|
|||
container_name="llama-stack-test-$DISTRO"
|
||||
if docker ps -a --format '{{.Names}}' | grep -q "^${container_name}$"; then
|
||||
echo "Dumping container logs before stopping..."
|
||||
docker logs "$container_name" > "docker-${DISTRO}-${INFERENCE_MODE}.log" 2>&1 || true
|
||||
docker logs "$container_name" >"docker-${DISTRO}-${INFERENCE_MODE}.log" 2>&1 || true
|
||||
echo "Stopping and removing container: $container_name"
|
||||
docker stop "$container_name" 2>/dev/null || true
|
||||
docker rm "$container_name" 2>/dev/null || true
|
||||
|
|
@ -251,7 +278,14 @@ if [[ "$STACK_CONFIG" == *"docker:"* && "$COLLECT_ONLY" == false ]]; then
|
|||
|
||||
# Extract distribution name from docker:distro format
|
||||
DISTRO=$(echo "$STACK_CONFIG" | sed 's/^docker://')
|
||||
export LLAMA_STACK_PORT=8321
|
||||
# Find an available port for the docker container
|
||||
LLAMA_STACK_PORT=$(find_available_port 8321)
|
||||
if [[ $? -ne 0 ]]; then
|
||||
echo "Error: $LLAMA_STACK_PORT"
|
||||
exit 1
|
||||
fi
|
||||
export LLAMA_STACK_PORT
|
||||
echo "Will use port: $LLAMA_STACK_PORT"
|
||||
|
||||
echo "=== Building Docker Image for distribution: $DISTRO ==="
|
||||
containerfile="$ROOT_DIR/containers/Containerfile"
|
||||
|
|
@ -271,6 +305,16 @@ if [[ "$STACK_CONFIG" == *"docker:"* && "$COLLECT_ONLY" == false ]]; then
|
|||
--build-arg "LLAMA_STACK_DIR=/workspace"
|
||||
)
|
||||
|
||||
# Pass UV index configuration for release branches
|
||||
if [[ -n "${UV_EXTRA_INDEX_URL:-}" ]]; then
|
||||
echo "Adding UV_EXTRA_INDEX_URL to docker build: $UV_EXTRA_INDEX_URL"
|
||||
build_cmd+=(--build-arg "UV_EXTRA_INDEX_URL=$UV_EXTRA_INDEX_URL")
|
||||
fi
|
||||
if [[ -n "${UV_INDEX_STRATEGY:-}" ]]; then
|
||||
echo "Adding UV_INDEX_STRATEGY to docker build: $UV_INDEX_STRATEGY"
|
||||
build_cmd+=(--build-arg "UV_INDEX_STRATEGY=$UV_INDEX_STRATEGY")
|
||||
fi
|
||||
|
||||
if ! "${build_cmd[@]}"; then
|
||||
echo "❌ Failed to build Docker image"
|
||||
exit 1
|
||||
|
|
@ -284,10 +328,15 @@ if [[ "$STACK_CONFIG" == *"docker:"* && "$COLLECT_ONLY" == false ]]; then
|
|||
docker stop "$container_name" 2>/dev/null || true
|
||||
docker rm "$container_name" 2>/dev/null || true
|
||||
|
||||
# Configure telemetry collector port shared between host and container
|
||||
COLLECTOR_PORT=4317
|
||||
export LLAMA_STACK_TEST_COLLECTOR_PORT="${COLLECTOR_PORT}"
|
||||
|
||||
# Build environment variables for docker run
|
||||
DOCKER_ENV_VARS=""
|
||||
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e LLAMA_STACK_TEST_INFERENCE_MODE=$INFERENCE_MODE"
|
||||
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e LLAMA_STACK_TEST_STACK_CONFIG_TYPE=server"
|
||||
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:${COLLECTOR_PORT}"
|
||||
|
||||
# Pass through API keys if they exist
|
||||
[ -n "${TOGETHER_API_KEY:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e TOGETHER_API_KEY=$TOGETHER_API_KEY"
|
||||
|
|
@ -308,8 +357,20 @@ if [[ "$STACK_CONFIG" == *"docker:"* && "$COLLECT_ONLY" == false ]]; then
|
|||
fi
|
||||
echo "Using image: $IMAGE_NAME"
|
||||
|
||||
docker run -d --network host --name "$container_name" \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
# On macOS/Darwin, --network host doesn't work as expected due to Docker running in a VM
|
||||
# Use regular port mapping instead
|
||||
NETWORK_MODE=""
|
||||
PORT_MAPPINGS=""
|
||||
if [[ "$(uname)" != "Darwin" ]] && [[ "$(uname)" != *"MINGW"* ]]; then
|
||||
NETWORK_MODE="--network host"
|
||||
else
|
||||
# On non-Linux (macOS, Windows), need explicit port mappings for both app and telemetry
|
||||
PORT_MAPPINGS="-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT -p $COLLECTOR_PORT:$COLLECTOR_PORT"
|
||||
echo "Using bridge networking with port mapping (non-Linux)"
|
||||
fi
|
||||
|
||||
docker run -d $NETWORK_MODE --name "$container_name" \
|
||||
$PORT_MAPPINGS \
|
||||
$DOCKER_ENV_VARS \
|
||||
"$IMAGE_NAME" \
|
||||
--port $LLAMA_STACK_PORT
|
||||
|
|
@ -411,17 +472,13 @@ elif [ $exit_code -eq 5 ]; then
|
|||
else
|
||||
echo "❌ Tests failed"
|
||||
echo ""
|
||||
echo "=== Dumping last 100 lines of logs for debugging ==="
|
||||
|
||||
# Output server or container logs based on stack config
|
||||
if [[ "$STACK_CONFIG" == *"server:"* && -f "server.log" ]]; then
|
||||
echo "--- Last 100 lines of server.log ---"
|
||||
tail -100 server.log
|
||||
echo "--- Server side failures can be located inside server.log (available from artifacts on CI) ---"
|
||||
elif [[ "$STACK_CONFIG" == *"docker:"* ]]; then
|
||||
docker_log_file="docker-${DISTRO}-${INFERENCE_MODE}.log"
|
||||
if [[ -f "$docker_log_file" ]]; then
|
||||
echo "--- Last 100 lines of $docker_log_file ---"
|
||||
tail -100 "$docker_log_file"
|
||||
echo "--- Server side failures can be located inside $docker_log_file (available from artifacts on CI) ---"
|
||||
fi
|
||||
fi
|
||||
|
||||
|
|
|
|||
42
scripts/uv-run-with-index.sh
Executable file
42
scripts/uv-run-with-index.sh
Executable file
|
|
@ -0,0 +1,42 @@
|
|||
#!/bin/bash
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Detect current branch and target branch
|
||||
# In GitHub Actions, use GITHUB_REF/GITHUB_BASE_REF
|
||||
if [[ -n "${GITHUB_REF:-}" ]]; then
|
||||
BRANCH="${GITHUB_REF#refs/heads/}"
|
||||
else
|
||||
BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "")
|
||||
fi
|
||||
|
||||
# For PRs, check the target branch
|
||||
if [[ -n "${GITHUB_BASE_REF:-}" ]]; then
|
||||
TARGET_BRANCH="${GITHUB_BASE_REF}"
|
||||
else
|
||||
TARGET_BRANCH=$(git rev-parse --abbrev-ref HEAD@{upstream} 2>/dev/null | sed 's|origin/||' || echo "")
|
||||
fi
|
||||
|
||||
# Check if on a release branch or targeting one, or LLAMA_STACK_RELEASE_MODE is set
|
||||
IS_RELEASE=false
|
||||
if [[ "$BRANCH" =~ ^release-[0-9]+\.[0-9]+\.x$ ]]; then
|
||||
IS_RELEASE=true
|
||||
elif [[ "$TARGET_BRANCH" =~ ^release-[0-9]+\.[0-9]+\.x$ ]]; then
|
||||
IS_RELEASE=true
|
||||
elif [[ "${LLAMA_STACK_RELEASE_MODE:-}" == "true" ]]; then
|
||||
IS_RELEASE=true
|
||||
fi
|
||||
|
||||
# On release branches, use test.pypi as extra index for RC versions
|
||||
if [[ "$IS_RELEASE" == "true" ]]; then
|
||||
export UV_EXTRA_INDEX_URL="https://test.pypi.org/simple/"
|
||||
export UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
fi
|
||||
|
||||
# Run uv with all arguments passed through
|
||||
exec uv "$@"
|
||||
|
|
@ -38,6 +38,7 @@ from .openai_responses import (
|
|||
OpenAIResponseInputTool,
|
||||
OpenAIResponseObject,
|
||||
OpenAIResponseObjectStream,
|
||||
OpenAIResponsePrompt,
|
||||
OpenAIResponseText,
|
||||
)
|
||||
|
||||
|
|
@ -490,13 +491,6 @@ class Agents(Protocol):
|
|||
|
||||
APIs for creating and interacting with agentic systems."""
|
||||
|
||||
@webmethod(
|
||||
route="/agents",
|
||||
method="POST",
|
||||
descriptive_name="create_agent",
|
||||
deprecated=True,
|
||||
level=LLAMA_STACK_API_V1,
|
||||
)
|
||||
@webmethod(
|
||||
route="/agents",
|
||||
method="POST",
|
||||
|
|
@ -514,13 +508,6 @@ class Agents(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/agents/{agent_id}/session/{session_id}/turn",
|
||||
method="POST",
|
||||
descriptive_name="create_agent_turn",
|
||||
deprecated=True,
|
||||
level=LLAMA_STACK_API_V1,
|
||||
)
|
||||
@webmethod(
|
||||
route="/agents/{agent_id}/session/{session_id}/turn",
|
||||
method="POST",
|
||||
|
|
@ -551,13 +538,6 @@ class Agents(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume",
|
||||
method="POST",
|
||||
descriptive_name="resume_agent_turn",
|
||||
deprecated=True,
|
||||
level=LLAMA_STACK_API_V1,
|
||||
)
|
||||
@webmethod(
|
||||
route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume",
|
||||
method="POST",
|
||||
|
|
@ -585,12 +565,6 @@ class Agents(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}",
|
||||
method="GET",
|
||||
deprecated=True,
|
||||
level=LLAMA_STACK_API_V1,
|
||||
)
|
||||
@webmethod(
|
||||
route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}",
|
||||
method="GET",
|
||||
|
|
@ -611,12 +585,6 @@ class Agents(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}",
|
||||
method="GET",
|
||||
deprecated=True,
|
||||
level=LLAMA_STACK_API_V1,
|
||||
)
|
||||
@webmethod(
|
||||
route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}",
|
||||
method="GET",
|
||||
|
|
@ -639,13 +607,6 @@ class Agents(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/agents/{agent_id}/session",
|
||||
method="POST",
|
||||
descriptive_name="create_agent_session",
|
||||
deprecated=True,
|
||||
level=LLAMA_STACK_API_V1,
|
||||
)
|
||||
@webmethod(
|
||||
route="/agents/{agent_id}/session",
|
||||
method="POST",
|
||||
|
|
@ -665,12 +626,6 @@ class Agents(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/agents/{agent_id}/session/{session_id}",
|
||||
method="GET",
|
||||
deprecated=True,
|
||||
level=LLAMA_STACK_API_V1,
|
||||
)
|
||||
@webmethod(
|
||||
route="/agents/{agent_id}/session/{session_id}",
|
||||
method="GET",
|
||||
|
|
@ -691,12 +646,6 @@ class Agents(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/agents/{agent_id}/session/{session_id}",
|
||||
method="DELETE",
|
||||
deprecated=True,
|
||||
level=LLAMA_STACK_API_V1,
|
||||
)
|
||||
@webmethod(
|
||||
route="/agents/{agent_id}/session/{session_id}",
|
||||
method="DELETE",
|
||||
|
|
@ -714,12 +663,6 @@ class Agents(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/agents/{agent_id}",
|
||||
method="DELETE",
|
||||
deprecated=True,
|
||||
level=LLAMA_STACK_API_V1,
|
||||
)
|
||||
@webmethod(route="/agents/{agent_id}", method="DELETE", level=LLAMA_STACK_API_V1ALPHA)
|
||||
async def delete_agent(
|
||||
self,
|
||||
|
|
@ -731,7 +674,6 @@ class Agents(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/agents", method="GET", deprecated=True, level=LLAMA_STACK_API_V1)
|
||||
@webmethod(route="/agents", method="GET", level=LLAMA_STACK_API_V1ALPHA)
|
||||
async def list_agents(self, start_index: int | None = None, limit: int | None = None) -> PaginatedResponse:
|
||||
"""List all agents.
|
||||
|
|
@ -742,12 +684,6 @@ class Agents(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/agents/{agent_id}",
|
||||
method="GET",
|
||||
deprecated=True,
|
||||
level=LLAMA_STACK_API_V1,
|
||||
)
|
||||
@webmethod(route="/agents/{agent_id}", method="GET", level=LLAMA_STACK_API_V1ALPHA)
|
||||
async def get_agent(self, agent_id: str) -> Agent:
|
||||
"""Describe an agent by its ID.
|
||||
|
|
@ -757,12 +693,6 @@ class Agents(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/agents/{agent_id}/sessions",
|
||||
method="GET",
|
||||
deprecated=True,
|
||||
level=LLAMA_STACK_API_V1,
|
||||
)
|
||||
@webmethod(route="/agents/{agent_id}/sessions", method="GET", level=LLAMA_STACK_API_V1ALPHA)
|
||||
async def list_agent_sessions(
|
||||
self,
|
||||
|
|
@ -786,12 +716,6 @@ class Agents(Protocol):
|
|||
#
|
||||
# Both of these APIs are inherently stateful.
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/responses/{response_id}",
|
||||
method="GET",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
deprecated=True,
|
||||
)
|
||||
@webmethod(route="/responses/{response_id}", method="GET", level=LLAMA_STACK_API_V1)
|
||||
async def get_openai_response(
|
||||
self,
|
||||
|
|
@ -804,12 +728,12 @@ class Agents(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/responses", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/responses", method="POST", level=LLAMA_STACK_API_V1)
|
||||
async def create_openai_response(
|
||||
self,
|
||||
input: str | list[OpenAIResponseInput],
|
||||
model: str,
|
||||
prompt: OpenAIResponsePrompt | None = None,
|
||||
instructions: str | None = None,
|
||||
previous_response_id: str | None = None,
|
||||
conversation: str | None = None,
|
||||
|
|
@ -831,6 +755,7 @@ class Agents(Protocol):
|
|||
|
||||
:param input: Input message(s) to create the response.
|
||||
:param model: The underlying LLM used for completions.
|
||||
:param prompt: (Optional) Prompt object with ID, version, and variables.
|
||||
:param previous_response_id: (Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses.
|
||||
:param conversation: (Optional) The ID of a conversation to add the response to. Must begin with 'conv_'. Input and output messages will be automatically added to the conversation.
|
||||
:param include: (Optional) Additional fields to include in the response.
|
||||
|
|
@ -839,7 +764,6 @@ class Agents(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/responses", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/responses", method="GET", level=LLAMA_STACK_API_V1)
|
||||
async def list_openai_responses(
|
||||
self,
|
||||
|
|
@ -858,9 +782,6 @@ class Agents(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/responses/{response_id}/input_items", method="GET", level=LLAMA_STACK_API_V1, deprecated=True
|
||||
)
|
||||
@webmethod(route="/responses/{response_id}/input_items", method="GET", level=LLAMA_STACK_API_V1)
|
||||
async def list_openai_response_input_items(
|
||||
self,
|
||||
|
|
@ -883,7 +804,6 @@ class Agents(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/responses/{response_id}", method="DELETE", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/responses/{response_id}", method="DELETE", level=LLAMA_STACK_API_V1)
|
||||
async def delete_openai_response(self, response_id: str) -> OpenAIDeleteResponseObject:
|
||||
"""Delete a response.
|
||||
|
|
|
|||
|
|
@ -4,9 +4,10 @@
|
|||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from collections.abc import Sequence
|
||||
from typing import Annotated, Any, Literal
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
from pydantic import BaseModel, Field, model_validator
|
||||
from typing_extensions import TypedDict
|
||||
|
||||
from llama_stack.apis.vector_io import SearchRankingOptions as FileSearchRankingOptions
|
||||
|
|
@ -46,23 +47,66 @@ class OpenAIResponseInputMessageContentImage(BaseModel):
|
|||
|
||||
:param detail: Level of detail for image processing, can be "low", "high", or "auto"
|
||||
:param type: Content type identifier, always "input_image"
|
||||
:param file_id: (Optional) The ID of the file to be sent to the model.
|
||||
:param image_url: (Optional) URL of the image content
|
||||
"""
|
||||
|
||||
detail: Literal["low"] | Literal["high"] | Literal["auto"] = "auto"
|
||||
type: Literal["input_image"] = "input_image"
|
||||
# TODO: handle file_id
|
||||
file_id: str | None = None
|
||||
image_url: str | None = None
|
||||
|
||||
|
||||
# TODO: handle file content types
|
||||
@json_schema_type
|
||||
class OpenAIResponseInputMessageContentFile(BaseModel):
|
||||
"""File content for input messages in OpenAI response format.
|
||||
|
||||
:param type: The type of the input item. Always `input_file`.
|
||||
:param file_data: The data of the file to be sent to the model.
|
||||
:param file_id: (Optional) The ID of the file to be sent to the model.
|
||||
:param file_url: The URL of the file to be sent to the model.
|
||||
:param filename: The name of the file to be sent to the model.
|
||||
"""
|
||||
|
||||
type: Literal["input_file"] = "input_file"
|
||||
file_data: str | None = None
|
||||
file_id: str | None = None
|
||||
file_url: str | None = None
|
||||
filename: str | None = None
|
||||
|
||||
@model_validator(mode="after")
|
||||
def validate_file_source(self) -> "OpenAIResponseInputMessageContentFile":
|
||||
if not any([self.file_data, self.file_id, self.file_url, self.filename]):
|
||||
raise ValueError(
|
||||
"At least one of 'file_data', 'file_id', 'file_url', or 'filename' must be provided for file content"
|
||||
)
|
||||
return self
|
||||
|
||||
|
||||
OpenAIResponseInputMessageContent = Annotated[
|
||||
OpenAIResponseInputMessageContentText | OpenAIResponseInputMessageContentImage,
|
||||
OpenAIResponseInputMessageContentText
|
||||
| OpenAIResponseInputMessageContentImage
|
||||
| OpenAIResponseInputMessageContentFile,
|
||||
Field(discriminator="type"),
|
||||
]
|
||||
register_schema(OpenAIResponseInputMessageContent, name="OpenAIResponseInputMessageContent")
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class OpenAIResponsePrompt(BaseModel):
|
||||
"""OpenAI compatible Prompt object that is used in OpenAI responses.
|
||||
|
||||
:param id: Unique identifier of the prompt template
|
||||
:param variables: Dictionary of variable names to OpenAIResponseInputMessageContent structure for template substitution. The substitution values can either be strings, or other Response input types
|
||||
like images or files.
|
||||
:param version: Version number of the prompt to use (defaults to latest if not specified)
|
||||
"""
|
||||
|
||||
id: str
|
||||
variables: dict[str, OpenAIResponseInputMessageContent] | None = None
|
||||
version: str | None = None
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class OpenAIResponseAnnotationFileCitation(BaseModel):
|
||||
"""File citation annotation for referencing specific files in response content.
|
||||
|
|
@ -159,7 +203,7 @@ class OpenAIResponseMessage(BaseModel):
|
|||
scenarios.
|
||||
"""
|
||||
|
||||
content: str | list[OpenAIResponseInputMessageContent] | list[OpenAIResponseOutputMessageContent]
|
||||
content: str | Sequence[OpenAIResponseInputMessageContent] | Sequence[OpenAIResponseOutputMessageContent]
|
||||
role: Literal["system"] | Literal["developer"] | Literal["user"] | Literal["assistant"]
|
||||
type: Literal["message"] = "message"
|
||||
|
||||
|
|
@ -211,10 +255,10 @@ class OpenAIResponseOutputMessageFileSearchToolCall(BaseModel):
|
|||
"""
|
||||
|
||||
id: str
|
||||
queries: list[str]
|
||||
queries: Sequence[str]
|
||||
status: str
|
||||
type: Literal["file_search_call"] = "file_search_call"
|
||||
results: list[OpenAIResponseOutputMessageFileSearchToolCallResults] | None = None
|
||||
results: Sequence[OpenAIResponseOutputMessageFileSearchToolCallResults] | None = None
|
||||
|
||||
|
||||
@json_schema_type
|
||||
|
|
@ -538,6 +582,7 @@ class OpenAIResponseObject(BaseModel):
|
|||
:param output: List of generated output items (messages, tool calls, etc.)
|
||||
:param parallel_tool_calls: Whether tool calls can be executed in parallel
|
||||
:param previous_response_id: (Optional) ID of the previous response in a conversation
|
||||
:param prompt: (Optional) Reference to a prompt template and its variables.
|
||||
:param status: Current status of the response generation
|
||||
:param temperature: (Optional) Sampling temperature used for generation
|
||||
:param text: Text formatting configuration for the response
|
||||
|
|
@ -553,16 +598,17 @@ class OpenAIResponseObject(BaseModel):
|
|||
id: str
|
||||
model: str
|
||||
object: Literal["response"] = "response"
|
||||
output: list[OpenAIResponseOutput]
|
||||
output: Sequence[OpenAIResponseOutput]
|
||||
parallel_tool_calls: bool = False
|
||||
previous_response_id: str | None = None
|
||||
prompt: OpenAIResponsePrompt | None = None
|
||||
status: str
|
||||
temperature: float | None = None
|
||||
# Default to text format to avoid breaking the loading of old responses
|
||||
# before the field was added. New responses will have this set always.
|
||||
text: OpenAIResponseText = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text"))
|
||||
top_p: float | None = None
|
||||
tools: list[OpenAIResponseTool] | None = None
|
||||
tools: Sequence[OpenAIResponseTool] | None = None
|
||||
truncation: str | None = None
|
||||
usage: OpenAIResponseUsage | None = None
|
||||
instructions: str | None = None
|
||||
|
|
@ -1270,7 +1316,7 @@ class ListOpenAIResponseInputItem(BaseModel):
|
|||
:param object: Object type identifier, always "list"
|
||||
"""
|
||||
|
||||
data: list[OpenAIResponseInput]
|
||||
data: Sequence[OpenAIResponseInput]
|
||||
object: Literal["list"] = "list"
|
||||
|
||||
|
||||
|
|
@ -1281,7 +1327,7 @@ class OpenAIResponseObjectWithInput(OpenAIResponseObject):
|
|||
:param input: List of input items that led to this response
|
||||
"""
|
||||
|
||||
input: list[OpenAIResponseInput]
|
||||
input: Sequence[OpenAIResponseInput]
|
||||
|
||||
def to_response_object(self) -> OpenAIResponseObject:
|
||||
"""Convert to OpenAIResponseObject by excluding input field."""
|
||||
|
|
@ -1299,7 +1345,7 @@ class ListOpenAIResponseObject(BaseModel):
|
|||
:param object: Object type identifier, always "list"
|
||||
"""
|
||||
|
||||
data: list[OpenAIResponseObjectWithInput]
|
||||
data: Sequence[OpenAIResponseObjectWithInput]
|
||||
has_more: bool
|
||||
first_id: str
|
||||
last_id: str
|
||||
|
|
|
|||
|
|
@ -43,7 +43,6 @@ class Batches(Protocol):
|
|||
Note: This API is currently under active development and may undergo changes.
|
||||
"""
|
||||
|
||||
@webmethod(route="/openai/v1/batches", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/batches", method="POST", level=LLAMA_STACK_API_V1)
|
||||
async def create_batch(
|
||||
self,
|
||||
|
|
@ -64,7 +63,6 @@ class Batches(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/batches/{batch_id}", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/batches/{batch_id}", method="GET", level=LLAMA_STACK_API_V1)
|
||||
async def retrieve_batch(self, batch_id: str) -> BatchObject:
|
||||
"""Retrieve information about a specific batch.
|
||||
|
|
@ -74,7 +72,6 @@ class Batches(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/batches/{batch_id}/cancel", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/batches/{batch_id}/cancel", method="POST", level=LLAMA_STACK_API_V1)
|
||||
async def cancel_batch(self, batch_id: str) -> BatchObject:
|
||||
"""Cancel a batch that is in progress.
|
||||
|
|
@ -84,7 +81,6 @@ class Batches(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/batches", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/batches", method="GET", level=LLAMA_STACK_API_V1)
|
||||
async def list_batches(
|
||||
self,
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ from typing import Any, Literal, Protocol, runtime_checkable
|
|||
from pydantic import BaseModel, Field
|
||||
|
||||
from llama_stack.apis.resource import Resource, ResourceType
|
||||
from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
|
||||
from llama_stack.apis.version import LLAMA_STACK_API_V1ALPHA
|
||||
from llama_stack.schema_utils import json_schema_type, webmethod
|
||||
|
||||
|
||||
|
|
@ -54,7 +54,6 @@ class ListBenchmarksResponse(BaseModel):
|
|||
|
||||
@runtime_checkable
|
||||
class Benchmarks(Protocol):
|
||||
@webmethod(route="/eval/benchmarks", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/eval/benchmarks", method="GET", level=LLAMA_STACK_API_V1ALPHA)
|
||||
async def list_benchmarks(self) -> ListBenchmarksResponse:
|
||||
"""List all benchmarks.
|
||||
|
|
@ -63,7 +62,6 @@ class Benchmarks(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/eval/benchmarks/{benchmark_id}", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/eval/benchmarks/{benchmark_id}", method="GET", level=LLAMA_STACK_API_V1ALPHA)
|
||||
async def get_benchmark(
|
||||
self,
|
||||
|
|
@ -76,7 +74,6 @@ class Benchmarks(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/eval/benchmarks", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/eval/benchmarks", method="POST", level=LLAMA_STACK_API_V1ALPHA)
|
||||
async def register_benchmark(
|
||||
self,
|
||||
|
|
@ -98,7 +95,6 @@ class Benchmarks(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/eval/benchmarks/{benchmark_id}", method="DELETE", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/eval/benchmarks/{benchmark_id}", method="DELETE", level=LLAMA_STACK_API_V1ALPHA)
|
||||
async def unregister_benchmark(self, benchmark_id: str) -> None:
|
||||
"""Unregister a benchmark.
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ from typing import Any, Protocol, runtime_checkable
|
|||
|
||||
from llama_stack.apis.common.responses import PaginatedResponse
|
||||
from llama_stack.apis.datasets import Dataset
|
||||
from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1BETA
|
||||
from llama_stack.apis.version import LLAMA_STACK_API_V1BETA
|
||||
from llama_stack.schema_utils import webmethod
|
||||
|
||||
|
||||
|
|
@ -21,7 +21,6 @@ class DatasetIO(Protocol):
|
|||
# keeping for aligning with inference/safety, but this is not used
|
||||
dataset_store: DatasetStore
|
||||
|
||||
@webmethod(route="/datasetio/iterrows/{dataset_id:path}", method="GET", deprecated=True, level=LLAMA_STACK_API_V1)
|
||||
@webmethod(route="/datasetio/iterrows/{dataset_id:path}", method="GET", level=LLAMA_STACK_API_V1BETA)
|
||||
async def iterrows(
|
||||
self,
|
||||
|
|
@ -46,9 +45,6 @@ class DatasetIO(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/datasetio/append-rows/{dataset_id:path}", method="POST", deprecated=True, level=LLAMA_STACK_API_V1
|
||||
)
|
||||
@webmethod(route="/datasetio/append-rows/{dataset_id:path}", method="POST", level=LLAMA_STACK_API_V1BETA)
|
||||
async def append_rows(self, dataset_id: str, rows: list[dict[str, Any]]) -> None:
|
||||
"""Append rows to a dataset.
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ from typing import Annotated, Any, Literal, Protocol
|
|||
from pydantic import BaseModel, Field
|
||||
|
||||
from llama_stack.apis.resource import Resource, ResourceType
|
||||
from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1BETA
|
||||
from llama_stack.apis.version import LLAMA_STACK_API_V1BETA
|
||||
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
|
||||
|
||||
|
||||
|
|
@ -146,7 +146,6 @@ class ListDatasetsResponse(BaseModel):
|
|||
|
||||
|
||||
class Datasets(Protocol):
|
||||
@webmethod(route="/datasets", method="POST", deprecated=True, level=LLAMA_STACK_API_V1)
|
||||
@webmethod(route="/datasets", method="POST", level=LLAMA_STACK_API_V1BETA)
|
||||
async def register_dataset(
|
||||
self,
|
||||
|
|
@ -216,7 +215,6 @@ class Datasets(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/datasets/{dataset_id:path}", method="GET", deprecated=True, level=LLAMA_STACK_API_V1)
|
||||
@webmethod(route="/datasets/{dataset_id:path}", method="GET", level=LLAMA_STACK_API_V1BETA)
|
||||
async def get_dataset(
|
||||
self,
|
||||
|
|
@ -229,7 +227,6 @@ class Datasets(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/datasets", method="GET", deprecated=True, level=LLAMA_STACK_API_V1)
|
||||
@webmethod(route="/datasets", method="GET", level=LLAMA_STACK_API_V1BETA)
|
||||
async def list_datasets(self) -> ListDatasetsResponse:
|
||||
"""List all datasets.
|
||||
|
|
@ -238,7 +235,6 @@ class Datasets(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/datasets/{dataset_id:path}", method="DELETE", deprecated=True, level=LLAMA_STACK_API_V1)
|
||||
@webmethod(route="/datasets/{dataset_id:path}", method="DELETE", level=LLAMA_STACK_API_V1BETA)
|
||||
async def unregister_dataset(
|
||||
self,
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ from llama_stack.apis.common.job_types import Job
|
|||
from llama_stack.apis.inference import SamplingParams, SystemMessage
|
||||
from llama_stack.apis.scoring import ScoringResult
|
||||
from llama_stack.apis.scoring_functions import ScoringFnParams
|
||||
from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
|
||||
from llama_stack.apis.version import LLAMA_STACK_API_V1ALPHA
|
||||
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
|
||||
|
||||
|
||||
|
|
@ -86,7 +86,6 @@ class Eval(Protocol):
|
|||
|
||||
Llama Stack Evaluation API for running evaluations on model and agent candidates."""
|
||||
|
||||
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST", level=LLAMA_STACK_API_V1ALPHA)
|
||||
async def run_eval(
|
||||
self,
|
||||
|
|
@ -101,9 +100,6 @@ class Eval(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST", level=LLAMA_STACK_API_V1, deprecated=True
|
||||
)
|
||||
@webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST", level=LLAMA_STACK_API_V1ALPHA)
|
||||
async def evaluate_rows(
|
||||
self,
|
||||
|
|
@ -122,9 +118,6 @@ class Eval(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET", level=LLAMA_STACK_API_V1, deprecated=True
|
||||
)
|
||||
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET", level=LLAMA_STACK_API_V1ALPHA)
|
||||
async def job_status(self, benchmark_id: str, job_id: str) -> Job:
|
||||
"""Get the status of a job.
|
||||
|
|
@ -135,12 +128,6 @@ class Eval(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}",
|
||||
method="DELETE",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
deprecated=True,
|
||||
)
|
||||
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE", level=LLAMA_STACK_API_V1ALPHA)
|
||||
async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
|
||||
"""Cancel a job.
|
||||
|
|
@ -150,12 +137,6 @@ class Eval(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result",
|
||||
method="GET",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
deprecated=True,
|
||||
)
|
||||
@webmethod(
|
||||
route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET", level=LLAMA_STACK_API_V1ALPHA
|
||||
)
|
||||
|
|
|
|||
|
|
@ -110,7 +110,6 @@ class Files(Protocol):
|
|||
"""
|
||||
|
||||
# OpenAI Files API Endpoints
|
||||
@webmethod(route="/openai/v1/files", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/files", method="POST", level=LLAMA_STACK_API_V1)
|
||||
async def openai_upload_file(
|
||||
self,
|
||||
|
|
@ -134,7 +133,6 @@ class Files(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/files", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/files", method="GET", level=LLAMA_STACK_API_V1)
|
||||
async def openai_list_files(
|
||||
self,
|
||||
|
|
@ -155,7 +153,6 @@ class Files(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/files/{file_id}", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/files/{file_id}", method="GET", level=LLAMA_STACK_API_V1)
|
||||
async def openai_retrieve_file(
|
||||
self,
|
||||
|
|
@ -170,7 +167,6 @@ class Files(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/files/{file_id}", method="DELETE", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/files/{file_id}", method="DELETE", level=LLAMA_STACK_API_V1)
|
||||
async def openai_delete_file(
|
||||
self,
|
||||
|
|
@ -183,7 +179,6 @@ class Files(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/files/{file_id}/content", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/files/{file_id}/content", method="GET", level=LLAMA_STACK_API_V1)
|
||||
async def openai_retrieve_file_content(
|
||||
self,
|
||||
|
|
|
|||
|
|
@ -1189,7 +1189,6 @@ class InferenceProvider(Protocol):
|
|||
raise NotImplementedError("Reranking is not implemented")
|
||||
return # this is so mypy's safe-super rule will consider the method concrete
|
||||
|
||||
@webmethod(route="/openai/v1/completions", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/completions", method="POST", level=LLAMA_STACK_API_V1)
|
||||
async def openai_completion(
|
||||
self,
|
||||
|
|
@ -1202,7 +1201,6 @@ class InferenceProvider(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/chat/completions", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/chat/completions", method="POST", level=LLAMA_STACK_API_V1)
|
||||
async def openai_chat_completion(
|
||||
self,
|
||||
|
|
@ -1215,7 +1213,6 @@ class InferenceProvider(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/embeddings", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/embeddings", method="POST", level=LLAMA_STACK_API_V1)
|
||||
async def openai_embeddings(
|
||||
self,
|
||||
|
|
@ -1240,7 +1237,6 @@ class Inference(InferenceProvider):
|
|||
- Rerank models: these models reorder the documents based on their relevance to a query.
|
||||
"""
|
||||
|
||||
@webmethod(route="/openai/v1/chat/completions", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/chat/completions", method="GET", level=LLAMA_STACK_API_V1)
|
||||
async def list_chat_completions(
|
||||
self,
|
||||
|
|
@ -1259,9 +1255,6 @@ class Inference(InferenceProvider):
|
|||
"""
|
||||
raise NotImplementedError("List chat completions is not implemented")
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/chat/completions/{completion_id}", method="GET", level=LLAMA_STACK_API_V1, deprecated=True
|
||||
)
|
||||
@webmethod(route="/chat/completions/{completion_id}", method="GET", level=LLAMA_STACK_API_V1)
|
||||
async def get_chat_completion(self, completion_id: str) -> OpenAICompletionWithInputMessages:
|
||||
"""Get chat completion.
|
||||
|
|
|
|||
|
|
@ -4,14 +4,21 @@
|
|||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from typing import Protocol, runtime_checkable
|
||||
from typing import Literal, Protocol, runtime_checkable
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from llama_stack.apis.version import LLAMA_STACK_API_V1
|
||||
from llama_stack.apis.version import (
|
||||
LLAMA_STACK_API_V1,
|
||||
)
|
||||
from llama_stack.providers.datatypes import HealthStatus
|
||||
from llama_stack.schema_utils import json_schema_type, webmethod
|
||||
|
||||
# Valid values for the route filter parameter.
|
||||
# Actual API levels: v1, v1alpha, v1beta (filters by level, excludes deprecated)
|
||||
# Special filter value: "deprecated" (shows deprecated routes regardless of level)
|
||||
ApiFilter = Literal["v1", "v1alpha", "v1beta", "deprecated"]
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class RouteInfo(BaseModel):
|
||||
|
|
@ -64,11 +71,12 @@ class Inspect(Protocol):
|
|||
"""
|
||||
|
||||
@webmethod(route="/inspect/routes", method="GET", level=LLAMA_STACK_API_V1)
|
||||
async def list_routes(self) -> ListRoutesResponse:
|
||||
async def list_routes(self, api_filter: ApiFilter | None = None) -> ListRoutesResponse:
|
||||
"""List routes.
|
||||
|
||||
List all available API routes with their methods and implementing providers.
|
||||
|
||||
:param api_filter: Optional filter to control which routes are returned. Can be an API level ('v1', 'v1alpha', 'v1beta') to show non-deprecated routes at that level, or 'deprecated' to show deprecated routes across all levels. If not specified, returns only non-deprecated v1 routes.
|
||||
:returns: Response containing information about all available routes.
|
||||
"""
|
||||
...
|
||||
|
|
|
|||
|
|
@ -90,12 +90,14 @@ class OpenAIModel(BaseModel):
|
|||
:object: The object type, which will be "model"
|
||||
:created: The Unix timestamp in seconds when the model was created
|
||||
:owned_by: The owner of the model
|
||||
:custom_metadata: Llama Stack-specific metadata including model_type, provider info, and additional metadata
|
||||
"""
|
||||
|
||||
id: str
|
||||
object: Literal["model"] = "model"
|
||||
created: int
|
||||
owned_by: str
|
||||
custom_metadata: dict[str, Any] | None = None
|
||||
|
||||
|
||||
class OpenAIListModelsResponse(BaseModel):
|
||||
|
|
@ -105,7 +107,6 @@ class OpenAIListModelsResponse(BaseModel):
|
|||
@runtime_checkable
|
||||
@trace_protocol
|
||||
class Models(Protocol):
|
||||
@webmethod(route="/models", method="GET", level=LLAMA_STACK_API_V1)
|
||||
async def list_models(self) -> ListModelsResponse:
|
||||
"""List all models.
|
||||
|
||||
|
|
@ -113,7 +114,7 @@ class Models(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/models", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/models", method="GET", level=LLAMA_STACK_API_V1)
|
||||
async def openai_list_models(self) -> OpenAIListModelsResponse:
|
||||
"""List models using the OpenAI API.
|
||||
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ from pydantic import BaseModel, Field
|
|||
from llama_stack.apis.common.content_types import URL
|
||||
from llama_stack.apis.common.job_types import JobStatus
|
||||
from llama_stack.apis.common.training_types import Checkpoint
|
||||
from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
|
||||
from llama_stack.apis.version import LLAMA_STACK_API_V1ALPHA
|
||||
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
|
||||
|
||||
|
||||
|
|
@ -284,7 +284,6 @@ class PostTrainingJobArtifactsResponse(BaseModel):
|
|||
|
||||
|
||||
class PostTraining(Protocol):
|
||||
@webmethod(route="/post-training/supervised-fine-tune", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/post-training/supervised-fine-tune", method="POST", level=LLAMA_STACK_API_V1ALPHA)
|
||||
async def supervised_fine_tune(
|
||||
self,
|
||||
|
|
@ -312,7 +311,6 @@ class PostTraining(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/post-training/preference-optimize", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/post-training/preference-optimize", method="POST", level=LLAMA_STACK_API_V1ALPHA)
|
||||
async def preference_optimize(
|
||||
self,
|
||||
|
|
@ -335,7 +333,6 @@ class PostTraining(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/post-training/jobs", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/post-training/jobs", method="GET", level=LLAMA_STACK_API_V1ALPHA)
|
||||
async def get_training_jobs(self) -> ListPostTrainingJobsResponse:
|
||||
"""Get all training jobs.
|
||||
|
|
@ -344,7 +341,6 @@ class PostTraining(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/post-training/job/status", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/post-training/job/status", method="GET", level=LLAMA_STACK_API_V1ALPHA)
|
||||
async def get_training_job_status(self, job_uuid: str) -> PostTrainingJobStatusResponse:
|
||||
"""Get the status of a training job.
|
||||
|
|
@ -354,7 +350,6 @@ class PostTraining(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/post-training/job/cancel", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/post-training/job/cancel", method="POST", level=LLAMA_STACK_API_V1ALPHA)
|
||||
async def cancel_training_job(self, job_uuid: str) -> None:
|
||||
"""Cancel a training job.
|
||||
|
|
@ -363,7 +358,6 @@ class PostTraining(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/post-training/job/artifacts", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/post-training/job/artifacts", method="GET", level=LLAMA_STACK_API_V1ALPHA)
|
||||
async def get_training_job_artifacts(self, job_uuid: str) -> PostTrainingJobArtifactsResponse:
|
||||
"""Get the artifacts of a training job.
|
||||
|
|
|
|||
|
|
@ -121,7 +121,6 @@ class Safety(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/moderations", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/moderations", method="POST", level=LLAMA_STACK_API_V1)
|
||||
async def run_moderation(self, input: str | list[str], model: str | None = None) -> ModerationObject:
|
||||
"""Create moderation.
|
||||
|
|
|
|||
|
|
@ -1,7 +0,0 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from .synthetic_data_generation import *
|
||||
|
|
@ -1,77 +0,0 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from enum import Enum
|
||||
from typing import Any, Protocol
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from llama_stack.apis.inference import Message
|
||||
from llama_stack.apis.version import LLAMA_STACK_API_V1
|
||||
from llama_stack.schema_utils import json_schema_type, webmethod
|
||||
|
||||
|
||||
class FilteringFunction(Enum):
|
||||
"""The type of filtering function.
|
||||
|
||||
:cvar none: No filtering applied, accept all generated synthetic data
|
||||
:cvar random: Random sampling of generated data points
|
||||
:cvar top_k: Keep only the top-k highest scoring synthetic data samples
|
||||
:cvar top_p: Nucleus-style filtering, keep samples exceeding cumulative score threshold
|
||||
:cvar top_k_top_p: Combined top-k and top-p filtering strategy
|
||||
:cvar sigmoid: Apply sigmoid function for probability-based filtering
|
||||
"""
|
||||
|
||||
none = "none"
|
||||
random = "random"
|
||||
top_k = "top_k"
|
||||
top_p = "top_p"
|
||||
top_k_top_p = "top_k_top_p"
|
||||
sigmoid = "sigmoid"
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class SyntheticDataGenerationRequest(BaseModel):
|
||||
"""Request to generate synthetic data. A small batch of prompts and a filtering function
|
||||
|
||||
:param dialogs: List of conversation messages to use as input for synthetic data generation
|
||||
:param filtering_function: Type of filtering to apply to generated synthetic data samples
|
||||
:param model: (Optional) The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint
|
||||
"""
|
||||
|
||||
dialogs: list[Message]
|
||||
filtering_function: FilteringFunction = FilteringFunction.none
|
||||
model: str | None = None
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class SyntheticDataGenerationResponse(BaseModel):
|
||||
"""Response from the synthetic data generation. Batch of (prompt, response, score) tuples that pass the threshold.
|
||||
|
||||
:param synthetic_data: List of generated synthetic data samples that passed the filtering criteria
|
||||
:param statistics: (Optional) Statistical information about the generation process and filtering results
|
||||
"""
|
||||
|
||||
synthetic_data: list[dict[str, Any]]
|
||||
statistics: dict[str, Any] | None = None
|
||||
|
||||
|
||||
class SyntheticDataGeneration(Protocol):
|
||||
@webmethod(route="/synthetic-data-generation/generate", level=LLAMA_STACK_API_V1)
|
||||
def synthetic_data_generate(
|
||||
self,
|
||||
dialogs: list[Message],
|
||||
filtering_function: FilteringFunction = FilteringFunction.none,
|
||||
model: str | None = None,
|
||||
) -> SyntheticDataGenerationResponse:
|
||||
"""Generate synthetic data based on input dialogs and apply filtering.
|
||||
|
||||
:param dialogs: List of conversation messages to use as input for synthetic data generation
|
||||
:param filtering_function: Type of filtering to apply to generated synthetic data samples
|
||||
:param model: (Optional) The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint
|
||||
:returns: Response containing filtered synthetic data samples and optional statistics
|
||||
"""
|
||||
...
|
||||
|
|
@ -8,7 +8,6 @@
|
|||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
import uuid
|
||||
from typing import Annotated, Any, Literal, Protocol, runtime_checkable
|
||||
|
||||
from fastapi import Body
|
||||
|
|
@ -18,7 +17,6 @@ from llama_stack.apis.inference import InterleavedContent
|
|||
from llama_stack.apis.vector_stores import VectorStore
|
||||
from llama_stack.apis.version import LLAMA_STACK_API_V1
|
||||
from llama_stack.core.telemetry.trace_protocol import trace_protocol
|
||||
from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id
|
||||
from llama_stack.schema_utils import json_schema_type, webmethod
|
||||
from llama_stack.strong_typing.schema import register_schema
|
||||
|
||||
|
|
@ -61,38 +59,19 @@ class Chunk(BaseModel):
|
|||
"""
|
||||
A chunk of content that can be inserted into a vector database.
|
||||
:param content: The content of the chunk, which can be interleaved text, images, or other types.
|
||||
:param embedding: Optional embedding for the chunk. If not provided, it will be computed later.
|
||||
:param chunk_id: Unique identifier for the chunk. Must be provided explicitly.
|
||||
:param metadata: Metadata associated with the chunk that will be used in the model context during inference.
|
||||
:param stored_chunk_id: The chunk ID that is stored in the vector database. Used for backend functionality.
|
||||
:param embedding: Optional embedding for the chunk. If not provided, it will be computed later.
|
||||
:param chunk_metadata: Metadata for the chunk that will NOT be used in the context during inference.
|
||||
The `chunk_metadata` is required backend functionality.
|
||||
"""
|
||||
|
||||
content: InterleavedContent
|
||||
chunk_id: str
|
||||
metadata: dict[str, Any] = Field(default_factory=dict)
|
||||
embedding: list[float] | None = None
|
||||
# The alias parameter serializes the field as "chunk_id" in JSON but keeps the internal name as "stored_chunk_id"
|
||||
stored_chunk_id: str | None = Field(default=None, alias="chunk_id")
|
||||
chunk_metadata: ChunkMetadata | None = None
|
||||
|
||||
model_config = {"populate_by_name": True}
|
||||
|
||||
def model_post_init(self, __context):
|
||||
# Extract chunk_id from metadata if present
|
||||
if self.metadata and "chunk_id" in self.metadata:
|
||||
self.stored_chunk_id = self.metadata.pop("chunk_id")
|
||||
|
||||
@property
|
||||
def chunk_id(self) -> str:
|
||||
"""Returns the chunk ID, which is either an input `chunk_id` or a generated one if not set."""
|
||||
if self.stored_chunk_id:
|
||||
return self.stored_chunk_id
|
||||
|
||||
if "document_id" in self.metadata:
|
||||
return generate_chunk_id(self.metadata["document_id"], str(self.content))
|
||||
|
||||
return generate_chunk_id(str(uuid.uuid4()), str(self.content))
|
||||
|
||||
@property
|
||||
def document_id(self) -> str | None:
|
||||
"""Returns the document_id from either metadata or chunk_metadata, with metadata taking precedence."""
|
||||
|
|
@ -566,7 +545,6 @@ class VectorIO(Protocol):
|
|||
...
|
||||
|
||||
# OpenAI Vector Stores API endpoints
|
||||
@webmethod(route="/openai/v1/vector_stores", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/vector_stores", method="POST", level=LLAMA_STACK_API_V1)
|
||||
async def openai_create_vector_store(
|
||||
self,
|
||||
|
|
@ -579,7 +557,6 @@ class VectorIO(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/vector_stores", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/vector_stores", method="GET", level=LLAMA_STACK_API_V1)
|
||||
async def openai_list_vector_stores(
|
||||
self,
|
||||
|
|
@ -598,9 +575,6 @@ class VectorIO(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/vector_stores/{vector_store_id}", method="GET", level=LLAMA_STACK_API_V1, deprecated=True
|
||||
)
|
||||
@webmethod(route="/vector_stores/{vector_store_id}", method="GET", level=LLAMA_STACK_API_V1)
|
||||
async def openai_retrieve_vector_store(
|
||||
self,
|
||||
|
|
@ -613,9 +587,6 @@ class VectorIO(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/vector_stores/{vector_store_id}", method="POST", level=LLAMA_STACK_API_V1, deprecated=True
|
||||
)
|
||||
@webmethod(
|
||||
route="/vector_stores/{vector_store_id}",
|
||||
method="POST",
|
||||
|
|
@ -638,9 +609,6 @@ class VectorIO(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/vector_stores/{vector_store_id}", method="DELETE", level=LLAMA_STACK_API_V1, deprecated=True
|
||||
)
|
||||
@webmethod(
|
||||
route="/vector_stores/{vector_store_id}",
|
||||
method="DELETE",
|
||||
|
|
@ -657,12 +625,6 @@ class VectorIO(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/vector_stores/{vector_store_id}/search",
|
||||
method="POST",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
deprecated=True,
|
||||
)
|
||||
@webmethod(
|
||||
route="/vector_stores/{vector_store_id}/search",
|
||||
method="POST",
|
||||
|
|
@ -695,12 +657,6 @@ class VectorIO(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/vector_stores/{vector_store_id}/files",
|
||||
method="POST",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
deprecated=True,
|
||||
)
|
||||
@webmethod(
|
||||
route="/vector_stores/{vector_store_id}/files",
|
||||
method="POST",
|
||||
|
|
@ -723,12 +679,6 @@ class VectorIO(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/vector_stores/{vector_store_id}/files",
|
||||
method="GET",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
deprecated=True,
|
||||
)
|
||||
@webmethod(
|
||||
route="/vector_stores/{vector_store_id}/files",
|
||||
method="GET",
|
||||
|
|
@ -755,12 +705,6 @@ class VectorIO(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/vector_stores/{vector_store_id}/files/{file_id}",
|
||||
method="GET",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
deprecated=True,
|
||||
)
|
||||
@webmethod(
|
||||
route="/vector_stores/{vector_store_id}/files/{file_id}",
|
||||
method="GET",
|
||||
|
|
@ -779,12 +723,6 @@ class VectorIO(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/vector_stores/{vector_store_id}/files/{file_id}/content",
|
||||
method="GET",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
deprecated=True,
|
||||
)
|
||||
@webmethod(
|
||||
route="/vector_stores/{vector_store_id}/files/{file_id}/content",
|
||||
method="GET",
|
||||
|
|
@ -803,12 +741,6 @@ class VectorIO(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/vector_stores/{vector_store_id}/files/{file_id}",
|
||||
method="POST",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
deprecated=True,
|
||||
)
|
||||
@webmethod(
|
||||
route="/vector_stores/{vector_store_id}/files/{file_id}",
|
||||
method="POST",
|
||||
|
|
@ -829,12 +761,6 @@ class VectorIO(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/vector_stores/{vector_store_id}/files/{file_id}",
|
||||
method="DELETE",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
deprecated=True,
|
||||
)
|
||||
@webmethod(
|
||||
route="/vector_stores/{vector_store_id}/files/{file_id}",
|
||||
method="DELETE",
|
||||
|
|
@ -858,12 +784,6 @@ class VectorIO(Protocol):
|
|||
method="POST",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
)
|
||||
@webmethod(
|
||||
route="/openai/v1/vector_stores/{vector_store_id}/file_batches",
|
||||
method="POST",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
deprecated=True,
|
||||
)
|
||||
async def openai_create_vector_store_file_batch(
|
||||
self,
|
||||
vector_store_id: str,
|
||||
|
|
@ -882,12 +802,6 @@ class VectorIO(Protocol):
|
|||
method="GET",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
)
|
||||
@webmethod(
|
||||
route="/openai/v1/vector_stores/{vector_store_id}/file_batches/{batch_id}",
|
||||
method="GET",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
deprecated=True,
|
||||
)
|
||||
async def openai_retrieve_vector_store_file_batch(
|
||||
self,
|
||||
batch_id: str,
|
||||
|
|
@ -901,12 +815,6 @@ class VectorIO(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/files",
|
||||
method="GET",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
deprecated=True,
|
||||
)
|
||||
@webmethod(
|
||||
route="/vector_stores/{vector_store_id}/file_batches/{batch_id}/files",
|
||||
method="GET",
|
||||
|
|
@ -935,12 +843,6 @@ class VectorIO(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/cancel",
|
||||
method="POST",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
deprecated=True,
|
||||
)
|
||||
@webmethod(
|
||||
route="/vector_stores/{vector_store_id}/file_batches/{batch_id}/cancel",
|
||||
method="POST",
|
||||
|
|
|
|||
|
|
@ -8,16 +8,30 @@ import argparse
|
|||
import os
|
||||
import ssl
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import uvicorn
|
||||
import yaml
|
||||
from termcolor import cprint
|
||||
|
||||
from llama_stack.cli.stack.utils import ImageType
|
||||
from llama_stack.cli.subcommand import Subcommand
|
||||
from llama_stack.core.datatypes import StackRunConfig
|
||||
from llama_stack.core.datatypes import Api, Provider, StackRunConfig
|
||||
from llama_stack.core.distribution import get_provider_registry
|
||||
from llama_stack.core.stack import cast_image_name_to_string, replace_env_vars
|
||||
from llama_stack.core.storage.datatypes import (
|
||||
InferenceStoreReference,
|
||||
KVStoreReference,
|
||||
ServerStoresConfig,
|
||||
SqliteKVStoreConfig,
|
||||
SqliteSqlStoreConfig,
|
||||
SqlStoreReference,
|
||||
StorageConfig,
|
||||
)
|
||||
from llama_stack.core.utils.config_dirs import DISTRIBS_BASE_DIR
|
||||
from llama_stack.core.utils.config_resolution import Mode, resolve_config_or_distro
|
||||
from llama_stack.core.utils.dynamic import instantiate_class_type
|
||||
from llama_stack.log import LoggingConfig, get_logger
|
||||
|
||||
REPO_ROOT = Path(__file__).parent.parent.parent.parent
|
||||
|
|
@ -68,6 +82,12 @@ class StackRun(Subcommand):
            action="store_true",
            help="Start the UI server",
        )
        self.parser.add_argument(
            "--providers",
            type=str,
            default=None,
            help="Run a stack with only a list of providers. The list is formatted like: api1=provider1,api1=provider2,api2=provider3; there can be multiple providers per API.",
        )

    def _run_stack_run_cmd(self, args: argparse.Namespace) -> None:
        import yaml
@ -93,6 +113,55 @@ class StackRun(Subcommand):
                config_file = resolve_config_or_distro(args.config, Mode.RUN)
            except ValueError as e:
                self.parser.error(str(e))
        elif args.providers:
            provider_list: dict[str, list[Provider]] = dict()
            for api_provider in args.providers.split(","):
                if "=" not in api_provider:
                    cprint(
                        "Could not parse `--providers`. Please ensure the list is in the format api1=provider1,api2=provider2",
                        color="red",
                        file=sys.stderr,
                    )
                    sys.exit(1)
                api, provider_type = api_provider.split("=")
                providers_for_api = get_provider_registry().get(Api(api), None)
                if providers_for_api is None:
                    cprint(
                        f"{api} is not a valid API.",
                        color="red",
                        file=sys.stderr,
                    )
                    sys.exit(1)
                if provider_type in providers_for_api:
                    config_type = instantiate_class_type(providers_for_api[provider_type].config_class)
                    if config_type is not None and hasattr(config_type, "sample_run_config"):
                        config = config_type.sample_run_config(__distro_dir__="~/.llama/distributions/providers-run")
                    else:
                        config = {}
                    provider = Provider(
                        provider_type=provider_type,
                        config=config,
                        provider_id=provider_type.split("::")[1],
                    )
                    provider_list.setdefault(api, []).append(provider)
                else:
                    cprint(
                        f"{provider_type} is not a valid provider for the {api} API.",
                        color="red",
                        file=sys.stderr,
                    )
                    sys.exit(1)
            run_config = self._generate_run_config_from_providers(providers=provider_list)
            config_dict = run_config.model_dump(mode="json")

            # Write config to disk in providers-run directory
            distro_dir = DISTRIBS_BASE_DIR / "providers-run"
            config_file = distro_dir / "run.yaml"

            logger.info(f"Writing generated config to: {config_file}")
            with open(config_file, "w") as f:
                yaml.dump(config_dict, f, default_flow_style=False, sort_keys=False)

        else:
            config_file = None
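A short, self-contained sketch of how the `--providers` string above is split into an API-to-providers mapping. The provider type names used in the assertion (`remote::ollama`, `inline::faiss`) are illustrative only.

```python
# Sketch of the api1=provider1,api2=provider2 parsing performed by the command above.
def parse_providers(spec: str) -> dict[str, list[str]]:
    out: dict[str, list[str]] = {}
    for pair in spec.split(","):
        if "=" not in pair:
            raise ValueError("expected api=provider pairs, e.g. inference=remote::ollama")
        api, provider_type = pair.split("=", 1)
        out.setdefault(api, []).append(provider_type)
    return out


assert parse_providers("inference=remote::ollama,vector_io=inline::faiss") == {
    "inference": ["remote::ollama"],
    "vector_io": ["inline::faiss"],
}
```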
@ -106,7 +175,8 @@ class StackRun(Subcommand):

            try:
                config = parse_and_maybe_upgrade_config(config_dict)
                if not os.path.exists(str(config.external_providers_dir)):
                # Create external_providers_dir if it's specified and doesn't exist
                if config.external_providers_dir and not os.path.exists(str(config.external_providers_dir)):
                    os.makedirs(str(config.external_providers_dir), exist_ok=True)
            except AttributeError as e:
                self.parser.error(f"failed to parse config file '{config_file}':\n {e}")

@ -127,7 +197,7 @@ class StackRun(Subcommand):
            config = StackRunConfig(**cast_image_name_to_string(replace_env_vars(config_contents)))

        port = args.port or config.server.port
        host = config.server.host or ["::", "0.0.0.0"]
        host = config.server.host or "0.0.0.0"

        # Set the config file in environment so create_app can find it
        os.environ["LLAMA_STACK_CONFIG"] = str(config_file)

@ -139,6 +209,7 @@ class StackRun(Subcommand):
            "lifespan": "on",
            "log_level": logger.getEffectiveLevel(),
            "log_config": logger_config,
            "workers": config.server.workers,
        }

        keyfile = config.server.tls_keyfile

@ -168,7 +239,7 @@ class StackRun(Subcommand):
        # Another approach would be to ignore SIGINT entirely - let uvicorn handle it through its own
        # signal handling but this is quite intrusive and not worth the effort.
        try:
            uvicorn.run("llama_stack.core.server.server:create_app", **uvicorn_config)
            uvicorn.run("llama_stack.core.server.server:create_app", **uvicorn_config)  # type: ignore[arg-type]
        except (KeyboardInterrupt, SystemExit):
            logger.info("Received interrupt signal, shutting down gracefully...")

@ -212,3 +283,44 @@ class StackRun(Subcommand):
            )
        except Exception as e:
            logger.error(f"Failed to start UI development server in {ui_dir}: {e}")

    def _generate_run_config_from_providers(self, providers: dict[str, list[Provider]]):
        apis = list(providers.keys())
        distro_dir = DISTRIBS_BASE_DIR / "providers-run"
        # need somewhere to put the storage.
        os.makedirs(distro_dir, exist_ok=True)
        storage = StorageConfig(
            backends={
                "kv_default": SqliteKVStoreConfig(
                    db_path=f"${{env.SQLITE_STORE_DIR:={distro_dir}}}/kvstore.db",
                ),
                "sql_default": SqliteSqlStoreConfig(
                    db_path=f"${{env.SQLITE_STORE_DIR:={distro_dir}}}/sql_store.db",
                ),
            },
            stores=ServerStoresConfig(
                metadata=KVStoreReference(
                    backend="kv_default",
                    namespace="registry",
                ),
                inference=InferenceStoreReference(
                    backend="sql_default",
                    table_name="inference_store",
                ),
                conversations=SqlStoreReference(
                    backend="sql_default",
                    table_name="openai_conversations",
                ),
                prompts=KVStoreReference(
                    backend="kv_default",
                    namespace="prompts",
                ),
            ),
        )

        return StackRunConfig(
            image_name="providers-run",
            apis=apis,
            providers=providers,
            storage=storage,
        )
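The `${{env.SQLITE_STORE_DIR:={distro_dir}}}` f-strings above render to the stack's `${env.VAR:=default}` placeholder form, which `replace_env_vars` (imported by this command) is presumably responsible for expanding at load time. A tiny illustration, with the directory value assumed:

```python
# What one generated db_path value looks like before env-var expansion (illustrative path).
distro_dir = "/home/user/.llama/distributions/providers-run"
db_path = f"${{env.SQLITE_STORE_DIR:={distro_dir}}}/kvstore.db"
print(db_path)
# -> ${env.SQLITE_STORE_DIR:=/home/user/.llama/distributions/providers-run}/kvstore.db
```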
@ -17,7 +17,6 @@ from llama_stack.core.distribution import (
|
|||
get_provider_registry,
|
||||
)
|
||||
from llama_stack.core.stack import cast_image_name_to_string, replace_env_vars
|
||||
from llama_stack.core.utils.config_dirs import EXTERNAL_PROVIDERS_DIR
|
||||
from llama_stack.core.utils.dynamic import instantiate_class_type
|
||||
from llama_stack.core.utils.prompt_for_config import prompt_for_config
|
||||
from llama_stack.log import get_logger
|
||||
|
|
@ -194,19 +193,11 @@ def upgrade_from_routing_table(
|
|||
|
||||
|
||||
def parse_and_maybe_upgrade_config(config_dict: dict[str, Any]) -> StackRunConfig:
|
||||
version = config_dict.get("version", None)
|
||||
if version == LLAMA_STACK_RUN_CONFIG_VERSION:
|
||||
processed_config_dict = replace_env_vars(config_dict)
|
||||
return StackRunConfig(**cast_image_name_to_string(processed_config_dict))
|
||||
|
||||
if "routing_table" in config_dict:
|
||||
logger.info("Upgrading config...")
|
||||
config_dict = upgrade_from_routing_table(config_dict)
|
||||
|
||||
config_dict["version"] = LLAMA_STACK_RUN_CONFIG_VERSION
|
||||
|
||||
if not config_dict.get("external_providers_dir", None):
|
||||
config_dict["external_providers_dir"] = EXTERNAL_PROVIDERS_DIR
|
||||
|
||||
processed_config_dict = replace_env_vars(config_dict)
|
||||
return StackRunConfig(**cast_image_name_to_string(processed_config_dict))
|
||||
|
|
|
|||
|
|
@ -473,6 +473,10 @@ class ServerConfig(BaseModel):
|
|||
"- true: Enable localhost CORS for development\n"
|
||||
"- {allow_origins: [...], allow_methods: [...], ...}: Full configuration",
|
||||
)
|
||||
workers: int = Field(
|
||||
default=1,
|
||||
description="Number of workers to use for the server",
|
||||
)
|
||||
|
||||
|
||||
class StackRunConfig(BaseModel):
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ from llama_stack.apis.inspect import (
    RouteInfo,
    VersionInfo,
)
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.core.datatypes import StackRunConfig
from llama_stack.core.external import load_external_apis
from llama_stack.core.server.routes import get_all_api_routes

@ -39,9 +40,21 @@ class DistributionInspectImpl(Inspect):
    async def initialize(self) -> None:
        pass

    async def list_routes(self) -> ListRoutesResponse:
    async def list_routes(self, api_filter: str | None = None) -> ListRoutesResponse:
        run_config: StackRunConfig = self.config.run_config

        # Helper function to determine if a route should be included based on api_filter
        def should_include_route(webmethod) -> bool:
            if api_filter is None:
                # Default: only non-deprecated v1 APIs
                return not webmethod.deprecated and webmethod.level == LLAMA_STACK_API_V1
            elif api_filter == "deprecated":
                # Special filter: show deprecated routes regardless of their actual level
                return bool(webmethod.deprecated)
            else:
                # Filter by API level (non-deprecated routes only)
                return not webmethod.deprecated and webmethod.level == api_filter

        ret = []
        external_apis = load_external_apis(run_config)
        all_endpoints = get_all_api_routes(external_apis)

@ -55,8 +68,8 @@ class DistributionInspectImpl(Inspect):
                        method=next(iter([m for m in e.methods if m != "HEAD"])),
                        provider_types=[],  # These APIs don't have "real" providers - they're internal to the stack
                    )
                    for e, _ in endpoints
                    if e.methods is not None
                    for e, webmethod in endpoints
                    if e.methods is not None and should_include_route(webmethod)
                ]
            )
        else:

@ -69,8 +82,8 @@ class DistributionInspectImpl(Inspect):
                        method=next(iter([m for m in e.methods if m != "HEAD"])),
                        provider_types=[p.provider_type for p in providers],
                    )
                    for e, _ in endpoints
                    if e.methods is not None
                    for e, webmethod in endpoints
                    if e.methods is not None and should_include_route(webmethod)
                ]
            )
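Assuming the inspect routes listing is exposed over HTTP (the path `/v1/inspect/routes`, the query-parameter name, and the response keys below are assumptions, not taken from this diff), the new `api_filter` might be exercised roughly like this:

```python
# Hypothetical request showing the api_filter parameter added above.
import httpx

resp = httpx.get(
    "http://localhost:8321/v1/inspect/routes",
    params={"api_filter": "deprecated"},  # or an API level such as "v1"
)
for route in resp.json().get("data", []):
    print(route)
```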
@ -6,7 +6,7 @@
|
|||
|
||||
import asyncio
|
||||
import time
|
||||
from collections.abc import AsyncGenerator, AsyncIterator
|
||||
from collections.abc import AsyncIterator
|
||||
from datetime import UTC, datetime
|
||||
from typing import Annotated, Any
|
||||
|
||||
|
|
@ -15,20 +15,10 @@ from openai.types.chat import ChatCompletionToolChoiceOptionParam as OpenAIChatC
|
|||
from openai.types.chat import ChatCompletionToolParam as OpenAIChatCompletionToolParam
|
||||
from pydantic import TypeAdapter
|
||||
|
||||
from llama_stack.apis.common.content_types import (
|
||||
InterleavedContent,
|
||||
)
|
||||
from llama_stack.apis.common.errors import ModelNotFoundError, ModelTypeError
|
||||
from llama_stack.apis.inference import (
|
||||
ChatCompletionResponse,
|
||||
ChatCompletionResponseEventType,
|
||||
ChatCompletionResponseStreamChunk,
|
||||
CompletionMessage,
|
||||
CompletionResponse,
|
||||
CompletionResponseStreamChunk,
|
||||
Inference,
|
||||
ListOpenAIChatCompletionResponse,
|
||||
Message,
|
||||
OpenAIAssistantMessageParam,
|
||||
OpenAIChatCompletion,
|
||||
OpenAIChatCompletionChunk,
|
||||
|
|
@ -45,15 +35,13 @@ from llama_stack.apis.inference import (
|
|||
OpenAIMessageParam,
|
||||
Order,
|
||||
RerankResponse,
|
||||
StopReason,
|
||||
ToolPromptFormat,
|
||||
)
|
||||
from llama_stack.apis.inference.inference import (
|
||||
OpenAIChatCompletionContentPartImageParam,
|
||||
OpenAIChatCompletionContentPartTextParam,
|
||||
)
|
||||
from llama_stack.apis.models import Model, ModelType
|
||||
from llama_stack.core.telemetry.telemetry import MetricEvent, MetricInResponse
|
||||
from llama_stack.apis.models import ModelType
|
||||
from llama_stack.core.telemetry.telemetry import MetricEvent
|
||||
from llama_stack.core.telemetry.tracing import enqueue_event, get_current_span
|
||||
from llama_stack.log import get_logger
|
||||
from llama_stack.models.llama.llama3.chat_format import ChatFormat
|
||||
|
|
@ -110,7 +98,8 @@ class InferenceRouter(Inference):
|
|||
prompt_tokens: int,
|
||||
completion_tokens: int,
|
||||
total_tokens: int,
|
||||
model: Model,
|
||||
fully_qualified_model_id: str,
|
||||
provider_id: str,
|
||||
) -> list[MetricEvent]:
|
||||
"""Constructs a list of MetricEvent objects containing token usage metrics.
|
||||
|
||||
|
|
@ -118,7 +107,8 @@ class InferenceRouter(Inference):
|
|||
prompt_tokens: Number of tokens in the prompt
|
||||
completion_tokens: Number of tokens in the completion
|
||||
total_tokens: Total number of tokens used
|
||||
model: Model object containing model_id and provider_id
|
||||
fully_qualified_model_id:
|
||||
provider_id: The provider identifier
|
||||
|
||||
Returns:
|
||||
List of MetricEvent objects with token usage metrics
|
||||
|
|
@ -144,48 +134,32 @@ class InferenceRouter(Inference):
|
|||
timestamp=datetime.now(UTC),
|
||||
unit="tokens",
|
||||
attributes={
|
||||
"model_id": model.model_id,
|
||||
"provider_id": model.provider_id,
|
||||
"model_id": fully_qualified_model_id,
|
||||
"provider_id": provider_id,
|
||||
},
|
||||
)
|
||||
)
|
||||
return metric_events
|
||||
|
||||
async def _compute_and_log_token_usage(
|
||||
self,
|
||||
prompt_tokens: int,
|
||||
completion_tokens: int,
|
||||
total_tokens: int,
|
||||
model: Model,
|
||||
) -> list[MetricInResponse]:
|
||||
metrics = self._construct_metrics(prompt_tokens, completion_tokens, total_tokens, model)
|
||||
if self.telemetry_enabled:
|
||||
for metric in metrics:
|
||||
enqueue_event(metric)
|
||||
return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in metrics]
|
||||
|
||||
async def _count_tokens(
|
||||
self,
|
||||
messages: list[Message] | InterleavedContent,
|
||||
tool_prompt_format: ToolPromptFormat | None = None,
|
||||
) -> int | None:
|
||||
if not hasattr(self, "formatter") or self.formatter is None:
|
||||
return None
|
||||
|
||||
if isinstance(messages, list):
|
||||
encoded = self.formatter.encode_dialog_prompt(messages, tool_prompt_format)
|
||||
else:
|
||||
encoded = self.formatter.encode_content(messages)
|
||||
return len(encoded.tokens) if encoded and encoded.tokens else 0
|
||||
|
||||
async def _get_model(self, model_id: str, expected_model_type: str) -> Model:
|
||||
"""takes a model id and gets model after ensuring that it is accessible and of the correct type"""
|
||||
model = await self.routing_table.get_model(model_id)
|
||||
if model is None:
|
||||
raise ModelNotFoundError(model_id)
|
||||
async def _get_model_provider(self, model_id: str, expected_model_type: str) -> tuple[Inference, str]:
|
||||
model = await self.routing_table.get_object_by_identifier("model", model_id)
|
||||
if model:
|
||||
if model.model_type != expected_model_type:
|
||||
raise ModelTypeError(model_id, model.model_type, expected_model_type)
|
||||
return model
|
||||
|
||||
provider = await self.routing_table.get_provider_impl(model.identifier)
|
||||
return provider, model.provider_resource_id
|
||||
|
||||
splits = model_id.split("/", maxsplit=1)
|
||||
if len(splits) != 2:
|
||||
raise ModelNotFoundError(model_id)
|
||||
|
||||
provider_id, provider_resource_id = splits
|
||||
if provider_id not in self.routing_table.impls_by_provider_id:
|
||||
logger.warning(f"Provider {provider_id} not found for model {model_id}")
|
||||
raise ModelNotFoundError(model_id)
|
||||
|
||||
return self.routing_table.impls_by_provider_id[provider_id], provider_resource_id
|
||||
|
||||
async def rerank(
|
||||
self,
|
||||
|
|
@ -195,14 +169,8 @@ class InferenceRouter(Inference):
|
|||
max_num_results: int | None = None,
|
||||
) -> RerankResponse:
|
||||
logger.debug(f"InferenceRouter.rerank: {model}")
|
||||
model_obj = await self._get_model(model, ModelType.rerank)
|
||||
provider = await self.routing_table.get_provider_impl(model_obj.identifier)
|
||||
return await provider.rerank(
|
||||
model=model_obj.identifier,
|
||||
query=query,
|
||||
items=items,
|
||||
max_num_results=max_num_results,
|
||||
)
|
||||
provider, provider_resource_id = await self._get_model_provider(model, ModelType.rerank)
|
||||
return await provider.rerank(provider_resource_id, query, items, max_num_results)
|
||||
|
||||
async def openai_completion(
|
||||
self,
|
||||
|
|
@ -211,24 +179,24 @@ class InferenceRouter(Inference):
|
|||
logger.debug(
|
||||
f"InferenceRouter.openai_completion: model={params.model}, stream={params.stream}, prompt={params.prompt}",
|
||||
)
|
||||
model_obj = await self._get_model(params.model, ModelType.llm)
|
||||
request_model_id = params.model
|
||||
provider, provider_resource_id = await self._get_model_provider(params.model, ModelType.llm)
|
||||
params.model = provider_resource_id
|
||||
|
||||
# Update params with the resolved model identifier
|
||||
params.model = model_obj.identifier
|
||||
|
||||
provider = await self.routing_table.get_provider_impl(model_obj.identifier)
|
||||
if params.stream:
|
||||
return await provider.openai_completion(params)
|
||||
# TODO: Metrics do NOT work with openai_completion stream=True due to the fact
|
||||
# that we do not return an AsyncIterator, our tests expect a stream of chunks we cannot intercept currently.
|
||||
|
||||
response = await provider.openai_completion(params)
|
||||
response.model = request_model_id
|
||||
if self.telemetry_enabled:
|
||||
metrics = self._construct_metrics(
|
||||
prompt_tokens=response.usage.prompt_tokens,
|
||||
completion_tokens=response.usage.completion_tokens,
|
||||
total_tokens=response.usage.total_tokens,
|
||||
model=model_obj,
|
||||
fully_qualified_model_id=request_model_id,
|
||||
provider_id=provider.__provider_id__,
|
||||
)
|
||||
for metric in metrics:
|
||||
enqueue_event(metric)
|
||||
|
|
@ -246,7 +214,9 @@ class InferenceRouter(Inference):
|
|||
logger.debug(
|
||||
f"InferenceRouter.openai_chat_completion: model={params.model}, stream={params.stream}, messages={params.messages}",
|
||||
)
|
||||
model_obj = await self._get_model(params.model, ModelType.llm)
|
||||
request_model_id = params.model
|
||||
provider, provider_resource_id = await self._get_model_provider(params.model, ModelType.llm)
|
||||
params.model = provider_resource_id
|
||||
|
||||
# Use the OpenAI client for a bit of extra input validation without
|
||||
# exposing the OpenAI client itself as part of our API surface
|
||||
|
|
@ -264,10 +234,6 @@ class InferenceRouter(Inference):
|
|||
params.tool_choice = None
|
||||
params.tools = None
|
||||
|
||||
# Update params with the resolved model identifier
|
||||
params.model = model_obj.identifier
|
||||
|
||||
provider = await self.routing_table.get_provider_impl(model_obj.identifier)
|
||||
if params.stream:
|
||||
response_stream = await provider.openai_chat_completion(params)
|
||||
|
||||
|
|
@ -275,11 +241,13 @@ class InferenceRouter(Inference):
|
|||
# We need to add metrics to each chunk and store the final completion
|
||||
return self.stream_tokens_and_compute_metrics_openai_chat(
|
||||
response=response_stream,
|
||||
model=model_obj,
|
||||
fully_qualified_model_id=request_model_id,
|
||||
provider_id=provider.__provider_id__,
|
||||
messages=params.messages,
|
||||
)
|
||||
|
||||
response = await self._nonstream_openai_chat_completion(provider, params)
|
||||
response.model = request_model_id
|
||||
|
||||
# Store the response with the ID that will be returned to the client
|
||||
if self.store:
|
||||
|
|
@ -290,7 +258,8 @@ class InferenceRouter(Inference):
|
|||
prompt_tokens=response.usage.prompt_tokens,
|
||||
completion_tokens=response.usage.completion_tokens,
|
||||
total_tokens=response.usage.total_tokens,
|
||||
model=model_obj,
|
||||
fully_qualified_model_id=request_model_id,
|
||||
provider_id=provider.__provider_id__,
|
||||
)
|
||||
for metric in metrics:
|
||||
enqueue_event(metric)
|
||||
|
|
@ -307,13 +276,13 @@ class InferenceRouter(Inference):
|
|||
logger.debug(
|
||||
f"InferenceRouter.openai_embeddings: model={params.model}, input_type={type(params.input)}, encoding_format={params.encoding_format}, dimensions={params.dimensions}",
|
||||
)
|
||||
model_obj = await self._get_model(params.model, ModelType.embedding)
|
||||
request_model_id = params.model
|
||||
provider, provider_resource_id = await self._get_model_provider(params.model, ModelType.embedding)
|
||||
params.model = provider_resource_id
|
||||
|
||||
# Update model to use resolved identifier
|
||||
params.model = model_obj.identifier
|
||||
|
||||
provider = await self.routing_table.get_provider_impl(model_obj.identifier)
|
||||
return await provider.openai_embeddings(params)
|
||||
response = await provider.openai_embeddings(params)
|
||||
response.model = request_model_id
|
||||
return response
|
||||
|
||||
async def list_chat_completions(
|
||||
self,
|
||||
|
|
@ -365,119 +334,11 @@ class InferenceRouter(Inference):
|
|||
)
|
||||
return health_statuses
|
||||
|
||||
async def stream_tokens_and_compute_metrics(
|
||||
self,
|
||||
response,
|
||||
prompt_tokens,
|
||||
model,
|
||||
tool_prompt_format: ToolPromptFormat | None = None,
|
||||
) -> AsyncGenerator[ChatCompletionResponseStreamChunk, None] | AsyncGenerator[CompletionResponseStreamChunk, None]:
|
||||
completion_text = ""
|
||||
async for chunk in response:
|
||||
complete = False
|
||||
if hasattr(chunk, "event"): # only ChatCompletions have .event
|
||||
if chunk.event.event_type == ChatCompletionResponseEventType.progress:
|
||||
if chunk.event.delta.type == "text":
|
||||
completion_text += chunk.event.delta.text
|
||||
if chunk.event.event_type == ChatCompletionResponseEventType.complete:
|
||||
complete = True
|
||||
completion_tokens = await self._count_tokens(
|
||||
[
|
||||
CompletionMessage(
|
||||
content=completion_text,
|
||||
stop_reason=StopReason.end_of_turn,
|
||||
)
|
||||
],
|
||||
tool_prompt_format=tool_prompt_format,
|
||||
)
|
||||
else:
|
||||
if hasattr(chunk, "delta"):
|
||||
completion_text += chunk.delta
|
||||
if hasattr(chunk, "stop_reason") and chunk.stop_reason and self.telemetry_enabled:
|
||||
complete = True
|
||||
completion_tokens = await self._count_tokens(completion_text)
|
||||
# if we are done receiving tokens
|
||||
if complete:
|
||||
total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
|
||||
|
||||
# Create a separate span for streaming completion metrics
|
||||
if self.telemetry_enabled:
|
||||
# Log metrics in the new span context
|
||||
completion_metrics = self._construct_metrics(
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
total_tokens=total_tokens,
|
||||
model=model,
|
||||
)
|
||||
for metric in completion_metrics:
|
||||
if metric.metric in [
|
||||
"completion_tokens",
|
||||
"total_tokens",
|
||||
]: # Only log completion and total tokens
|
||||
enqueue_event(metric)
|
||||
|
||||
# Return metrics in response
|
||||
async_metrics = [
|
||||
MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics
|
||||
]
|
||||
chunk.metrics = async_metrics if chunk.metrics is None else chunk.metrics + async_metrics
|
||||
else:
|
||||
# Fallback if no telemetry
|
||||
completion_metrics = self._construct_metrics(
|
||||
prompt_tokens or 0,
|
||||
completion_tokens or 0,
|
||||
total_tokens,
|
||||
model,
|
||||
)
|
||||
async_metrics = [
|
||||
MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics
|
||||
]
|
||||
chunk.metrics = async_metrics if chunk.metrics is None else chunk.metrics + async_metrics
|
||||
yield chunk
|
||||
|
||||
async def count_tokens_and_compute_metrics(
|
||||
self,
|
||||
response: ChatCompletionResponse | CompletionResponse,
|
||||
prompt_tokens,
|
||||
model,
|
||||
tool_prompt_format: ToolPromptFormat | None = None,
|
||||
):
|
||||
if isinstance(response, ChatCompletionResponse):
|
||||
content = [response.completion_message]
|
||||
else:
|
||||
content = response.content
|
||||
completion_tokens = await self._count_tokens(messages=content, tool_prompt_format=tool_prompt_format)
|
||||
total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
|
||||
|
||||
# Create a separate span for completion metrics
|
||||
if self.telemetry_enabled:
|
||||
# Log metrics in the new span context
|
||||
completion_metrics = self._construct_metrics(
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
total_tokens=total_tokens,
|
||||
model=model,
|
||||
)
|
||||
for metric in completion_metrics:
|
||||
if metric.metric in ["completion_tokens", "total_tokens"]: # Only log completion and total tokens
|
||||
enqueue_event(metric)
|
||||
|
||||
# Return metrics in response
|
||||
return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics]
|
||||
|
||||
# Fallback if no telemetry
|
||||
metrics = self._construct_metrics(
|
||||
prompt_tokens or 0,
|
||||
completion_tokens or 0,
|
||||
total_tokens,
|
||||
model,
|
||||
)
|
||||
return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in metrics]
|
||||
|
||||
async def stream_tokens_and_compute_metrics_openai_chat(
|
||||
self,
|
||||
response: AsyncIterator[OpenAIChatCompletionChunk],
|
||||
model: Model,
|
||||
fully_qualified_model_id: str,
|
||||
provider_id: str,
|
||||
messages: list[OpenAIMessageParam] | None = None,
|
||||
) -> AsyncIterator[OpenAIChatCompletionChunk]:
|
||||
"""Stream OpenAI chat completion chunks, compute metrics, and store the final completion."""
|
||||
|
|
@ -497,6 +358,8 @@ class InferenceRouter(Inference):
|
|||
if created is None and chunk.created:
|
||||
created = chunk.created
|
||||
|
||||
chunk.model = fully_qualified_model_id
|
||||
|
||||
# Accumulate choice data for final assembly
|
||||
if chunk.choices:
|
||||
for choice_delta in chunk.choices:
|
||||
|
|
@ -553,7 +416,8 @@ class InferenceRouter(Inference):
|
|||
prompt_tokens=chunk.usage.prompt_tokens,
|
||||
completion_tokens=chunk.usage.completion_tokens,
|
||||
total_tokens=chunk.usage.total_tokens,
|
||||
model=model,
|
||||
model_id=fully_qualified_model_id,
|
||||
provider_id=provider_id,
|
||||
)
|
||||
for metric in metrics:
|
||||
enqueue_event(metric)
|
||||
|
|
@ -601,7 +465,7 @@ class InferenceRouter(Inference):
|
|||
id=id,
|
||||
choices=assembled_choices,
|
||||
created=created or int(time.time()),
|
||||
model=model.identifier,
|
||||
model=fully_qualified_model_id,
|
||||
object="chat.completion",
|
||||
)
|
||||
logger.debug(f"InferenceRouter.completion_response: {final_response}")
|
||||
|
|
|
|||
|
|
@ -13,6 +13,8 @@ from llama_stack.core.datatypes import (
    ModelWithOwner,
    RegistryEntrySource,
)
from llama_stack.core.request_headers import PROVIDER_DATA_VAR, NeedsRequestProviderData
from llama_stack.core.utils.dynamic import instantiate_class_type
from llama_stack.log import get_logger

from .common import CommonRoutingTableImpl, lookup_model

@ -42,19 +44,104 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):

        await self.update_registered_models(provider_id, models)

    async def _get_dynamic_models_from_provider_data(self) -> list[Model]:
        """
        Fetch models from providers that have credentials in the current request's provider_data.

        This allows users to see models available to them from providers that require
        per-request API keys (via X-LlamaStack-Provider-Data header).

        Returns models with fully qualified identifiers (provider_id/model_id) but does NOT
        cache them in the registry since they are user-specific.
        """
        provider_data = PROVIDER_DATA_VAR.get()
        if not provider_data:
            return []

        dynamic_models = []

        for provider_id, provider in self.impls_by_provider_id.items():
            # Check if this provider supports provider_data
            if not isinstance(provider, NeedsRequestProviderData):
                continue

            # Check if provider has a validator (some providers like ollama don't need per-request credentials)
            spec = getattr(provider, "__provider_spec__", None)
            if not spec or not getattr(spec, "provider_data_validator", None):
                continue

            # Validate provider_data silently - we're speculatively checking all providers
            # so validation failures are expected when user didn't provide keys for this provider
            try:
                validator = instantiate_class_type(spec.provider_data_validator)
                validator(**provider_data)
            except Exception:
                # User didn't provide credentials for this provider - skip silently
                continue

            # Validation succeeded! User has credentials for this provider
            # Now try to list models
            try:
                models = await provider.list_models()
                if not models:
                    continue

                # Ensure models have fully qualified identifiers with provider_id prefix
                for model in models:
                    # Only add prefix if model identifier doesn't already have it
                    if not model.identifier.startswith(f"{provider_id}/"):
                        model.identifier = f"{provider_id}/{model.provider_resource_id}"

                    dynamic_models.append(model)

                logger.debug(f"Fetched {len(models)} models from provider {provider_id} using provider_data")

            except Exception as e:
                logger.debug(f"Failed to list models from provider {provider_id} with provider_data: {e}")
                continue

        return dynamic_models

    async def list_models(self) -> ListModelsResponse:
        return ListModelsResponse(data=await self.get_all_with_type("model"))
        # Get models from registry
        registry_models = await self.get_all_with_type("model")

        # Get additional models available via provider_data (user-specific, not cached)
        dynamic_models = await self._get_dynamic_models_from_provider_data()

        # Combine, avoiding duplicates (registry takes precedence)
        registry_identifiers = {m.identifier for m in registry_models}
        unique_dynamic_models = [m for m in dynamic_models if m.identifier not in registry_identifiers]

        return ListModelsResponse(data=registry_models + unique_dynamic_models)

    async def openai_list_models(self) -> OpenAIListModelsResponse:
        models = await self.get_all_with_type("model")
        # Get models from registry
        registry_models = await self.get_all_with_type("model")

        # Get additional models available via provider_data (user-specific, not cached)
        dynamic_models = await self._get_dynamic_models_from_provider_data()

        # Combine, avoiding duplicates (registry takes precedence)
        registry_identifiers = {m.identifier for m in registry_models}
        unique_dynamic_models = [m for m in dynamic_models if m.identifier not in registry_identifiers]

        all_models = registry_models + unique_dynamic_models

        openai_models = [
            OpenAIModel(
                id=model.identifier,
                object="model",
                created=int(time.time()),
                owned_by="llama_stack",
                custom_metadata={
                    "model_type": model.model_type,
                    "provider_id": model.provider_id,
                    "provider_resource_id": model.provider_resource_id,
                    **model.metadata,
                },
            )
            for model in models
            for model in all_models
        ]
        return OpenAIListModelsResponse(data=openai_models)
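The `X-LlamaStack-Provider-Data` header named in the docstring above carries per-request provider credentials as JSON. A hypothetical request follows; the endpoint path, port, and the `together_api_key` field name are illustrative assumptions (the exact key name depends on the provider's data validator):

```python
# Hypothetical listing of models using per-request provider credentials.
import json

import httpx

headers = {"X-LlamaStack-Provider-Data": json.dumps({"together_api_key": "sk-..."})}
resp = httpx.get("http://localhost:8321/v1/models", headers=headers)
print([m["id"] for m in resp.json().get("data", [])])
```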
@ -6,6 +6,7 @@
|
|||
|
||||
import ssl
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any
|
||||
from urllib.parse import parse_qs, urljoin, urlparse
|
||||
|
||||
import httpx
|
||||
|
|
@ -143,14 +144,21 @@ class OAuth2TokenAuthProvider(AuthProvider):
|
|||
if self.config.jwks and self.config.jwks.token:
|
||||
headers["Authorization"] = f"Bearer {self.config.jwks.token}"
|
||||
|
||||
self._jwks_client = jwt.PyJWKClient(
|
||||
self.config.jwks.uri if self.config.jwks else None,
|
||||
cache_keys=True,
|
||||
max_cached_keys=10,
|
||||
lifespan=self.config.jwks.key_recheck_period if self.config.jwks else None,
|
||||
headers=headers,
|
||||
ssl_context=ssl_context,
|
||||
)
|
||||
# Ensure uri is not None for PyJWKClient
|
||||
if not self.config.jwks or not self.config.jwks.uri:
|
||||
raise ValueError("JWKS configuration requires a valid URI")
|
||||
|
||||
# Build kwargs conditionally to avoid passing None values
|
||||
jwks_kwargs: dict[str, Any] = {
|
||||
"cache_keys": True,
|
||||
"max_cached_keys": 10,
|
||||
"headers": headers,
|
||||
"ssl_context": ssl_context,
|
||||
}
|
||||
if self.config.jwks.key_recheck_period is not None:
|
||||
jwks_kwargs["lifespan"] = self.config.jwks.key_recheck_period
|
||||
|
||||
self._jwks_client = jwt.PyJWKClient(self.config.jwks.uri, **jwks_kwargs)
|
||||
return self._jwks_client
|
||||
|
||||
async def validate_jwt_token(self, token: str, scope: dict | None = None) -> User:
|
||||
|
|
@ -197,23 +205,31 @@ class OAuth2TokenAuthProvider(AuthProvider):
|
|||
if self.config.introspection is None:
|
||||
raise ValueError("Introspection is not configured")
|
||||
|
||||
# ssl_ctxt can be None, bool, str, or SSLContext - httpx accepts all
|
||||
ssl_ctxt: ssl.SSLContext | bool = False # Default to no verification if no cafile
|
||||
if self.config.tls_cafile:
|
||||
ssl_ctxt = ssl.create_default_context(cafile=self.config.tls_cafile.as_posix())
|
||||
|
||||
# Build post kwargs conditionally based on auth method
|
||||
post_kwargs: dict[str, Any] = {
|
||||
"url": self.config.introspection.url,
|
||||
"data": form,
|
||||
"timeout": 10.0,
|
||||
}
|
||||
|
||||
if self.config.introspection.send_secret_in_body:
|
||||
form["client_id"] = self.config.introspection.client_id
|
||||
form["client_secret"] = self.config.introspection.client_secret
|
||||
auth = None
|
||||
else:
|
||||
auth = (self.config.introspection.client_id, self.config.introspection.client_secret)
|
||||
ssl_ctxt = None
|
||||
if self.config.tls_cafile:
|
||||
ssl_ctxt = ssl.create_default_context(cafile=self.config.tls_cafile.as_posix())
|
||||
# httpx auth parameter expects tuple[str | bytes, str | bytes]
|
||||
post_kwargs["auth"] = (
|
||||
self.config.introspection.client_id,
|
||||
self.config.introspection.client_secret,
|
||||
)
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(verify=ssl_ctxt) as client:
|
||||
response = await client.post(
|
||||
self.config.introspection.url,
|
||||
data=form,
|
||||
auth=auth,
|
||||
timeout=10.0, # Add a reasonable timeout
|
||||
)
|
||||
response = await client.post(**post_kwargs)
|
||||
if response.status_code != httpx.codes.OK:
|
||||
logger.warning(f"Token introspection failed with status code: {response.status_code}")
|
||||
raise ValueError(f"Token introspection failed: {response.status_code}")
|
||||
|
|
|
|||
|
|
@ -68,8 +68,9 @@ def get_all_api_routes(
|
|||
else:
|
||||
http_method = hdrs.METH_POST
|
||||
routes.append(
|
||||
(Route(path=path, methods=[http_method], name=name, endpoint=None), webmethod)
|
||||
) # setting endpoint to None since don't use a Router object
|
||||
# setting endpoint to None since don't use a Router object
|
||||
(Route(path=path, methods=[http_method], name=name, endpoint=None), webmethod) # type: ignore[arg-type]
|
||||
)
|
||||
|
||||
apis[api] = routes
|
||||
|
||||
|
|
@ -98,7 +99,7 @@ def initialize_route_impls(impls, external_apis: dict[Api, ExternalApiSpec] | No
|
|||
impl = impls[api]
|
||||
func = getattr(impl, route.name)
|
||||
# Get the first (and typically only) method from the set, filtering out HEAD
|
||||
available_methods = [m for m in route.methods if m != "HEAD"]
|
||||
available_methods = [m for m in (route.methods or []) if m != "HEAD"]
|
||||
if not available_methods:
|
||||
continue # Skip if only HEAD method is available
|
||||
method = available_methods[0].lower()
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ from typing import Any
|
|||
import yaml
|
||||
|
||||
from llama_stack.apis.agents import Agents
|
||||
from llama_stack.apis.batches import Batches
|
||||
from llama_stack.apis.benchmarks import Benchmarks
|
||||
from llama_stack.apis.conversations import Conversations
|
||||
from llama_stack.apis.datasetio import DatasetIO
|
||||
|
|
@ -30,7 +31,6 @@ from llama_stack.apis.safety import Safety
|
|||
from llama_stack.apis.scoring import Scoring
|
||||
from llama_stack.apis.scoring_functions import ScoringFunctions
|
||||
from llama_stack.apis.shields import Shields
|
||||
from llama_stack.apis.synthetic_data_generation import SyntheticDataGeneration
|
||||
from llama_stack.apis.tools import RAGToolRuntime, ToolGroups, ToolRuntime
|
||||
from llama_stack.apis.vector_io import VectorIO
|
||||
from llama_stack.core.conversations.conversations import ConversationServiceConfig, ConversationServiceImpl
|
||||
|
|
@ -63,8 +63,8 @@ class LlamaStack(
|
|||
Providers,
|
||||
Inference,
|
||||
Agents,
|
||||
Batches,
|
||||
Safety,
|
||||
SyntheticDataGeneration,
|
||||
Datasets,
|
||||
PostTraining,
|
||||
VectorIO,
|
||||
|
|
|
|||
|
|
@ -6,12 +6,14 @@
|
|||
|
||||
import os
|
||||
import threading
|
||||
from collections.abc import Mapping, Sequence
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import (
|
||||
Annotated,
|
||||
Any,
|
||||
Literal,
|
||||
cast,
|
||||
)
|
||||
|
||||
from opentelemetry import metrics, trace
|
||||
|
|
@ -30,6 +32,10 @@ from llama_stack.schema_utils import json_schema_type, register_schema
|
|||
|
||||
ROOT_SPAN_MARKERS = ["__root__", "__root_span__"]
|
||||
|
||||
# Type alias for OpenTelemetry attribute values (excludes None)
|
||||
AttributeValue = str | bool | int | float | Sequence[str] | Sequence[bool] | Sequence[int] | Sequence[float]
|
||||
Attributes = Mapping[str, AttributeValue]
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class SpanStatus(Enum):
|
||||
|
|
@ -428,6 +434,13 @@ _TRACER_PROVIDER = None
|
|||
logger = get_logger(name=__name__, category="telemetry")
|
||||
|
||||
|
||||
def _clean_attributes(attrs: dict[str, Any] | None) -> Attributes | None:
|
||||
"""Remove None values from attributes dict to match OpenTelemetry's expected type."""
|
||||
if attrs is None:
|
||||
return None
|
||||
return {k: v for k, v in attrs.items() if v is not None}
|
||||
|
||||
|
||||
def is_tracing_enabled(tracer):
|
||||
with tracer.start_as_current_span("check_tracing") as span:
|
||||
return span.is_recording()
|
||||
|
|
@ -456,7 +469,7 @@ class Telemetry:
|
|||
# https://opentelemetry.io/docs/languages/sdk-configuration/otlp-exporter
|
||||
span_exporter = OTLPSpanExporter()
|
||||
span_processor = BatchSpanProcessor(span_exporter)
|
||||
trace.get_tracer_provider().add_span_processor(span_processor)
|
||||
cast(TracerProvider, trace.get_tracer_provider()).add_span_processor(span_processor)
|
||||
|
||||
metric_reader = PeriodicExportingMetricReader(OTLPMetricExporter())
|
||||
metric_provider = MeterProvider(metric_readers=[metric_reader])
|
||||
|
|
@ -474,7 +487,7 @@ class Telemetry:
|
|||
|
||||
async def shutdown(self) -> None:
|
||||
if self.is_otel_endpoint_set:
|
||||
trace.get_tracer_provider().force_flush()
|
||||
cast(TracerProvider, trace.get_tracer_provider()).force_flush()
|
||||
|
||||
async def log_event(self, event: Event, ttl_seconds: int = 604800) -> None:
|
||||
if isinstance(event, UnstructuredLogEvent):
|
||||
|
|
@ -515,7 +528,7 @@ class Telemetry:
|
|||
unit=unit,
|
||||
description=f"Counter for {name}",
|
||||
)
|
||||
return _GLOBAL_STORAGE["counters"][name]
|
||||
return cast(metrics.Counter, _GLOBAL_STORAGE["counters"][name])
|
||||
|
||||
def _get_or_create_gauge(self, name: str, unit: str) -> metrics.ObservableGauge:
|
||||
assert self.meter is not None
|
||||
|
|
@ -525,7 +538,7 @@ class Telemetry:
|
|||
unit=unit,
|
||||
description=f"Gauge for {name}",
|
||||
)
|
||||
return _GLOBAL_STORAGE["gauges"][name]
|
||||
return cast(metrics.ObservableGauge, _GLOBAL_STORAGE["gauges"][name])
|
||||
|
||||
def _log_metric(self, event: MetricEvent) -> None:
|
||||
# Add metric as an event to the current span
|
||||
|
|
@ -560,10 +573,10 @@ class Telemetry:
|
|||
return
|
||||
if isinstance(event.value, int):
|
||||
counter = self._get_or_create_counter(event.metric, event.unit)
|
||||
counter.add(event.value, attributes=event.attributes)
|
||||
counter.add(event.value, attributes=_clean_attributes(event.attributes))
|
||||
elif isinstance(event.value, float):
|
||||
up_down_counter = self._get_or_create_up_down_counter(event.metric, event.unit)
|
||||
up_down_counter.add(event.value, attributes=event.attributes)
|
||||
up_down_counter.add(event.value, attributes=_clean_attributes(event.attributes))
|
||||
|
||||
def _get_or_create_up_down_counter(self, name: str, unit: str) -> metrics.UpDownCounter:
|
||||
assert self.meter is not None
|
||||
|
|
@ -573,7 +586,7 @@ class Telemetry:
|
|||
unit=unit,
|
||||
description=f"UpDownCounter for {name}",
|
||||
)
|
||||
return _GLOBAL_STORAGE["up_down_counters"][name]
|
||||
return cast(metrics.UpDownCounter, _GLOBAL_STORAGE["up_down_counters"][name])
|
||||
|
||||
def _log_structured(self, event: StructuredLogEvent, ttl_seconds: int) -> None:
|
||||
with self._lock:
|
||||
|
|
@ -601,6 +614,7 @@ class Telemetry:
|
|||
if event.payload.parent_span_id:
|
||||
parent_span_id = int(event.payload.parent_span_id, 16)
|
||||
parent_span = _GLOBAL_STORAGE["active_spans"].get(parent_span_id)
|
||||
if parent_span:
|
||||
context = trace.set_span_in_context(parent_span)
|
||||
elif traceparent:
|
||||
carrier = {
|
||||
|
|
@ -612,15 +626,17 @@ class Telemetry:
|
|||
span = tracer.start_span(
|
||||
name=event.payload.name,
|
||||
context=context,
|
||||
attributes=event.attributes or {},
|
||||
attributes=_clean_attributes(event.attributes),
|
||||
)
|
||||
_GLOBAL_STORAGE["active_spans"][span_id] = span
|
||||
|
||||
elif isinstance(event.payload, SpanEndPayload):
|
||||
span = _GLOBAL_STORAGE["active_spans"].get(span_id)
|
||||
span = _GLOBAL_STORAGE["active_spans"].get(span_id) # type: ignore[assignment]
|
||||
if span:
|
||||
if event.attributes:
|
||||
span.set_attributes(event.attributes)
|
||||
cleaned_attrs = _clean_attributes(event.attributes)
|
||||
if cleaned_attrs:
|
||||
span.set_attributes(cleaned_attrs)
|
||||
|
||||
status = (
|
||||
trace.Status(status_code=trace.StatusCode.OK)
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ from llama_stack.core.ui.modules.api import llama_stack_api
def models():
    # Models Section
    st.header("Models")
    models_info = {m.identifier: m.to_dict() for m in llama_stack_api.client.models.list()}
    models_info = {m.id: m.model_dump() for m in llama_stack_api.client.models.list()}

    selected_model = st.selectbox("Select a model", list(models_info.keys()))
    st.json(models_info[selected_model])

@ -12,7 +12,11 @@ from llama_stack.core.ui.modules.api import llama_stack_api
with st.sidebar:
    st.header("Configuration")
    available_models = llama_stack_api.client.models.list()
    available_models = [model.identifier for model in available_models if model.model_type == "llm"]
    available_models = [
        model.id
        for model in available_models
        if model.custom_metadata and model.custom_metadata.get("model_type") == "llm"
    ]
    selected_model = st.selectbox(
        "Choose a model",
        available_models,
@ -152,6 +152,37 @@ docker run \
|
|||
--port $LLAMA_STACK_PORT
|
||||
```
|
||||
|
||||
### Via Docker with Custom Run Configuration
|
||||
|
||||
You can also run the Docker container with a custom run configuration file by mounting it into the container:
|
||||
|
||||
```bash
|
||||
# Set the path to your custom run.yaml file
|
||||
CUSTOM_RUN_CONFIG=/path/to/your/custom-run.yaml
|
||||
|
||||
docker run -it \
|
||||
--pull always \
|
||||
--network host \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v $HOME/.llama:/root/.llama \
|
||||
-v $CUSTOM_RUN_CONFIG:/app/custom-run.yaml \
|
||||
-e RUN_CONFIG_PATH=/app/custom-run.yaml \
|
||||
-e INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||
-e DEH_URL=$DEH_URL \
|
||||
-e CHROMA_URL=$CHROMA_URL \
|
||||
llamastack/distribution-{{ name }} \
|
||||
--port $LLAMA_STACK_PORT
|
||||
```
|
||||
|
||||
**Note**: The run configuration must be mounted into the container before it can be used. The `-v` flag mounts your local file into the container, and the `RUN_CONFIG_PATH` environment variable tells the entrypoint script which configuration to use.
|
||||
|
||||
{% if run_configs %}
|
||||
Available run configurations for this distribution:
|
||||
{% for config in run_configs %}
|
||||
- `{{ config }}`
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
|
||||
### Via Conda
|
||||
|
||||
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
|
||||
|
|
|
|||
|
|
@ -109,6 +109,9 @@ storage:
|
|||
conversations:
|
||||
table_name: openai_conversations
|
||||
backend: sql_default
|
||||
prompts:
|
||||
namespace: prompts
|
||||
backend: kv_default
|
||||
registered_resources:
|
||||
models:
|
||||
- metadata: {}
|
||||
|
|
|
|||
|
|
@ -68,6 +68,36 @@ docker run \
|
|||
--port $LLAMA_STACK_PORT
|
||||
```
|
||||
|
||||
### Via Docker with Custom Run Configuration
|
||||
|
||||
You can also run the Docker container with a custom run configuration file by mounting it into the container:
|
||||
|
||||
```bash
|
||||
# Set the path to your custom run.yaml file
|
||||
CUSTOM_RUN_CONFIG=/path/to/your/custom-run.yaml
|
||||
LLAMA_STACK_PORT=8321
|
||||
|
||||
docker run \
|
||||
-it \
|
||||
--pull always \
|
||||
--gpu all \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ~/.llama:/root/.llama \
|
||||
-v $CUSTOM_RUN_CONFIG:/app/custom-run.yaml \
|
||||
-e RUN_CONFIG_PATH=/app/custom-run.yaml \
|
||||
llamastack/distribution-{{ name }} \
|
||||
--port $LLAMA_STACK_PORT
|
||||
```
|
||||
|
||||
**Note**: The run configuration must be mounted into the container before it can be used. The `-v` flag mounts your local file into the container, and the `RUN_CONFIG_PATH` environment variable tells the entrypoint script which configuration to use.
|
||||
|
||||
{% if run_configs %}
|
||||
Available run configurations for this distribution:
|
||||
{% for config in run_configs %}
|
||||
- `{{ config }}`
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
|
||||
### Via venv
|
||||
|
||||
Make sure you have the Llama Stack CLI available.
|
||||
|
|
|
|||
|
|
@ -122,6 +122,9 @@ storage:
|
|||
conversations:
|
||||
table_name: openai_conversations
|
||||
backend: sql_default
|
||||
prompts:
|
||||
namespace: prompts
|
||||
backend: kv_default
|
||||
registered_resources:
|
||||
models:
|
||||
- metadata: {}
|
||||
|
|
|
|||
|
|
@ -117,13 +117,42 @@ docker run \
|
|||
-it \
|
||||
--pull always \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ./run.yaml:/root/my-run.yaml \
|
||||
-v ~/.llama:/root/.llama \
|
||||
-e NVIDIA_API_KEY=$NVIDIA_API_KEY \
|
||||
llamastack/distribution-{{ name }} \
|
||||
--config /root/my-run.yaml \
|
||||
--port $LLAMA_STACK_PORT
|
||||
```
|
||||
|
||||
### Via Docker with Custom Run Configuration
|
||||
|
||||
You can also run the Docker container with a custom run configuration file by mounting it into the container:
|
||||
|
||||
```bash
|
||||
# Set the path to your custom run.yaml file
|
||||
CUSTOM_RUN_CONFIG=/path/to/your/custom-run.yaml
|
||||
LLAMA_STACK_PORT=8321
|
||||
|
||||
docker run \
|
||||
-it \
|
||||
--pull always \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ~/.llama:/root/.llama \
|
||||
-v $CUSTOM_RUN_CONFIG:/app/custom-run.yaml \
|
||||
-e RUN_CONFIG_PATH=/app/custom-run.yaml \
|
||||
-e NVIDIA_API_KEY=$NVIDIA_API_KEY \
|
||||
llamastack/distribution-{{ name }} \
|
||||
--port $LLAMA_STACK_PORT
|
||||
```
|
||||
|
||||
**Note**: The run configuration must be mounted into the container before it can be used. The `-v` flag mounts your local file into the container, and the `RUN_CONFIG_PATH` environment variable tells the entrypoint script which configuration to use.
|
||||
|
||||
{% if run_configs %}
|
||||
Available run configurations for this distribution:
|
||||
{% for config in run_configs %}
|
||||
- `{{ config }}`
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
|
||||
### Via venv
|
||||
|
||||
If you've set up your local development environment, you can also install the distribution dependencies using your local virtual environment.
|
||||
|
|
|
|||
|
|
@ -111,6 +111,9 @@ storage:
|
|||
conversations:
|
||||
table_name: openai_conversations
|
||||
backend: sql_default
|
||||
prompts:
|
||||
namespace: prompts
|
||||
backend: kv_default
|
||||
registered_resources:
|
||||
models:
|
||||
- metadata: {}
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff.