Merge branch 'main' into llama_stack_how_to_documentation

Omar Abdelwahab 2025-11-26 12:34:19 -08:00 committed by GitHub
commit 3661d9f150
3489 changed files with 710200 additions and 535689 deletions

@@ -5,7 +5,7 @@ omit =
*/llama_stack/templates/*
.venv/*
*/llama_stack/cli/scripts/*
*/llama_stack/ui/*
*/llama_stack_ui/*
*/llama_stack/distribution/ui/*
*/llama_stack/strong_typing/*
*/llama_stack/env.py

.dockerignore (new file, 19 lines)
@@ -0,0 +1,19 @@
.venv
__pycache__
*.pyc
*.pyo
*.pyd
*.so
.git
.gitignore
htmlcov*
.coverage
coverage*
.cache
.mypy_cache
.pytest_cache
.ruff_cache
uv.lock
node_modules
build
/tmp

.gitattributes (vendored, new file, 1 line)
@@ -0,0 +1 @@
tests/**/recordings/** linguist-generated=true

.github/CODEOWNERS (vendored, 2 lines changed)
@@ -2,4 +2,4 @@
# These owners will be the default owners for everything in
# the repo. Unless a later match takes precedence,
* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning @reluctantfuturist @mattf @slekkala1 @franciscojavierarceo
* @ashwinb @raghotham @ehhuang @leseb @bbrowning @mattf @franciscojavierarceo @cdoern

@@ -0,0 +1,60 @@
name: Install llama-stack-client
description: Install llama-stack-client based on branch context and client-version input
inputs:
client-version:
description: 'Client version to install on non-release branches (latest or published). Ignored on release branches.'
required: false
default: ""
outputs:
uv-extra-index-url:
description: 'UV_EXTRA_INDEX_URL to use (set for release branches)'
value: ${{ steps.configure.outputs.uv-extra-index-url }}
install-after-sync:
description: 'Whether to install client after uv sync'
value: ${{ steps.configure.outputs.install-after-sync }}
install-source:
description: 'Where to install client from after sync'
value: ${{ steps.configure.outputs.install-source }}
runs:
using: "composite"
steps:
- name: Configure client installation
id: configure
shell: bash
run: |
# Determine the branch we're working with
BRANCH="${{ github.base_ref || github.ref }}"
BRANCH="${BRANCH#refs/heads/}"
echo "Working with branch: $BRANCH"
# On release branches: use test.pypi for uv sync, then install from git
# On non-release branches: install based on client-version after sync
if [[ "$BRANCH" =~ ^release-[0-9]+\.[0-9]+\.x$ ]]; then
echo "Detected release branch: $BRANCH"
# Check if matching branch exists in client repo
if ! git ls-remote --exit-code --heads https://github.com/llamastack/llama-stack-client-python.git "$BRANCH" > /dev/null 2>&1; then
echo "::error::Branch $BRANCH not found in llama-stack-client-python repository"
echo "::error::Please create the matching release branch in llama-stack-client-python before testing"
exit 1
fi
# Configure to use test.pypi as extra index (PyPI is primary)
echo "uv-extra-index-url=https://test.pypi.org/simple/" >> $GITHUB_OUTPUT
echo "install-after-sync=true" >> $GITHUB_OUTPUT
echo "install-source=git+https://github.com/llamastack/llama-stack-client-python.git@$BRANCH" >> $GITHUB_OUTPUT
elif [ "${{ inputs.client-version }}" = "latest" ]; then
# Install from main git after sync
echo "install-after-sync=true" >> $GITHUB_OUTPUT
echo "install-source=git+https://github.com/llamastack/llama-stack-client-python.git@main" >> $GITHUB_OUTPUT
elif [ "${{ inputs.client-version }}" = "published" ]; then
# Use published version from PyPI (installed by sync)
echo "install-after-sync=false" >> $GITHUB_OUTPUT
elif [ -n "${{ inputs.client-version }}" ]; then
echo "::error::Invalid client-version: ${{ inputs.client-version }}"
exit 1
fi
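For orientation, here is a minimal sketch of how a workflow step would consume this composite action, mirroring the pattern the setup-runner action and the pre-commit workflow use later in this commit (the step names and the 'latest' value are illustrative assumptions):

    - name: Configure client installation
      id: client-config
      uses: ./.github/actions/install-llama-stack-client
      with:
        client-version: 'latest'
    - name: Install dependencies
      env:
        UV_EXTRA_INDEX_URL: ${{ steps.client-config.outputs.uv-extra-index-url }}
      run: |
        # On release branches the action points uv at test.pypi via the extra index
        if [ -n "$UV_EXTRA_INDEX_URL" ]; then
          export UV_INDEX_STRATEGY=unsafe-best-match
        fi
        uv sync --all-groups
        # Install a specific client build only when the action asked for it
        if [ "${{ steps.client-config.outputs.install-after-sync }}" = "true" ]; then
          uv pip install ${{ steps.client-config.outputs.install-source }}
        fi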

@@ -54,6 +54,10 @@ runs:
SCRIPT_ARGS="$SCRIPT_ARGS --pattern ${{ inputs.pattern }}"
fi
echo "=== Running command ==="
echo "uv run --no-sync ./scripts/integration-tests.sh $SCRIPT_ARGS"
echo ""
uv run --no-sync ./scripts/integration-tests.sh $SCRIPT_ARGS | tee pytest-${{ inputs.inference-mode }}.log
@@ -62,13 +66,14 @@ runs:
shell: bash
run: |
echo "Checking for recording changes"
git status --porcelain tests/integration/recordings/
git status --porcelain tests/integration/
if [[ -n $(git status --porcelain tests/integration/recordings/) ]]; then
if [[ -n $(git status --porcelain tests/integration/) ]]; then
echo "New recordings detected, committing and pushing"
git add tests/integration/recordings/
git add tests/integration/
git commit -m "Recordings update from CI (setup: ${{ inputs.setup }}, suite: ${{ inputs.suite }})"
git commit -m "Recordings update from CI (suite: ${{ inputs.suite }})"
git fetch origin ${{ github.ref_name }}
git rebase origin/${{ github.ref_name }}
echo "Rebased successfully"
@@ -78,17 +83,21 @@ runs:
echo "No recording changes"
fi
- name: Write inference logs to file
- name: Write docker logs to file
if: ${{ always() }}
shell: bash
run: |
sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log || true
# Ollama logs (if ollama container exists)
sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log 2>&1 || true
# vllm logs (if vllm container exists)
sudo docker logs vllm > vllm-${{ inputs.inference-mode }}.log 2>&1 || true
# Note: distro container logs are now dumped in integration-tests.sh before container is removed
- name: Upload logs
if: ${{ always() }}
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: logs-${{ github.run_id }}-${{ github.run_attempt || '' }}-${{ strategy.job-index }}
name: logs-${{ github.run_id }}-${{ github.run_attempt || '1' }}-${{ strategy.job-index || github.job }}-${{ github.action }}
path: |
*.log
retention-days: 1

@@ -18,25 +18,35 @@ runs:
python-version: ${{ inputs.python-version }}
version: 0.7.6
- name: Configure client installation
id: client-config
uses: ./.github/actions/install-llama-stack-client
with:
client-version: ${{ inputs.client-version }}
- name: Install dependencies
shell: bash
env:
UV_EXTRA_INDEX_URL: ${{ steps.client-config.outputs.uv-extra-index-url }}
run: |
# Export UV env vars for current step and persist to GITHUB_ENV for subsequent steps
if [ -n "$UV_EXTRA_INDEX_URL" ]; then
export UV_INDEX_STRATEGY=unsafe-best-match
echo "UV_EXTRA_INDEX_URL=$UV_EXTRA_INDEX_URL" >> $GITHUB_ENV
echo "UV_INDEX_STRATEGY=$UV_INDEX_STRATEGY" >> $GITHUB_ENV
echo "Exported UV environment variables for current and subsequent steps"
fi
echo "Updating project dependencies via uv sync"
uv sync --all-groups
echo "Installing ad-hoc dependencies"
uv pip install faiss-cpu
# Install llama-stack-client-python based on the client-version input
if [ "${{ inputs.client-version }}" = "latest" ]; then
echo "Installing latest llama-stack-client-python from main branch"
uv pip install git+https://github.com/llamastack/llama-stack-client-python.git@main
elif [ "${{ inputs.client-version }}" = "published" ]; then
echo "Installing published llama-stack-client-python from PyPI"
uv pip install llama-stack-client
else
echo "Invalid client-version: ${{ inputs.client-version }}"
exit 1
# Install specific client version after sync if needed
if [ "${{ steps.client-config.outputs.install-after-sync }}" = "true" ]; then
echo "Installing llama-stack-client from: ${{ steps.client-config.outputs.install-source }}"
uv pip install ${{ steps.client-config.outputs.install-source }}
fi
echo "Installed llama packages"

@@ -39,25 +39,40 @@ runs:
if: ${{ inputs.setup == 'vllm' && inputs.inference-mode == 'record' }}
uses: ./.github/actions/setup-vllm
- name: Start Postgres service
if: ${{ contains(inputs.setup, 'postgres') }}
shell: bash
run: |
sudo docker rm -f postgres-ci || true
sudo docker run -d --name postgres-ci \
-e POSTGRES_USER=llamastack \
-e POSTGRES_PASSWORD=llamastack \
-e POSTGRES_DB=llamastack \
-p 5432:5432 \
postgres:16
echo "Waiting for Postgres to become ready..."
for i in {1..30}; do
if sudo docker exec postgres-ci pg_isready -U llamastack -d llamastack >/dev/null 2>&1; then
echo "Postgres is ready"
break
fi
if [ "$i" -eq 30 ]; then
echo "Postgres failed to start in time"
sudo docker logs postgres-ci || true
exit 1
fi
sleep 2
done
- name: Build Llama Stack
shell: bash
run: |
# Install llama-stack-client-python based on the client-version input
if [ "${{ inputs.client-version }}" = "latest" ]; then
echo "Installing latest llama-stack-client-python from main branch"
export LLAMA_STACK_CLIENT_DIR=git+https://github.com/llamastack/llama-stack-client-python.git@main
elif [ "${{ inputs.client-version }}" = "published" ]; then
echo "Installing published llama-stack-client-python from PyPI"
unset LLAMA_STACK_CLIENT_DIR
else
echo "Invalid client-version: ${{ inputs.client-version }}"
exit 1
fi
# Client is already installed by setup-runner (handles both main and release branches)
echo "Building Llama Stack"
LLAMA_STACK_DIR=. \
uv run --no-sync llama stack build --template ci-tests --image-type venv
uv run --no-sync llama stack list-deps ci-tests | xargs -L1 uv pip install
- name: Configure git for commits
shell: bash

@@ -0,0 +1,35 @@
name: Setup TypeScript client
description: Conditionally checkout and link llama-stack-client-typescript based on client-version
inputs:
client-version:
description: 'Client version (latest or published)'
required: true
outputs:
ts-client-path:
description: 'Path or version to use for TypeScript client'
value: ${{ steps.set-path.outputs.ts-client-path }}
runs:
using: "composite"
steps:
- name: Checkout TypeScript client (latest)
if: ${{ inputs.client-version == 'latest' }}
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
repository: llamastack/llama-stack-client-typescript
ref: main
path: .ts-client-checkout
- name: Set TS_CLIENT_PATH
id: set-path
shell: bash
run: |
if [ "${{ inputs.client-version }}" = "latest" ]; then
echo "ts-client-path=${{ github.workspace }}/.ts-client-checkout" >> $GITHUB_OUTPUT
elif [ "${{ inputs.client-version }}" = "published" ]; then
echo "ts-client-path=^0.3.2" >> $GITHUB_OUTPUT
else
echo "::error::Invalid client-version: ${{ inputs.client-version }}"
exit 1
fi
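As a rough usage sketch (assumed values; the integration-tests workflow below wires it up the same way), a server-mode test job would consume this action like so:

    - name: Setup TypeScript client
      id: setup-ts-client
      uses: ./.github/actions/setup-typescript-client
      with:
        client-version: 'published'
    - name: Run TypeScript client tests
      env:
        # Either a local checkout path (latest) or a version range such as ^0.3.2 (published)
        TS_CLIENT_PATH: ${{ steps.setup-ts-client.outputs.ts-client-path || '' }}
      run: |
        echo "Using TypeScript client: $TS_CLIENT_PATH"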

@@ -11,13 +11,14 @@ runs:
--name vllm \
-p 8000:8000 \
--privileged=true \
quay.io/higginsd/vllm-cpu:65393ee064 \
quay.io/higginsd/vllm-cpu:65393ee064-qwen3 \
--host 0.0.0.0 \
--port 8000 \
--enable-auto-tool-choice \
--tool-call-parser llama3_json \
--model /root/.cache/Llama-3.2-1B-Instruct \
--served-model-name meta-llama/Llama-3.2-1B-Instruct
--tool-call-parser hermes \
--model /root/.cache/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
--max-model-len 8192
# Wait for vllm to be ready
echo "Waiting for vllm to be ready..."

@@ -22,7 +22,7 @@ updates:
prefix: chore(python-deps)
- package-ecosystem: npm
directory: "/llama_stack/ui"
directory: "/llama_stack_ui"
schedule:
interval: "weekly"
day: "saturday"

.github/mergify.yml (vendored, new file, 23 lines)
@@ -0,0 +1,23 @@
pull_request_rules:
- name: ping author on conflicts and add 'needs-rebase' label
conditions:
- conflict
- -closed
actions:
label:
add:
- needs-rebase
comment:
message: >
This pull request has merge conflicts that must be resolved before it
can be merged. @{{author}} please rebase it.
https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
- name: remove 'needs-rebase' label when conflict is resolved
conditions:
- -conflict
- -closed
actions:
label:
remove:
- needs-rebase

@@ -4,7 +4,7 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a table
| Name | File | Purpose |
| ---- | ---- | ------- |
| Update Changelog | [changelog.yml](changelog.yml) | Creates PR for updating the CHANGELOG.md |
| Backward Compatibility Check | [backward-compat.yml](backward-compat.yml) | Check backward compatibility for run.yaml configs |
| API Conformance Tests | [conformance.yml](conformance.yml) | Run the API Conformance test suite on the changes. |
| Installer CI | [install-script-ci.yml](install-script-ci.yml) | Test the installation script |
| Integration Auth Tests | [integration-auth-tests.yml](integration-auth-tests.yml) | Run the integration test suite with Kubernetes authentication |
@@ -12,11 +12,12 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a table
| Integration Tests (Replay) | [integration-tests.yml](integration-tests.yml) | Run the integration test suites from tests/integration in replay mode |
| Vector IO Integration Tests | [integration-vector-io-tests.yml](integration-vector-io-tests.yml) | Run the integration test suite with various VectorIO providers |
| Pre-commit | [pre-commit.yml](pre-commit.yml) | Run pre-commit checks |
| Pre-commit Bot | [precommit-trigger.yml](precommit-trigger.yml) | Pre-commit bot for PR |
| Test Llama Stack Build | [providers-build.yml](providers-build.yml) | Test llama stack build |
| Test llama stack list-deps | [providers-list-deps.yml](providers-list-deps.yml) | Test llama stack list-deps |
| Python Package Build Test | [python-build-test.yml](python-build-test.yml) | Test building the llama-stack PyPI project |
| Integration Tests (Record) | [record-integration-tests.yml](record-integration-tests.yml) | Run the integration test suite from tests/integration |
| Check semantic PR titles | [semantic-pr.yml](semantic-pr.yml) | Ensure that PR titles follow the conventional commit spec |
| Stainless SDK Builds | [stainless-builds.yml](stainless-builds.yml) | Build Stainless SDK from OpenAPI spec changes |
| Close stale issues and PRs | [stale_bot.yml](stale_bot.yml) | Run the Stale Bot action |
| Test External Providers Installed via Module | [test-external-provider-module.yml](test-external-provider-module.yml) | Test External Provider installation via Python module |
| Test External API and Providers | [test-external.yml](test-external.yml) | Test the External API and Provider mechanisms |

.github/workflows/backward-compat.yml (vendored, new file, 578 lines)
@@ -0,0 +1,578 @@
name: Backward Compatibility Check
run-name: Check backward compatibility for run.yaml configs
on:
pull_request:
branches:
- main
- 'release-[0-9]+.[0-9]+.[0-9]+.[0-9]+'
- 'release-[0-9]+.[0-9]+.[0-9]+'
- 'release-[0-9]+.[0-9]+'
paths:
- 'src/llama_stack/core/datatypes.py'
- 'src/llama_stack/providers/datatypes.py'
- 'src/llama_stack/distributions/**/run.yaml'
- 'tests/backward_compat/**'
- '.github/workflows/backward-compat.yml'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
check-main-compatibility:
name: Check Compatibility with main
runs-on: ubuntu-latest
steps:
- name: Checkout PR branch
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
with:
fetch-depth: 0 # Need full history to access main branch
- name: Set up Python
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
python-version: '3.12'
- name: Install uv
uses: astral-sh/setup-uv@1e862dfacbd1d6d858c55d9b792c756523627244 # v7.1.4
with:
enable-cache: true
- name: Install dependencies
run: |
uv sync --group dev
- name: Extract run.yaml files from main branch
id: extract_configs
run: |
# Get list of run.yaml paths from main
git fetch origin main
CONFIG_PATHS=$(git ls-tree -r --name-only origin/main | grep "src/llama_stack/distributions/.*/run.yaml$" || true)
if [ -z "$CONFIG_PATHS" ]; then
echo "No run.yaml files found in main branch"
exit 1
fi
# Extract all configs to a temp directory
mkdir -p /tmp/main_configs
echo "Extracting configs from main branch:"
while IFS= read -r config_path; do
if [ -z "$config_path" ]; then
continue
fi
# Extract filename for storage
filename=$(basename $(dirname "$config_path"))
echo " - $filename (from $config_path)"
git show origin/main:"$config_path" > "/tmp/main_configs/${filename}.yaml"
done <<< "$CONFIG_PATHS"
echo ""
echo "Extracted $(ls /tmp/main_configs/*.yaml | wc -l) config files"
- name: Test all configs from main
id: test_configs
continue-on-error: true
run: |
# Run pytest once with all configs parameterized
if COMPAT_TEST_CONFIGS_DIR=/tmp/main_configs uv run pytest tests/backward_compat/test_run_config.py -v; then
echo "failed=false" >> $GITHUB_OUTPUT
else
echo "failed=true" >> $GITHUB_OUTPUT
exit 1
fi
- name: Check for breaking change acknowledgment
id: check_ack
if: steps.test_configs.outputs.failed == 'true'
run: |
echo "Breaking changes detected. Checking for acknowledgment..."
# Check PR title for '!:' marker (conventional commits)
PR_TITLE="${{ github.event.pull_request.title }}"
if [[ "$PR_TITLE" =~ ^[a-z]+\!: ]]; then
echo "✓ Breaking change acknowledged in PR title"
echo "acknowledged=true" >> $GITHUB_OUTPUT
exit 0
fi
# Check commit messages for BREAKING CHANGE:
if git log origin/main..HEAD --format=%B | grep -q "BREAKING CHANGE:"; then
echo "✓ Breaking change acknowledged in commit message"
echo "acknowledged=true" >> $GITHUB_OUTPUT
exit 0
fi
echo "✗ Breaking change NOT acknowledged"
echo "acknowledged=false" >> $GITHUB_OUTPUT
env:
GH_TOKEN: ${{ github.token }}
- name: Evaluate results
if: always()
run: |
FAILED="${{ steps.test_configs.outputs.failed }}"
ACKNOWLEDGED="${{ steps.check_ack.outputs.acknowledged }}"
if [[ "$FAILED" == "true" ]]; then
if [[ "$ACKNOWLEDGED" == "true" ]]; then
echo ""
echo "⚠️ WARNING: Breaking changes detected but acknowledged"
echo ""
echo "This PR introduces backward-incompatible changes to run.yaml."
echo "The changes have been properly acknowledged."
echo ""
exit 0 # Pass the check
else
echo ""
echo "❌ ERROR: Breaking changes detected without acknowledgment"
echo ""
echo "This PR introduces backward-incompatible changes to run.yaml"
echo "that will break existing user configurations."
echo ""
echo "To acknowledge this breaking change, do ONE of:"
echo " 1. Add '!:' to your PR title (e.g., 'feat!: change xyz')"
echo " 2. Add the 'breaking-change' label to this PR"
echo " 3. Include 'BREAKING CHANGE:' in a commit message"
echo ""
exit 1 # Fail the check
fi
fi
test-integration-main:
name: Run Integration Tests with main Config
runs-on: ubuntu-latest
steps:
- name: Checkout PR branch
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
with:
fetch-depth: 0
- name: Extract ci-tests run.yaml from main
run: |
git fetch origin main
git show origin/main:src/llama_stack/distributions/ci-tests/run.yaml > /tmp/main-ci-tests-run.yaml
echo "Extracted ci-tests run.yaml from main branch"
- name: Setup test environment
uses: ./.github/actions/setup-test-environment
with:
python-version: '3.12'
client-version: 'latest'
setup: 'ollama'
suite: 'base'
inference-mode: 'replay'
- name: Run integration tests with main config
id: test_integration
continue-on-error: true
uses: ./.github/actions/run-and-record-tests
with:
stack-config: /tmp/main-ci-tests-run.yaml
setup: 'ollama'
inference-mode: 'replay'
suite: 'base'
- name: Check for breaking change acknowledgment
id: check_ack
if: steps.test_integration.outcome == 'failure'
run: |
echo "Integration tests failed. Checking for acknowledgment..."
# Check PR title for '!:' marker (conventional commits)
PR_TITLE="${{ github.event.pull_request.title }}"
if [[ "$PR_TITLE" =~ ^[a-z]+\!: ]]; then
echo "✓ Breaking change acknowledged in PR title"
echo "acknowledged=true" >> $GITHUB_OUTPUT
exit 0
fi
# Check commit messages for BREAKING CHANGE:
if git log origin/main..HEAD --format=%B | grep -q "BREAKING CHANGE:"; then
echo "✓ Breaking change acknowledged in commit message"
echo "acknowledged=true" >> $GITHUB_OUTPUT
exit 0
fi
echo "✗ Breaking change NOT acknowledged"
echo "acknowledged=false" >> $GITHUB_OUTPUT
env:
GH_TOKEN: ${{ github.token }}
- name: Evaluate integration test results
if: always()
run: |
TEST_FAILED="${{ steps.test_integration.outcome == 'failure' }}"
ACKNOWLEDGED="${{ steps.check_ack.outputs.acknowledged }}"
if [[ "$TEST_FAILED" == "true" ]]; then
if [[ "$ACKNOWLEDGED" == "true" ]]; then
echo ""
echo "⚠️ WARNING: Integration tests failed with main config but acknowledged"
echo ""
exit 0 # Pass the check
else
echo ""
echo "❌ ERROR: Integration tests failed with main config without acknowledgment"
echo ""
echo "To acknowledge this breaking change, do ONE of:"
echo " 1. Add '!:' to your PR title (e.g., 'feat!: change xyz')"
echo " 2. Include 'BREAKING CHANGE:' in a commit message"
echo ""
exit 1 # Fail the check
fi
fi
test-integration-release:
name: Run Integration Tests with Latest Release (Informational)
runs-on: ubuntu-latest
steps:
- name: Checkout PR branch
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
with:
fetch-depth: 0
- name: Get latest release
id: get_release
run: |
# Get the latest release from GitHub
LATEST_TAG=$(gh release list --limit 1 --json tagName --jq '.[0].tagName' 2>/dev/null || echo "")
if [ -z "$LATEST_TAG" ]; then
echo "No releases found, skipping release compatibility check"
echo "has_release=false" >> $GITHUB_OUTPUT
exit 0
fi
echo "Latest release: $LATEST_TAG"
echo "has_release=true" >> $GITHUB_OUTPUT
echo "tag=$LATEST_TAG" >> $GITHUB_OUTPUT
env:
GH_TOKEN: ${{ github.token }}
- name: Extract ci-tests run.yaml from release
if: steps.get_release.outputs.has_release == 'true'
id: extract_config
run: |
RELEASE_TAG="${{ steps.get_release.outputs.tag }}"
# Try with src/ prefix first (newer releases), then without (older releases)
if git show "$RELEASE_TAG:src/llama_stack/distributions/ci-tests/run.yaml" > /tmp/release-ci-tests-run.yaml 2>/dev/null; then
echo "Extracted ci-tests run.yaml from release $RELEASE_TAG (src/ path)"
echo "has_config=true" >> $GITHUB_OUTPUT
elif git show "$RELEASE_TAG:llama_stack/distributions/ci-tests/run.yaml" > /tmp/release-ci-tests-run.yaml 2>/dev/null; then
echo "Extracted ci-tests run.yaml from release $RELEASE_TAG (old path)"
echo "has_config=true" >> $GITHUB_OUTPUT
else
echo "::warning::ci-tests/run.yaml not found in release $RELEASE_TAG"
echo "has_config=false" >> $GITHUB_OUTPUT
fi
- name: Setup test environment
if: steps.get_release.outputs.has_release == 'true' && steps.extract_config.outputs.has_config == 'true'
uses: ./.github/actions/setup-test-environment
with:
python-version: '3.12'
client-version: 'latest'
setup: 'ollama'
suite: 'base'
inference-mode: 'replay'
- name: Run integration tests with release config (PR branch)
id: test_release_pr
if: steps.get_release.outputs.has_release == 'true' && steps.extract_config.outputs.has_config == 'true'
continue-on-error: true
uses: ./.github/actions/run-and-record-tests
with:
stack-config: /tmp/release-ci-tests-run.yaml
setup: 'ollama'
inference-mode: 'replay'
suite: 'base'
- name: Checkout main branch to test baseline
if: steps.get_release.outputs.has_release == 'true' && steps.extract_config.outputs.has_config == 'true'
run: |
git checkout origin/main
- name: Setup test environment for main
if: steps.get_release.outputs.has_release == 'true' && steps.extract_config.outputs.has_config == 'true'
uses: ./.github/actions/setup-test-environment
with:
python-version: '3.12'
client-version: 'latest'
setup: 'ollama'
suite: 'base'
inference-mode: 'replay'
- name: Run integration tests with release config (main branch)
id: test_release_main
if: steps.get_release.outputs.has_release == 'true' && steps.extract_config.outputs.has_config == 'true'
continue-on-error: true
uses: ./.github/actions/run-and-record-tests
with:
stack-config: /tmp/release-ci-tests-run.yaml
setup: 'ollama'
inference-mode: 'replay'
suite: 'base'
- name: Report results and post PR comment
if: always() && steps.get_release.outputs.has_release == 'true' && steps.extract_config.outputs.has_config == 'true'
run: |
RELEASE_TAG="${{ steps.get_release.outputs.tag }}"
PR_OUTCOME="${{ steps.test_release_pr.outcome }}"
MAIN_OUTCOME="${{ steps.test_release_main.outcome }}"
if [[ "$PR_OUTCOME" == "failure" && "$MAIN_OUTCOME" == "success" ]]; then
# NEW breaking change - PR fails but main passes
echo "::error::🚨 This PR introduces a NEW breaking change!"
# Check if we already posted a comment (to avoid spam on every push)
EXISTING_COMMENT=$(gh pr view ${{ github.event.pull_request.number }} --json comments --jq '.comments[] | select(.body | contains("🚨 New Breaking Change Detected") and contains("Integration tests")) | .id' | head -1)
if [[ -z "$EXISTING_COMMENT" ]]; then
gh pr comment ${{ github.event.pull_request.number }} --body "## 🚨 New Breaking Change Detected
**Integration tests against release \`$RELEASE_TAG\` are now failing**
⚠️ This PR introduces a breaking change that affects compatibility with the latest release.
- Users on release \`$RELEASE_TAG\` may not be able to upgrade
- Existing configurations may break
The tests pass on \`main\` but fail with this PR's changes.
> **Note:** This is informational only and does not block merge.
> Consider whether this breaking change is acceptable for users."
else
echo "Comment already exists, skipping to avoid spam"
fi
cat >> $GITHUB_STEP_SUMMARY <<EOF
## 🚨 NEW Breaking Change Detected
**Integration tests against release \`$RELEASE_TAG\` FAILED**
⚠️ **This PR introduces a NEW breaking change**
- Tests **PASS** on main branch ✅
- Tests **FAIL** on PR branch ❌
- Users on release \`$RELEASE_TAG\` may not be able to upgrade
- Existing configurations may break
> **Note:** This is informational only and does not block merge.
> Consider whether this breaking change is acceptable for users.
EOF
elif [[ "$PR_OUTCOME" == "failure" ]]; then
# Existing breaking change - both PR and main fail
echo "::warning::Breaking change already exists in main branch"
cat >> $GITHUB_STEP_SUMMARY <<EOF
## ⚠️ Release Compatibility Test Failed (Existing Issue)
**Integration tests against release \`$RELEASE_TAG\` FAILED**
- Tests **FAIL** on main branch ❌
- Tests **FAIL** on PR branch ❌
- This breaking change already exists in main (not introduced by this PR)
> **Note:** This is informational only.
EOF
else
# Success - tests pass
cat >> $GITHUB_STEP_SUMMARY <<EOF
## ✅ Release Compatibility Test Passed
Integration tests against release \`$RELEASE_TAG\` passed successfully.
This PR maintains compatibility with the latest release.
EOF
fi
env:
GH_TOKEN: ${{ github.token }}
check-schema-release-compatibility:
name: Check Schema Compatibility with Latest Release (Informational)
runs-on: ubuntu-latest
steps:
- name: Checkout PR branch
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
with:
fetch-depth: 0
- name: Set up Python
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
python-version: '3.12'
- name: Install uv
uses: astral-sh/setup-uv@1e862dfacbd1d6d858c55d9b792c756523627244 # v7.1.4
with:
enable-cache: true
- name: Install dependencies
run: |
uv sync --group dev
- name: Get latest release
id: get_release
run: |
# Get the latest release from GitHub
LATEST_TAG=$(gh release list --limit 1 --json tagName --jq '.[0].tagName' 2>/dev/null || echo "")
if [ -z "$LATEST_TAG" ]; then
echo "No releases found, skipping release compatibility check"
echo "has_release=false" >> $GITHUB_OUTPUT
exit 0
fi
echo "Latest release: $LATEST_TAG"
echo "has_release=true" >> $GITHUB_OUTPUT
echo "tag=$LATEST_TAG" >> $GITHUB_OUTPUT
env:
GH_TOKEN: ${{ github.token }}
- name: Extract configs from release
if: steps.get_release.outputs.has_release == 'true'
id: extract_release_configs
run: |
RELEASE_TAG="${{ steps.get_release.outputs.tag }}"
# Get run.yaml files from the release (try both src/ and old path)
CONFIG_PATHS=$(git ls-tree -r --name-only "$RELEASE_TAG" | grep "llama_stack/distributions/.*/run.yaml$" || true)
if [ -z "$CONFIG_PATHS" ]; then
echo "::warning::No run.yaml files found in release $RELEASE_TAG"
echo "has_configs=false" >> $GITHUB_OUTPUT
exit 0
fi
# Extract all configs to a temp directory
mkdir -p /tmp/release_configs
echo "Extracting configs from release $RELEASE_TAG:"
while IFS= read -r config_path; do
if [ -z "$config_path" ]; then
continue
fi
filename=$(basename $(dirname "$config_path"))
echo " - $filename (from $config_path)"
git show "$RELEASE_TAG:$config_path" > "/tmp/release_configs/${filename}.yaml" 2>/dev/null || true
done <<< "$CONFIG_PATHS"
echo ""
echo "Extracted $(ls /tmp/release_configs/*.yaml 2>/dev/null | wc -l) config files"
echo "has_configs=true" >> $GITHUB_OUTPUT
- name: Test against release configs (PR branch)
id: test_schema_pr
if: steps.get_release.outputs.has_release == 'true' && steps.extract_release_configs.outputs.has_configs == 'true'
continue-on-error: true
run: |
RELEASE_TAG="${{ steps.get_release.outputs.tag }}"
COMPAT_TEST_CONFIGS_DIR=/tmp/release_configs uv run pytest tests/backward_compat/test_run_config.py -v --tb=short
- name: Checkout main branch to test baseline
if: steps.get_release.outputs.has_release == 'true' && steps.extract_release_configs.outputs.has_configs == 'true'
run: |
git checkout origin/main
- name: Install dependencies for main
if: steps.get_release.outputs.has_release == 'true' && steps.extract_release_configs.outputs.has_configs == 'true'
run: |
uv sync --group dev
- name: Test against release configs (main branch)
id: test_schema_main
if: steps.get_release.outputs.has_release == 'true' && steps.extract_release_configs.outputs.has_configs == 'true'
continue-on-error: true
run: |
RELEASE_TAG="${{ steps.get_release.outputs.tag }}"
COMPAT_TEST_CONFIGS_DIR=/tmp/release_configs uv run pytest tests/backward_compat/test_run_config.py -v --tb=short
- name: Report results and post PR comment
if: always() && steps.get_release.outputs.has_release == 'true' && steps.extract_release_configs.outputs.has_configs == 'true'
run: |
RELEASE_TAG="${{ steps.get_release.outputs.tag }}"
PR_OUTCOME="${{ steps.test_schema_pr.outcome }}"
MAIN_OUTCOME="${{ steps.test_schema_main.outcome }}"
if [[ "$PR_OUTCOME" == "failure" && "$MAIN_OUTCOME" == "success" ]]; then
# NEW breaking change - PR fails but main passes
echo "::error::🚨 This PR introduces a NEW schema breaking change!"
# Check if we already posted a comment (to avoid spam on every push)
EXISTING_COMMENT=$(gh pr view ${{ github.event.pull_request.number }} --json comments --jq '.comments[] | select(.body | contains("🚨 New Schema Breaking Change Detected")) | .id' | head -1)
if [[ -z "$EXISTING_COMMENT" ]]; then
gh pr comment ${{ github.event.pull_request.number }} --body "## 🚨 New Schema Breaking Change Detected
**Schema validation against release \`$RELEASE_TAG\` is now failing**
⚠️ This PR introduces a schema breaking change that affects compatibility with the latest release.
- Users on release \`$RELEASE_TAG\` will not be able to upgrade
- Existing run.yaml configurations will fail validation
The tests pass on \`main\` but fail with this PR's changes.
> **Note:** This is informational only and does not block merge.
> Consider whether this breaking change is acceptable for users."
else
echo "Comment already exists, skipping to avoid spam"
fi
cat >> $GITHUB_STEP_SUMMARY <<EOF
## 🚨 NEW Schema Breaking Change Detected
**Schema validation against release \`$RELEASE_TAG\` FAILED**
⚠️ **This PR introduces a NEW schema breaking change**
- Tests **PASS** on main branch ✅
- Tests **FAIL** on PR branch ❌
- Users on release \`$RELEASE_TAG\` will not be able to upgrade
- Existing run.yaml configurations will fail validation
> **Note:** This is informational only and does not block merge.
> Consider whether this breaking change is acceptable for users.
EOF
elif [[ "$PR_OUTCOME" == "failure" ]]; then
# Existing breaking change - both PR and main fail
echo "::warning::Schema breaking change already exists in main branch"
cat >> $GITHUB_STEP_SUMMARY <<EOF
## ⚠️ Release Schema Compatibility Failed (Existing Issue)
**Schema validation against release \`$RELEASE_TAG\` FAILED**
- Tests **FAIL** on main branch ❌
- Tests **FAIL** on PR branch ❌
- This schema breaking change already exists in main (not introduced by this PR)
> **Note:** This is informational only.
EOF
else
# Success - tests pass
cat >> $GITHUB_STEP_SUMMARY <<EOF
## ✅ Release Schema Compatibility Passed
All run.yaml configs from release \`$RELEASE_TAG\` are compatible.
This PR maintains backward compatibility with the latest release.
EOF
fi
env:
GH_TOKEN: ${{ github.token }}

@@ -1,31 +0,0 @@
name: Update Changelog
run-name: Creates PR for updating the CHANGELOG.md
on:
release:
types: [published, unpublished, created, edited, deleted, released]
permissions:
contents: read
jobs:
generate_changelog:
name: Generate changelog
permissions:
contents: write # for peter-evans/create-pull-request to create branch
pull-requests: write # for peter-evans/create-pull-request to create a PR
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
ref: main
fetch-depth: 0
- run: |
python ./scripts/gen-changelog.py
- uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # v7.0.8
with:
title: 'docs: update CHANGELOG.md for ${{ github.ref_name }}'
commit-message: 'docs: update CHANGELOG.md for ${{ github.ref_name }}'
branch: create-pull-request/changelog
signoff: true

@@ -22,7 +22,6 @@ on:
- 'docs/static/stable-llama-stack-spec.yaml' # Stable APIs spec
- 'docs/static/experimental-llama-stack-spec.yaml' # Experimental APIs spec
- 'docs/static/deprecated-llama-stack-spec.yaml' # Deprecated APIs spec
- 'docs/static/llama-stack-spec.html' # Legacy HTML spec
- '.github/workflows/conformance.yml' # This workflow itself
concurrency:
@@ -36,16 +35,16 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout PR Code
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
with:
fetch-depth: 0
# Check if we should skip conformance testing due to breaking changes
- name: Check if conformance test should be skipped
id: skip-check
env:
PR_TITLE: ${{ github.event.pull_request.title }}
run: |
PR_TITLE="${{ github.event.pull_request.title }}"
# Skip if title contains "!:" indicating breaking change (like "feat!:")
if [[ "$PR_TITLE" == *"!:"* ]]; then
echo "skip=true" >> $GITHUB_OUTPUT
@@ -60,7 +59,7 @@ jobs:
# This allows us to diff the current changes against the previous state
- name: Checkout Base Branch
if: steps.skip-check.outputs.skip != 'true'
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
with:
ref: ${{ github.event.pull_request.base.ref }}
path: 'base'

@@ -16,22 +16,31 @@ jobs:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # 5.0.0
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # 6.0.0
- name: Run ShellCheck on install.sh
run: shellcheck scripts/install.sh
smoke-test-on-dev:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Build a single provider
run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync \
llama stack build --template starter --image-type container --image-name test
BUILD_ARGS="--build-arg INSTALL_MODE=editable --build-arg DISTRO_NAME=starter"
if [ -n "${UV_EXTRA_INDEX_URL:-}" ]; then
BUILD_ARGS="$BUILD_ARGS --build-arg UV_EXTRA_INDEX_URL=$UV_EXTRA_INDEX_URL"
fi
if [ -n "${UV_INDEX_STRATEGY:-}" ]; then
BUILD_ARGS="$BUILD_ARGS --build-arg UV_INDEX_STRATEGY=$UV_INDEX_STRATEGY"
fi
docker build . \
-f containers/Containerfile \
$BUILD_ARGS \
--tag llama-stack:starter-ci
- name: Run installer end-to-end
run: |

@@ -4,13 +4,17 @@ run-name: Run the integration test suite with Kubernetes authentication
on:
push:
branches: [ main ]
branches:
- main
- 'release-[0-9]+.[0-9]+.x'
pull_request:
branches: [ main ]
branches:
- main
- 'release-[0-9]+.[0-9]+.x'
paths:
- 'distributions/**'
- 'llama_stack/**'
- '!llama_stack/ui/**'
- 'src/llama_stack/**'
- '!src/llama_stack_ui/**'
- 'tests/integration/**'
- 'uv.lock'
- 'pyproject.toml'
@@ -31,7 +35,7 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
@@ -73,6 +77,27 @@ jobs:
image_name: kube
apis: []
providers: {}
storage:
backends:
kv_default:
type: kv_sqlite
db_path: $run_dir/kvstore.db
sql_default:
type: sql_sqlite
db_path: $run_dir/sql_store.db
stores:
metadata:
namespace: registry
backend: kv_default
inference:
table_name: inference_store
backend: sql_default
conversations:
table_name: openai_conversations
backend: sql_default
prompts:
namespace: prompts
backend: kv_default
server:
port: 8321
EOF
@@ -92,7 +117,8 @@ jobs:
run: |
echo "Waiting for Llama Stack server..."
for i in {1..30}; do
if curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://localhost:8321/v1/health | grep -q "OK"; then
# Note: /v1/health does not require authentication
if curl -s -L http://localhost:8321/v1/health | grep -q "OK"; then
echo "Llama Stack server is up!"
if grep -q "Enabling authentication with provider: ${{ matrix.auth-provider }}" server.log; then
echo "Llama Stack server is configured to use ${{ matrix.auth-provider }} auth"
@@ -111,4 +137,27 @@ jobs:
- name: Test auth
run: |
curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers|jq
echo "Testing /v1/version without token (should succeed)..."
if curl -s -L -o /dev/null -w "%{http_code}" http://127.0.0.1:8321/v1/version | grep -q "200"; then
echo "/v1/version accessible without token (200)"
else
echo "/v1/version returned non-200 status without token"
exit 1
fi
echo "Testing /v1/providers without token (should fail with 401)..."
if curl -s -L -o /dev/null -w "%{http_code}" http://127.0.0.1:8321/v1/providers | grep -q "401"; then
echo "/v1/providers blocked without token (401)"
else
echo "/v1/providers did not return 401 without token"
exit 1
fi
echo "Testing /v1/providers with valid token (should succeed)..."
curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers | jq
if [ $? -eq 0 ]; then
echo "/v1/providers accessible with valid token"
else
echo "/v1/providers failed with valid token"
exit 1
fi

@@ -4,11 +4,15 @@ run-name: Run the integration test suite with SqlStore
on:
push:
branches: [ main ]
branches:
- main
- 'release-[0-9]+.[0-9]+.x'
pull_request:
branches: [ main ]
branches:
- main
- 'release-[0-9]+.[0-9]+.x'
paths:
- 'llama_stack/providers/utils/sqlstore/**'
- 'src/llama_stack/providers/utils/sqlstore/**'
- 'tests/integration/sqlstore/**'
- 'uv.lock'
- 'pyproject.toml'
@@ -44,7 +48,7 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
@@ -64,7 +68,7 @@ jobs:
- name: Upload test logs
if: ${{ always() }}
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
with:
name: postgres-test-logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.python-version }}
path: |

@@ -4,13 +4,17 @@ run-name: Run the integration test suites from tests/integration in replay mode
on:
push:
branches: [ main ]
branches:
- main
- 'release-[0-9]+.[0-9]+.x'
pull_request:
branches: [ main ]
branches:
- main
- 'release-[0-9]+.[0-9]+.x'
types: [opened, synchronize, reopened]
paths:
- 'llama_stack/**'
- '!llama_stack/ui/**'
- 'src/llama_stack/**'
- '!src/llama_stack_ui/**'
- 'tests/**'
- 'uv.lock'
- 'pyproject.toml'
@@ -18,10 +22,11 @@ on:
- '.github/actions/setup-ollama/action.yml'
- '.github/actions/setup-test-environment/action.yml'
- '.github/actions/run-and-record-tests/action.yml'
- 'scripts/integration-tests.sh'
- 'scripts/generate_ci_matrix.py'
schedule:
# If changing the cron schedule, update the provider in the test-matrix job
- cron: '0 0 * * *' # (test latest client) Daily at 12 AM UTC
- cron: '1 0 * * 0' # (test vllm) Weekly on Sunday at 1 AM UTC
workflow_dispatch:
inputs:
test-all-client-versions:
@@ -39,36 +44,47 @@ concurrency:
cancel-in-progress: true
jobs:
generate-matrix:
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
steps:
- name: Checkout repository
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- name: Generate test matrix
id: set-matrix
run: |
# Generate matrix from CI_MATRIX in tests/integration/suites.py
# Supports schedule-based and manual input overrides
MATRIX=$(PYTHONPATH=. python3 scripts/generate_ci_matrix.py \
--schedule "${{ github.event.schedule }}" \
--test-setup "${{ github.event.inputs.test-setup }}")
echo "matrix=$MATRIX" >> $GITHUB_OUTPUT
echo "Generated matrix: $MATRIX"
run-replay-mode-tests:
needs: generate-matrix
runs-on: ubuntu-latest
name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, {4})', matrix.client-type, matrix.config.setup, matrix.python-version, matrix.client-version, matrix.config.suite) }}
name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, {4})', matrix.client, matrix.config.setup, matrix.python-version, matrix.client-version, matrix.config.suite) }}
strategy:
fail-fast: false
matrix:
client-type: [library, server]
client: [library, docker, server]
# Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
# Define (setup, suite) pairs - they are always matched and cannot be independent
# Weekly schedule (Sun 1 AM): vllm+base
# Input test-setup=ollama-vision: ollama-vision+vision
# Default (including test-setup=ollama): both ollama+base and ollama-vision+vision
config: >-
${{
github.event.schedule == '1 0 * * 0'
&& fromJSON('[{"setup": "vllm", "suite": "base"}]')
|| github.event.inputs.test-setup == 'ollama-vision'
&& fromJSON('[{"setup": "ollama-vision", "suite": "vision"}]')
|| fromJSON('[{"setup": "ollama", "suite": "base"}, {"setup": "ollama-vision", "suite": "vision"}]')
}}
# Test configurations: Generated from CI_MATRIX in tests/integration/suites.py
# See scripts/generate_ci_matrix.py for generation logic
config: ${{ fromJSON(needs.generate-matrix.outputs.matrix).include }}
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- name: Setup test environment
if: ${{ matrix.config.allowed_clients == null || contains(matrix.config.allowed_clients, matrix.client) }}
uses: ./.github/actions/setup-test-environment
with:
python-version: ${{ matrix.python-version }}
@@ -77,10 +93,33 @@ jobs:
suite: ${{ matrix.config.suite }}
inference-mode: 'replay'
- name: Run tests
uses: ./.github/actions/run-and-record-tests
- name: Setup Node.js for TypeScript client tests
if: ${{ matrix.client == 'server' }}
uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
with:
stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }}
node-version: '20'
cache: 'npm'
cache-dependency-path: tests/integration/client-typescript/package-lock.json
- name: Setup TypeScript client
if: ${{ matrix.client == 'server' }}
id: setup-ts-client
uses: ./.github/actions/setup-typescript-client
with:
client-version: ${{ matrix.client-version }}
- name: Run tests
if: ${{ matrix.config.allowed_clients == null || contains(matrix.config.allowed_clients, matrix.client) }}
uses: ./.github/actions/run-and-record-tests
env:
OPENAI_API_KEY: dummy
TS_CLIENT_PATH: ${{ steps.setup-ts-client.outputs.ts-client-path || '' }}
with:
stack-config: >-
${{ matrix.config.stack_config
|| (matrix.client == 'library' && 'ci-tests')
|| (matrix.client == 'server' && 'server:ci-tests')
|| 'docker:ci-tests' }}
setup: ${{ matrix.config.setup }}
inference-mode: 'replay'
suite: ${{ matrix.config.suite }}
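For reference, the matrix that scripts/generate_ci_matrix.py emits (as a single-line JSON string) and that fromJSON() expands above would have roughly this shape. The concrete entries and the optional allowed_clients / stack_config fields are assumptions inferred from the fields this workflow reads, not the script's actual output:

    # Hypothetical YAML rendering of the generated matrix (illustrative only)
    include:
      - setup: ollama
        suite: base
      - setup: ollama-vision
        suite: vision
      - setup: vllm
        suite: base
        allowed_clients: ["library", "server"]
        stack_config: server:ci-tests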

@@ -4,12 +4,16 @@ run-name: Run the integration test suite with various VectorIO providers
on:
push:
branches: [ main ]
branches:
- main
- 'release-[0-9]+.[0-9]+.x'
pull_request:
branches: [ main ]
branches:
- main
- 'release-[0-9]+.[0-9]+.x'
paths:
- 'llama_stack/**'
- '!llama_stack/ui/**'
- 'src/llama_stack/**'
- '!src/llama_stack_ui/**'
- 'tests/integration/vector_io/**'
- 'uv.lock'
- 'pyproject.toml'
@@ -33,7 +37,7 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
@@ -144,7 +148,7 @@ jobs:
- name: Build Llama Stack
run: |
uv run --no-sync llama stack build --template ci-tests --image-type venv
uv run --no-sync llama stack list-deps ci-tests | xargs -L1 uv pip install
- name: Check Storage and Memory Available Before Tests
if: ${{ always() }}
@@ -169,8 +173,7 @@ jobs:
run: |
uv run --no-sync \
pytest -sv --stack-config="files=inline::localfs,inference=inline::sentence-transformers,vector_io=${{ matrix.vector-io-provider }}" \
tests/integration/vector_io \
--embedding-model inline::sentence-transformers/all-MiniLM-L6-v2
tests/integration/vector_io
- name: Check Storage and Memory Available After Tests
if: ${{ always() }}
@@ -195,7 +198,7 @@ jobs:
- name: Upload all logs to artifacts
if: ${{ always() }}
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
with:
name: vector-io-logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ env.SANITIZED_PROVIDER }}-${{ matrix.python-version }}
path: |

@@ -5,7 +5,9 @@ run-name: Run pre-commit checks
on:
pull_request:
push:
branches: [main]
branches:
- main
- 'release-[0-9]+.[0-9]+.x'
concurrency:
group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
@@ -20,7 +22,7 @@ jobs:
steps:
- name: Checkout code
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
with:
# For dependabot PRs, we need to checkout with a token that can push changes
token: ${{ github.actor == 'dependabot[bot]' && secrets.GITHUB_TOKEN || github.token }}
@@ -37,29 +39,47 @@ jobs:
.pre-commit-config.yaml
- name: Set up Node.js
uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0
uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
with:
node-version: '20'
cache: 'npm'
cache-dependency-path: 'llama_stack/ui/'
cache-dependency-path: 'src/llama_stack_ui/'
- name: Set up uv
uses: astral-sh/setup-uv@1e862dfacbd1d6d858c55d9b792c756523627244 # v7.1.4
- name: Install npm dependencies
run: npm ci
working-directory: llama_stack/ui
working-directory: src/llama_stack_ui
- name: Install pre-commit
run: python -m pip install 'pre-commit>=4.4.0'
- name: Cache pre-commit
uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4
with:
path: ~/.cache/pre-commit
key: pre-commit-3|${{ env.pythonLocation }}|${{ hashFiles('.pre-commit-config.yaml') }}
- name: Run pre-commit
id: precommit
uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
continue-on-error: true
run: |
set +e
pre-commit run --show-diff-on-failure --color=always --all-files 2>&1 | tee /tmp/precommit.log
status=${PIPESTATUS[0]}
echo "status=$status" >> $GITHUB_OUTPUT
exit 0
env:
SKIP: no-commit-to-branch
SKIP: no-commit-to-branch,mypy
RUFF_OUTPUT_FORMAT: github
- name: Check pre-commit results
if: steps.precommit.outcome == 'failure'
if: steps.precommit.outputs.status != '0'
run: |
echo "::error::Pre-commit hooks failed. Please run 'pre-commit run --all-files' locally and commit the fixes."
echo "::warning::Some pre-commit hooks failed. Check the output above for details."
echo ""
echo "Failed hooks output:"
cat /tmp/precommit.log
exit 1
- name: Debug
@@ -109,3 +129,50 @@ jobs:
echo "$unstaged_files"
exit 1
fi
- name: Configure client installation
id: client-config
uses: ./.github/actions/install-llama-stack-client
- name: Sync dev + type_checking dependencies
env:
UV_EXTRA_INDEX_URL: ${{ steps.client-config.outputs.uv-extra-index-url }}
run: |
if [ -n "$UV_EXTRA_INDEX_URL" ]; then
export UV_INDEX_STRATEGY="unsafe-best-match"
fi
uv sync --group dev --group type_checking
# Install specific client version after sync if needed
if [ "${{ steps.client-config.outputs.install-after-sync }}" = "true" ]; then
echo "Installing llama-stack-client from: ${{ steps.client-config.outputs.install-source }}"
uv pip install ${{ steps.client-config.outputs.install-source }}
fi
- name: Run mypy (full type_checking)
env:
UV_EXTRA_INDEX_URL: ${{ steps.client-config.outputs.uv-extra-index-url }}
run: |
if [ -n "$UV_EXTRA_INDEX_URL" ]; then
export UV_INDEX_STRATEGY="unsafe-best-match"
fi
set +e
uv run --group dev --group type_checking mypy
status=$?
if [ $status -ne 0 ]; then
echo "::error::Full mypy failed. Reproduce locally with 'uv run pre-commit run mypy-full --hook-stage manual --all-files'."
fi
exit $status
- name: Check if any unused recordings
run: |
set -e
PYTHONPATH=$PWD uv run ./scripts/cleanup_recordings.py --delete
changes=$(git status --short tests/integration | grep 'recordings' || true)
if [ -n "$changes" ]; then
echo "::error::Unused integration recordings detected. Run 'PYTHONPATH=$(pwd) uv run ./scripts/cleanup_recordings.py --delete' locally and commit the deletions."
echo "$changes"
exit 1
fi

@@ -1,227 +0,0 @@
name: Pre-commit Bot
run-name: Pre-commit bot for PR #${{ github.event.issue.number }}
on:
issue_comment:
types: [created]
jobs:
pre-commit:
# Only run on pull request comments
if: github.event.issue.pull_request && contains(github.event.comment.body, '@github-actions run precommit')
runs-on: ubuntu-latest
permissions:
contents: write
pull-requests: write
steps:
- name: Check comment author and get PR details
id: check_author
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
// Get PR details
const pr = await github.rest.pulls.get({
owner: context.repo.owner,
repo: context.repo.repo,
pull_number: context.issue.number
});
// Check if commenter has write access or is the PR author
const commenter = context.payload.comment.user.login;
const prAuthor = pr.data.user.login;
let hasPermission = false;
// Check if commenter is PR author
if (commenter === prAuthor) {
hasPermission = true;
console.log(`Comment author ${commenter} is the PR author`);
} else {
// Check if commenter has write/admin access
try {
const permission = await github.rest.repos.getCollaboratorPermissionLevel({
owner: context.repo.owner,
repo: context.repo.repo,
username: commenter
});
const level = permission.data.permission;
hasPermission = ['write', 'admin', 'maintain'].includes(level);
console.log(`Comment author ${commenter} has permission: ${level}`);
} catch (error) {
console.log(`Could not check permissions for ${commenter}: ${error.message}`);
}
}
if (!hasPermission) {
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: `❌ @${commenter} You don't have permission to trigger pre-commit. Only PR authors or repository collaborators can run this command.`
});
core.setFailed(`User ${commenter} does not have permission`);
return;
}
// Save PR info for later steps
core.setOutput('pr_number', context.issue.number);
core.setOutput('pr_head_ref', pr.data.head.ref);
core.setOutput('pr_head_sha', pr.data.head.sha);
core.setOutput('pr_head_repo', pr.data.head.repo.full_name);
core.setOutput('pr_base_ref', pr.data.base.ref);
core.setOutput('is_fork', pr.data.head.repo.full_name !== context.payload.repository.full_name);
core.setOutput('authorized', 'true');
- name: React to comment
if: steps.check_author.outputs.authorized == 'true'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.reactions.createForIssueComment({
owner: context.repo.owner,
repo: context.repo.repo,
comment_id: context.payload.comment.id,
content: 'rocket'
});
- name: Comment starting
if: steps.check_author.outputs.authorized == 'true'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: ${{ steps.check_author.outputs.pr_number }},
body: `⏳ Running pre-commit hooks on PR #${{ steps.check_author.outputs.pr_number }}...`
});
- name: Checkout PR branch (same-repo)
if: steps.check_author.outputs.authorized == 'true' && steps.check_author.outputs.is_fork == 'false'
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
ref: ${{ steps.check_author.outputs.pr_head_ref }}
fetch-depth: 0
token: ${{ secrets.GITHUB_TOKEN }}
- name: Checkout PR branch (fork)
if: steps.check_author.outputs.authorized == 'true' && steps.check_author.outputs.is_fork == 'true'
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
repository: ${{ steps.check_author.outputs.pr_head_repo }}
ref: ${{ steps.check_author.outputs.pr_head_ref }}
fetch-depth: 0
token: ${{ secrets.GITHUB_TOKEN }}
- name: Verify checkout
if: steps.check_author.outputs.authorized == 'true'
run: |
echo "Current SHA: $(git rev-parse HEAD)"
echo "Expected SHA: ${{ steps.check_author.outputs.pr_head_sha }}"
if [[ "$(git rev-parse HEAD)" != "${{ steps.check_author.outputs.pr_head_sha }}" ]]; then
echo "::error::Checked out SHA does not match expected SHA"
exit 1
fi
- name: Set up Python
if: steps.check_author.outputs.authorized == 'true'
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
python-version: '3.12'
cache: pip
cache-dependency-path: |
**/requirements*.txt
.pre-commit-config.yaml
- name: Set up Node.js
if: steps.check_author.outputs.authorized == 'true'
uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0
with:
node-version: '20'
cache: 'npm'
cache-dependency-path: 'llama_stack/ui/'
- name: Install npm dependencies
if: steps.check_author.outputs.authorized == 'true'
run: npm ci
working-directory: llama_stack/ui
- name: Run pre-commit
if: steps.check_author.outputs.authorized == 'true'
id: precommit
uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
continue-on-error: true
env:
SKIP: no-commit-to-branch
RUFF_OUTPUT_FORMAT: github
- name: Check for changes
if: steps.check_author.outputs.authorized == 'true'
id: changes
run: |
if ! git diff --exit-code || [ -n "$(git ls-files --others --exclude-standard)" ]; then
echo "has_changes=true" >> $GITHUB_OUTPUT
echo "Changes detected after pre-commit"
else
echo "has_changes=false" >> $GITHUB_OUTPUT
echo "No changes after pre-commit"
fi
- name: Commit and push changes
if: steps.check_author.outputs.authorized == 'true' && steps.changes.outputs.has_changes == 'true'
run: |
git config --local user.email "github-actions[bot]@users.noreply.github.com"
git config --local user.name "github-actions[bot]"
git add -A
git commit -m "style: apply pre-commit fixes
🤖 Applied by @github-actions bot via pre-commit workflow"
# Push changes
git push origin HEAD:${{ steps.check_author.outputs.pr_head_ref }}
- name: Comment success with changes
if: steps.check_author.outputs.authorized == 'true' && steps.changes.outputs.has_changes == 'true'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: ${{ steps.check_author.outputs.pr_number }},
body: `✅ Pre-commit hooks completed successfully!\n\n🔧 Changes have been committed and pushed to the PR branch.`
});
- name: Comment success without changes
if: steps.check_author.outputs.authorized == 'true' && steps.changes.outputs.has_changes == 'false' && steps.precommit.outcome == 'success'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: ${{ steps.check_author.outputs.pr_number }},
body: `✅ Pre-commit hooks passed!\n\n✨ No changes needed - your code is already formatted correctly.`
});
- name: Comment failure
if: failure()
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: ${{ steps.check_author.outputs.pr_number }},
body: `❌ Pre-commit workflow failed!\n\nPlease check the [workflow logs](https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}) for details.`
});
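To avoid the bot having to push formatting fixes, the same checks can be reproduced locally before opening or updating a PR. A minimal sketch, assuming pre-commit is installed in the environment and the command is run from the repository root:

```bash
# Mirror the CI invocation above: skip the branch-protection hook and use GitHub-style ruff output
SKIP=no-commit-to-branch RUFF_OUTPUT_FORMAT=github uv run pre-commit run --all-files
```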

View file

@ -7,23 +7,27 @@ on:
branches:
- main
paths:
- 'llama_stack/cli/stack/build.py'
- 'llama_stack/cli/stack/_build.py'
- 'llama_stack/core/build.*'
- 'llama_stack/core/*.sh'
- 'src/llama_stack/cli/stack/build.py'
- 'src/llama_stack/cli/stack/_build.py'
- 'src/llama_stack/core/build.*'
- 'src/llama_stack/core/*.sh'
- '.github/workflows/providers-build.yml'
- 'llama_stack/distributions/**'
- 'src/llama_stack/distributions/**'
- 'pyproject.toml'
- 'containers/Containerfile'
- '.dockerignore'
pull_request:
paths:
- 'llama_stack/cli/stack/build.py'
- 'llama_stack/cli/stack/_build.py'
- 'llama_stack/core/build.*'
- 'llama_stack/core/*.sh'
- 'src/llama_stack/cli/stack/build.py'
- 'src/llama_stack/cli/stack/_build.py'
- 'src/llama_stack/core/build.*'
- 'src/llama_stack/core/*.sh'
- '.github/workflows/providers-build.yml'
- 'llama_stack/distributions/**'
- 'src/llama_stack/distributions/**'
- 'pyproject.toml'
- 'containers/Containerfile'
- '.dockerignore'
concurrency:
group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
@ -36,12 +40,12 @@ jobs:
distros: ${{ steps.set-matrix.outputs.distros }}
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- name: Generate Distribution List
id: set-matrix
run: |
distros=$(ls llama_stack/distributions/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
distros=$(ls src/llama_stack/distributions/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
echo "distros=$distros" >> "$GITHUB_OUTPUT"
build:
@ -55,20 +59,30 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Print build dependencies
- name: Install distribution into venv
if: matrix.image-type == 'venv'
run: |
uv run llama stack build --distro ${{ matrix.distro }} --image-type ${{ matrix.image-type }} --image-name test --print-deps-only
uv run llama stack list-deps ${{ matrix.distro }} | xargs -L1 uv pip install
- name: Run Llama Stack Build
- name: Build container image
if: matrix.image-type == 'container'
run: |
# USE_COPY_NOT_MOUNT is set to true since mounting is not supported by docker buildx, we use COPY instead
# LLAMA_STACK_DIR is set to the current directory so we are building from the source
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --distro ${{ matrix.distro }} --image-type ${{ matrix.image-type }} --image-name test
BUILD_ARGS="--build-arg INSTALL_MODE=editable --build-arg DISTRO_NAME=${{ matrix.distro }}"
if [ -n "${UV_EXTRA_INDEX_URL:-}" ]; then
BUILD_ARGS="$BUILD_ARGS --build-arg UV_EXTRA_INDEX_URL=$UV_EXTRA_INDEX_URL"
fi
if [ -n "${UV_INDEX_STRATEGY:-}" ]; then
BUILD_ARGS="$BUILD_ARGS --build-arg UV_INDEX_STRATEGY=$UV_INDEX_STRATEGY"
fi
docker build . \
-f containers/Containerfile \
$BUILD_ARGS \
--tag llama-stack:${{ matrix.distro }}-ci
- name: Print dependencies in the image
if: matrix.image-type == 'venv'
@ -79,29 +93,40 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Build a single provider
run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --image-type venv --image-name test --providers inference=remote::ollama
uv pip install -e .
uv run --no-sync llama stack list-deps --providers inference=remote::ollama | xargs -L1 uv pip install
build-custom-container-distribution:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Build a single provider
- name: Build container image
run: |
yq -i '.image_type = "container"' llama_stack/distributions/ci-tests/build.yaml
yq -i '.image_name = "test"' llama_stack/distributions/ci-tests/build.yaml
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config llama_stack/distributions/ci-tests/build.yaml
BASE_IMAGE=$(yq -r '.distribution_spec.container_image // "python:3.12-slim"' src/llama_stack/distributions/ci-tests/build.yaml)
BUILD_ARGS="--build-arg INSTALL_MODE=editable --build-arg DISTRO_NAME=ci-tests"
BUILD_ARGS="$BUILD_ARGS --build-arg BASE_IMAGE=$BASE_IMAGE"
BUILD_ARGS="$BUILD_ARGS --build-arg RUN_CONFIG_PATH=/workspace/src/llama_stack/distributions/ci-tests/run.yaml"
if [ -n "${UV_EXTRA_INDEX_URL:-}" ]; then
BUILD_ARGS="$BUILD_ARGS --build-arg UV_EXTRA_INDEX_URL=$UV_EXTRA_INDEX_URL"
fi
if [ -n "${UV_INDEX_STRATEGY:-}" ]; then
BUILD_ARGS="$BUILD_ARGS --build-arg UV_INDEX_STRATEGY=$UV_INDEX_STRATEGY"
fi
docker build . \
-f containers/Containerfile \
$BUILD_ARGS \
-t llama-stack:ci-tests
- name: Inspect the container image entrypoint
run: |
@ -112,7 +137,7 @@ jobs:
fi
entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
echo "Entrypoint: $entrypoint"
if [ "$entrypoint" != "[llama stack run /app/run.yaml]" ]; then
if [ "$entrypoint" != "[/usr/local/bin/llama-stack-entrypoint.sh]" ]; then
echo "Entrypoint is not correct"
exit 1
fi
@ -121,7 +146,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
@ -129,17 +154,25 @@ jobs:
- name: Pin distribution to UBI9 base
run: |
yq -i '
.image_type = "container" |
.image_name = "ubi9-test" |
.distribution_spec.container_image = "registry.access.redhat.com/ubi9:latest"
' llama_stack/distributions/ci-tests/build.yaml
' src/llama_stack/distributions/ci-tests/build.yaml
- name: Build dev container (UBI9)
env:
USE_COPY_NOT_MOUNT: "true"
LLAMA_STACK_DIR: "."
- name: Build UBI9 container image
run: |
uv run llama stack build --config llama_stack/distributions/ci-tests/build.yaml
BASE_IMAGE=$(yq -r '.distribution_spec.container_image // "registry.access.redhat.com/ubi9:latest"' src/llama_stack/distributions/ci-tests/build.yaml)
BUILD_ARGS="--build-arg INSTALL_MODE=editable --build-arg DISTRO_NAME=ci-tests"
BUILD_ARGS="$BUILD_ARGS --build-arg BASE_IMAGE=$BASE_IMAGE"
BUILD_ARGS="$BUILD_ARGS --build-arg RUN_CONFIG_PATH=/workspace/src/llama_stack/distributions/ci-tests/run.yaml"
if [ -n "${UV_EXTRA_INDEX_URL:-}" ]; then
BUILD_ARGS="$BUILD_ARGS --build-arg UV_EXTRA_INDEX_URL=$UV_EXTRA_INDEX_URL"
fi
if [ -n "${UV_INDEX_STRATEGY:-}" ]; then
BUILD_ARGS="$BUILD_ARGS --build-arg UV_INDEX_STRATEGY=$UV_INDEX_STRATEGY"
fi
docker build . \
-f containers/Containerfile \
$BUILD_ARGS \
-t llama-stack:ci-tests-ubi9
- name: Inspect UBI9 image
run: |
@ -150,7 +183,7 @@ jobs:
fi
entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
echo "Entrypoint: $entrypoint"
if [ "$entrypoint" != "[llama stack run /app/run.yaml]" ]; then
if [ "$entrypoint" != "[/usr/local/bin/llama-stack-entrypoint.sh]" ]; then
echo "Entrypoint is not correct"
exit 1
fi
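The container builds above all follow the same pattern, so a local reproduction is a small sketch: it assumes Docker is available, the repository root is the build context, and the CI defaults shown above (editable install, the ci-tests distribution, the python:3.12-slim fallback base image) are acceptable.

```bash
# Build the ci-tests distribution image the same way the workflow does
BUILD_ARGS="--build-arg INSTALL_MODE=editable --build-arg DISTRO_NAME=ci-tests"
BUILD_ARGS="$BUILD_ARGS --build-arg BASE_IMAGE=python:3.12-slim"
BUILD_ARGS="$BUILD_ARGS --build-arg RUN_CONFIG_PATH=/workspace/src/llama_stack/distributions/ci-tests/run.yaml"
docker build . -f containers/Containerfile $BUILD_ARGS -t llama-stack:ci-tests

# The entrypoint should match what CI asserts
docker inspect --format '{{ .Config.Entrypoint }}' llama-stack:ci-tests   # expect [/usr/local/bin/llama-stack-entrypoint.sh]
```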

View file

@ -0,0 +1,105 @@
name: Test llama stack list-deps
run-name: Test llama stack list-deps
on:
push:
branches:
- main
paths:
- 'src/llama_stack/cli/stack/list_deps.py'
- 'src/llama_stack/cli/stack/_list_deps.py'
- 'src/llama_stack/core/build.*'
- 'src/llama_stack/core/*.sh'
- '.github/workflows/providers-list-deps.yml'
- 'src/llama_stack/templates/**'
- 'pyproject.toml'
pull_request:
paths:
- 'src/llama_stack/cli/stack/list_deps.py'
- 'src/llama_stack/cli/stack/_list_deps.py'
- 'src/llama_stack/core/build.*'
- 'src/llama_stack/core/*.sh'
- '.github/workflows/providers-list-deps.yml'
- 'src/llama_stack/templates/**'
- 'pyproject.toml'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
generate-matrix:
runs-on: ubuntu-latest
outputs:
distros: ${{ steps.set-matrix.outputs.distros }}
steps:
- name: Checkout repository
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- name: Generate Distribution List
id: set-matrix
run: |
distros=$(ls src/llama_stack/distributions/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
echo "distros=$distros" >> "$GITHUB_OUTPUT"
list-deps:
needs: generate-matrix
runs-on: ubuntu-latest
strategy:
matrix:
distro: ${{ fromJson(needs.generate-matrix.outputs.distros) }}
image-type: [venv, container]
fail-fast: false # We want to run all jobs even if some fail
steps:
- name: Checkout repository
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Print dependencies
run: |
uv run llama stack list-deps ${{ matrix.distro }}
- name: Install Distro using llama stack list-deps
run: |
# USE_COPY_NOT_MOUNT is set to true since mounting is not supported by docker buildx, we use COPY instead
# LLAMA_STACK_DIR is set to the current directory so we are building from the source
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack list-deps ${{ matrix.distro }} | xargs -L1 uv pip install
- name: Print dependencies in the image
if: matrix.image-type == 'venv'
run: |
uv pip list
show-single-provider:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Show a single provider
run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack list-deps --providers inference=remote::ollama
list-deps-from-config:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: list-deps from Config
env:
USE_COPY_NOT_MOUNT: "true"
LLAMA_STACK_DIR: "."
run: |
uv run llama stack list-deps src/llama_stack/distributions/ci-tests/build.yaml
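Taken together, the jobs above exercise the three ways `llama stack list-deps` is invoked. A local sketch, using `ci-tests` as an example distribution name:

```bash
uv run llama stack list-deps ci-tests                                             # deps for a named distribution
uv run llama stack list-deps --providers inference=remote::ollama                 # deps for a single provider
uv run llama stack list-deps src/llama_stack/distributions/ci-tests/build.yaml    # deps from a build config
uv run llama stack list-deps ci-tests | xargs -L1 uv pip install                  # install them into the active venv
```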

View file

@ -10,7 +10,7 @@ on:
branches:
- main
paths-ignore:
- 'llama_stack/ui/**'
- 'src/llama_stack_ui/**'
jobs:
build:
@ -21,29 +21,31 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- name: Install uv
uses: astral-sh/setup-uv@d0cc045d04ccac9d8b7881df0226f9e82c39688e # v6.8.0
uses: astral-sh/setup-uv@1e862dfacbd1d6d858c55d9b792c756523627244 # v7.1.4
with:
python-version: ${{ matrix.python-version }}
activate-environment: true
version: 0.7.6
- name: Build Llama Stack package
run: |
uv build
- name: Build Llama Stack API package
working-directory: src/llama_stack_api
run: uv build
- name: Install Llama Stack package
- name: Build Llama Stack package
run: uv build
- name: Install Llama Stack package (with api stubs from local build)
run: |
uv pip install dist/*.whl
uv pip install --find-links src/llama_stack_api/dist dist/*.whl
- name: Verify Llama Stack package
run: |
uv pip list
uv pip show llama-stack
command -v llama
llama model prompt-format -m Llama3.2-90B-Vision-Instruct
llama model list
llama stack list-apis
llama stack list-providers inference
llama stack list-deps starter

View file

@ -46,7 +46,7 @@ jobs:
echo "::endgroup::"
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
with:
fetch-depth: 0
@ -61,6 +61,9 @@ jobs:
- name: Run and record tests
uses: ./.github/actions/run-and-record-tests
env:
# Set OPENAI_API_KEY if using gpt setup
OPENAI_API_KEY: ${{ inputs.test-setup == 'gpt' && secrets.OPENAI_API_KEY || '' }}
with:
stack-config: 'server:ci-tests' # recording must be done with server since more tests are run
setup: ${{ inputs.test-setup || 'ollama' }}

146
.github/workflows/stainless-builds.yml vendored Normal file
View file

@ -0,0 +1,146 @@
name: Stainless SDK Builds
run-name: Build Stainless SDK from OpenAPI spec changes
# This workflow uses pull_request_target, which allows it to run on pull requests
# from forks with access to secrets. This is safe because the workflow definition
# comes from the base branch (trusted), and the action only reads OpenAPI spec
# files without executing any code from the PR.
on:
pull_request_target:
types:
- opened
- synchronize
- reopened
- closed
paths:
- "client-sdks/stainless/**"
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
cancel-in-progress: true
env:
# Stainless organization name.
STAINLESS_ORG: llamastack
# Stainless project name.
STAINLESS_PROJECT: llama-stack-client
# Path to your OpenAPI spec.
OAS_PATH: ./client-sdks/stainless/openapi.yml
# Path to your Stainless config. Optional; only provide this if you prefer
# to maintain the ground truth Stainless config in your own repo.
CONFIG_PATH: ./client-sdks/stainless/config.yml
# When to fail the job based on build conclusion.
# Options: "never" | "note" | "warning" | "error" | "fatal".
FAIL_ON: error
# In your repo secrets, configure:
# - STAINLESS_API_KEY: a Stainless API key, which you can generate on the
# Stainless organization dashboard
jobs:
compute-branch:
runs-on: ubuntu-latest
outputs:
preview_branch: ${{ steps.compute.outputs.preview_branch }}
base_branch: ${{ steps.compute.outputs.base_branch }}
merge_branch: ${{ steps.compute.outputs.merge_branch }}
steps:
- name: Compute branch names
id: compute
run: |
HEAD_REPO="${{ github.event.pull_request.head.repo.full_name }}"
BASE_REPO="${{ github.repository }}"
BRANCH_NAME="${{ github.event.pull_request.head.ref }}"
FORK_OWNER="${{ github.event.pull_request.head.repo.owner.login }}"
if [ "$HEAD_REPO" != "$BASE_REPO" ]; then
# Fork PR: prefix with fork owner for isolation
if [ -z "$FORK_OWNER" ]; then
echo "Error: Fork PR detected but fork owner is empty" >&2
exit 1
fi
PREVIEW_BRANCH="preview/${FORK_OWNER}/${BRANCH_NAME}"
BASE_BRANCH="preview/base/${FORK_OWNER}/${BRANCH_NAME}"
else
# Same-repo PR
PREVIEW_BRANCH="preview/${BRANCH_NAME}"
BASE_BRANCH="preview/base/${BRANCH_NAME}"
fi
echo "preview_branch=${PREVIEW_BRANCH}" >> $GITHUB_OUTPUT
echo "base_branch=${BASE_BRANCH}" >> $GITHUB_OUTPUT
echo "merge_branch=${PREVIEW_BRANCH}" >> $GITHUB_OUTPUT
preview:
needs: compute-branch
if: github.event.action != 'closed'
runs-on: ubuntu-latest
permissions:
contents: read
pull-requests: write
steps:
# Checkout the PR's code to access the OpenAPI spec and config files.
# This is necessary to read the spec/config from the PR (including from forks).
- name: Checkout repository
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
with:
repository: ${{ github.event.pull_request.head.repo.full_name }}
ref: ${{ github.event.pull_request.head.sha }}
fetch-depth: 2
- name: Run preview builds
uses: stainless-api/upload-openapi-spec-action/preview@9133735bca5ce0a1df7d3b26e75364e26137a016 # 1.7.0
with:
stainless_api_key: ${{ secrets.STAINLESS_API_KEY }}
org: ${{ env.STAINLESS_ORG }}
project: ${{ env.STAINLESS_PROJECT }}
oas_path: ${{ env.OAS_PATH }}
config_path: ${{ env.CONFIG_PATH }}
fail_on: ${{ env.FAIL_ON }}
base_sha: ${{ github.event.pull_request.base.sha }}
base_ref: ${{ github.event.pull_request.base.ref }}
head_sha: ${{ github.event.pull_request.head.sha }}
branch: ${{ needs.compute-branch.outputs.preview_branch }}
base_branch: ${{ needs.compute-branch.outputs.base_branch }}
merge:
needs: compute-branch
if: github.event.action == 'closed' && github.event.pull_request.merged == true
runs-on: ubuntu-latest
permissions:
contents: read
pull-requests: write
steps:
# Checkout the PR's code to access the OpenAPI spec and config files.
# This is necessary to read the spec/config from the PR (including from forks).
- name: Checkout repository
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
with:
repository: ${{ github.event.pull_request.head.repo.full_name }}
ref: ${{ github.event.pull_request.head.sha }}
fetch-depth: 2
# Note that this only merges in changes that happened on the last build on
# the computed preview branch. It's possible that there are OAS/config
# changes that haven't been built, if the preview job didn't finish
# before this step starts. In theory we want to wait for all builds
# against the preview branch to complete, but assuming that
# the preview job happens before the PR merge, it should be fine.
- name: Run merge build
uses: stainless-api/upload-openapi-spec-action/merge@9133735bca5ce0a1df7d3b26e75364e26137a016 # 1.7.0
with:
stainless_api_key: ${{ secrets.STAINLESS_API_KEY }}
org: ${{ env.STAINLESS_ORG }}
project: ${{ env.STAINLESS_PROJECT }}
oas_path: ${{ env.OAS_PATH }}
config_path: ${{ env.CONFIG_PATH }}
fail_on: ${{ env.FAIL_ON }}
base_sha: ${{ github.event.pull_request.base.sha }}
base_ref: ${{ github.event.pull_request.base.ref }}
head_sha: ${{ github.event.pull_request.head.sha }}
merge_branch: ${{ needs.compute-branch.outputs.merge_branch }}

View file

@ -8,7 +8,7 @@ on:
pull_request:
branches: [ main ]
paths:
- 'llama_stack/**'
- 'src/llama_stack/**'
- 'tests/integration/**'
- 'uv.lock'
- 'pyproject.toml'
@ -27,7 +27,7 @@ jobs:
# container and point 'uv pip install' to the correct path...
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
@ -46,9 +46,9 @@ jobs:
yq -i '.image_type = "${{ matrix.image-type }}"' tests/external/ramalama-stack/run.yaml
cat tests/external/ramalama-stack/run.yaml
- name: Build distro from config file
- name: Install distribution dependencies
run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external/ramalama-stack/build.yaml
uv run llama stack list-deps tests/external/ramalama-stack/build.yaml | xargs -L1 uv pip install
- name: Start Llama Stack server in background
if: ${{ matrix.image-type }} == 'venv'
@ -78,7 +78,7 @@ jobs:
- name: Upload all logs to artifacts
if: ${{ always() }}
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
with:
name: logs-${{ github.run_id }}-${{ github.run_attempt }}-external-provider-module-test
path: |

View file

@ -8,8 +8,8 @@ on:
pull_request:
branches: [ main ]
paths:
- 'llama_stack/**'
- '!llama_stack/ui/**'
- 'src/llama_stack/**'
- '!src/llama_stack_ui/**'
- 'tests/integration/**'
- 'uv.lock'
- 'pyproject.toml'
@ -27,7 +27,7 @@ jobs:
# container and point 'uv pip install' to the correct path...
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
@ -44,11 +44,14 @@ jobs:
- name: Print distro dependencies
run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml --print-deps-only
uv run --no-sync llama stack list-deps tests/external/build.yaml
- name: Build distro from config file
run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml
uv venv ci-test
source ci-test/bin/activate
uv pip install -e .
LLAMA_STACK_LOGGING=all=CRITICAL llama stack list-deps tests/external/build.yaml | xargs -L1 uv pip install
- name: Start Llama Stack server in background
if: ${{ matrix.image-type }} == 'venv'
@ -81,7 +84,7 @@ jobs:
- name: Upload all logs to artifacts
if: ${{ always() }}
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
with:
name: logs-${{ github.run_id }}-${{ github.run_attempt }}-external-test
path: |

View file

@ -8,7 +8,7 @@ on:
pull_request:
branches: [ main ]
paths:
- 'llama_stack/ui/**'
- 'src/llama_stack_ui/**'
- '.github/workflows/ui-unit-tests.yml' # This workflow
workflow_dispatch:
@ -26,29 +26,29 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- name: Setup Node.js
uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0
uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
with:
node-version: ${{ matrix.node-version }}
cache: 'npm'
cache-dependency-path: 'llama_stack/ui/package-lock.json'
cache-dependency-path: 'src/llama_stack_ui/package-lock.json'
- name: Install dependencies
working-directory: llama_stack/ui
working-directory: src/llama_stack_ui
run: npm ci
- name: Run linting
working-directory: llama_stack/ui
working-directory: src/llama_stack_ui
run: npm run lint
- name: Run format check
working-directory: llama_stack/ui
working-directory: src/llama_stack_ui
run: npm run format:check
- name: Run unit tests
working-directory: llama_stack/ui
working-directory: src/llama_stack_ui
env:
CI: true

View file

@ -4,12 +4,16 @@ run-name: Run the unit test suite
on:
push:
branches: [ main ]
branches:
- main
- 'release-[0-9]+.[0-9]+.x'
pull_request:
branches: [ main ]
branches:
- main
- 'release-[0-9]+.[0-9]+.x'
paths:
- 'llama_stack/**'
- '!llama_stack/ui/**'
- 'src/llama_stack/**'
- '!src/llama_stack_ui/**'
- 'tests/unit/**'
- 'uv.lock'
- 'pyproject.toml'
@ -32,7 +36,7 @@ jobs:
- "3.13"
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
@ -45,7 +49,7 @@ jobs:
- name: Upload test results
if: always()
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
with:
name: test-results-${{ matrix.python }}
path: |

6
.gitignore vendored
View file

@ -31,3 +31,9 @@ CLAUDE.md
.claude/
docs/.docusaurus/
docs/node_modules/
docs/static/imported-files/
docs/docs/api-deprecated/
docs/docs/api-experimental/
docs/docs/api/
tests/integration/client-typescript/node_modules/
.ts-client-checkout/

View file

@ -1,5 +1,5 @@
exclude: 'build/'
minimum_pre_commit_version: 4.4.0
default_language_version:
python: python3.12
node: "22"
@ -42,7 +42,6 @@ repos:
hooks:
- id: ruff
args: [ --fix ]
exclude: ^llama_stack/strong_typing/.*$
- id: ruff-format
- repo: https://github.com/adamchainz/blacken-docs
@ -52,13 +51,9 @@ repos:
additional_dependencies:
- black==24.3.0
- repo: https://github.com/astral-sh/uv-pre-commit
rev: 0.7.20
hooks:
- id: uv-lock
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.16.1
rev: v1.18.2
hooks:
- id: mypy
additional_dependencies:
@ -78,33 +73,48 @@ repos:
- repo: local
hooks:
- id: uv-lock
name: uv-lock
additional_dependencies:
- uv==0.7.20
entry: ./scripts/uv-run-with-index.sh lock
language: python
pass_filenames: false
require_serial: true
files: ^(pyproject\.toml|uv\.lock)$
- id: mypy-full
name: mypy (full type_checking)
entry: ./scripts/uv-run-with-index.sh run --group dev --group type_checking mypy
language: system
pass_filenames: false
stages: [manual]
- id: distro-codegen
name: Distribution Template Codegen
additional_dependencies:
- uv==0.7.8
entry: uv run --group codegen ./scripts/distro_codegen.py
entry: ./scripts/uv-run-with-index.sh run --group codegen ./scripts/distro_codegen.py
language: python
pass_filenames: false
require_serial: true
files: ^llama_stack/distributions/.*$|^llama_stack/providers/.*/inference/.*/models\.py$
files: ^src/llama_stack/distributions/.*$|^src/llama_stack/providers/.*/inference/.*/models\.py$
- id: provider-codegen
name: Provider Codegen
additional_dependencies:
- uv==0.7.8
entry: uv run --group codegen ./scripts/provider_codegen.py
entry: ./scripts/uv-run-with-index.sh run --group codegen ./scripts/provider_codegen.py
language: python
pass_filenames: false
require_serial: true
files: ^llama_stack/providers/.*$
files: ^src/llama_stack/providers/.*$|^scripts/run_openapi_generator.sh$
- id: openapi-codegen
name: API Spec Codegen
additional_dependencies:
- uv==0.7.8
entry: sh -c 'uv run ./docs/openapi_generator/run_openapi_generator.sh > /dev/null'
entry: sh -c './scripts/uv-run-with-index.sh run scripts/run_openapi_generator.sh'
language: python
pass_filenames: false
require_serial: true
files: ^llama_stack/apis/|^docs/openapi_generator/
files: ^src/llama_stack_api/.*$
- id: check-workflows-use-hashes
name: Check GitHub Actions use SHA-pinned actions
entry: ./scripts/check-workflows-use-hashes.sh
@ -120,7 +130,7 @@ repos:
pass_filenames: false
require_serial: true
always_run: true
files: ^llama_stack/.*$
files: ^src/llama_stack/.*$
- id: forbid-pytest-asyncio
name: Block @pytest.mark.asyncio and @pytest_asyncio.fixture
entry: bash
@ -141,7 +151,7 @@ repos:
name: Generate CI documentation
additional_dependencies:
- uv==0.7.8
entry: uv run ./scripts/gen-ci-docs.py
entry: ./scripts/uv-run-with-index.sh run ./scripts/gen-ci-docs.py
language: python
pass_filenames: false
require_serial: true
@ -150,7 +160,7 @@ repos:
name: Format & Lint UI
entry: bash ./scripts/run-ui-linter.sh
language: system
files: ^llama_stack/ui/.*\.(ts|tsx)$
files: ^src/llama_stack_ui/.*\.(ts|tsx)$
pass_filenames: false
require_serial: true
@ -172,6 +182,44 @@ repos:
exit 1
fi
exit 0
- id: fips-compliance
name: Ensure llama-stack remains FIPS compliant
entry: bash
language: system
types: [python]
pass_filenames: true
exclude: '^tests/.*$' # Exclude test dir as some safety tests used MD5
args:
- -c
- |
grep -EnH '^[^#]*\b(md5|sha1|uuid3|uuid5)\b' "$@" && {
echo;
echo "❌ Do not use any of the following functions: hashlib.md5, hashlib.sha1, uuid.uuid3, uuid.uuid5"
echo " These functions are not FIPS-compliant"
echo;
exit 1;
} || true
- id: check-api-independence
name: Ensure llama_stack_api does not import llama_stack
entry: bash
language: system
pass_filenames: false
require_serial: true
always_run: true
files: ^src/llama_stack_api/.*$
args:
- -c
- |
API_DIR="src/llama_stack_api"
grep -rn --include="*.py" -E '^[^#]*(import llama_stack\b|from llama_stack\b)' "$API_DIR" 2>/dev/null && {
echo "llama_stack_api must not import llama_stack";
exit 1;
}
[ -f "$API_DIR/pyproject.toml" ] && grep -n 'llama_stack[^_]' "$API_DIR/pyproject.toml" && {
echo "llama_stack_api must not depend on llama_stack in pyproject.toml";
exit 1;
}
exit 0
ci:
autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks
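The new local hooks added above (FIPS compliance and API independence) can be exercised individually by hook id; a quick sketch:

```bash
uv run pre-commit run fips-compliance --all-files
uv run pre-commit run check-api-independence --all-files

# The heavier mypy configuration only runs on demand, per its manual stage
uv run pre-commit run mypy-full --hook-stage manual --all-files
```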

View file

@ -1,614 +0,0 @@
# Changelog
# v0.2.20
Published on: 2025-08-29T22:25:32Z
Here are some key changes that are coming as part of this release.
### Build and Environment
- Environment improvements: fixed env var replacement to preserve types.
- Docker stability: fixed container startup failures for Fireworks AI provider.
- Removed absolute paths in build for better portability.
### Features
- UI Enhancements: Implemented file upload and VectorDB creation/configuration directly in UI.
- Vector Store Improvements: Added keyword, vector, and hybrid search inside vector store.
- Added S3 authorization support for file providers.
- SQL Store: Added inequality support to where clause.
### Documentation
- Fixed post-training docs.
- Added Contributor Guidelines for creating Internal vs. External providers.
### Fixes
- Removed unsupported bfcl scoring function.
- Multiple reliability and configuration fixes for providers and environment handling.
### Engineering / Chores
- Cleaner internal development setup with consistent paths.
- Incremental improvements to provider integration and vector store behavior.
### New Contributors
- @omertuc made their first contribution in #3270
- @r3v5 made their first contribution in vector store hybrid search
---
# v0.2.19
Published on: 2025-08-26T22:06:55Z
## Highlights
* feat: Add CORS configuration support for server by @skamenan7 in https://github.com/llamastack/llama-stack/pull/3201
* feat(api): introduce /rerank by @ehhuang in https://github.com/llamastack/llama-stack/pull/2940
* feat: Add S3 Files Provider by @mattf in https://github.com/llamastack/llama-stack/pull/3202
---
# v0.2.18
Published on: 2025-08-20T01:09:27Z
## Highlights
* Add moderations create API
* Hybrid search in Milvus
* Numerous Responses API improvements
* Documentation updates
---
# v0.2.17
Published on: 2025-08-05T01:51:14Z
## Highlights
* feat(tests): introduce inference record/replay to increase test reliability by @ashwinb in https://github.com/meta-llama/llama-stack/pull/2941
* fix(library_client): improve initialization error handling and prevent AttributeError by @mattf in https://github.com/meta-llama/llama-stack/pull/2944
* fix: use OLLAMA_URL to activate Ollama provider in starter by @ashwinb in https://github.com/meta-llama/llama-stack/pull/2963
* feat(UI): adding MVP playground UI by @franciscojavierarceo in https://github.com/meta-llama/llama-stack/pull/2828
* Standardization of errors (@nathan-weinberg)
* feat: Enable DPO training with HuggingFace inline provider by @Nehanth in https://github.com/meta-llama/llama-stack/pull/2825
* chore: rename templates to distributions by @ashwinb in https://github.com/meta-llama/llama-stack/pull/3035
---
# v0.2.16
Published on: 2025-07-28T23:35:23Z
## Highlights
* Automatic model registration for self-hosted providers (ollama and vllm currently). No need for `INFERENCE_MODEL` environment variables which need to be updated, etc.
* Much simplified starter distribution. Most `ENABLE_` env variables are now gone. When you set `VLLM_URL`, the `vllm` provider is auto-enabled. Similar for `MILVUS_URL`, `PGVECTOR_DB`, etc. Check the [run.yaml](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/templates/starter/run.yaml) for more details.
* All tests migrated to pytest now (thanks @Elbehery)
* DPO implementation in the post-training provider (thanks @Nehanth)
* (Huge!) Support for external APIs and providers thereof (thanks @leseb, @cdoern and others). This is a really big deal -- you can now add more APIs completely out of tree and experiment with them before (optionally) wanting to contribute back.
* `inline::vllm` provider is gone thank you very much
* several improvements to OpenAI inference implementations and LiteLLM backend (thanks @mattf)
* Chroma now supports Vector Store API (thanks @franciscojavierarceo).
* Authorization improvements: Vector Store/File APIs now supports access control (thanks @franciscojavierarceo); Telemetry read APIs are gated according to logged-in user's roles.
---
# v0.2.15
Published on: 2025-07-16T03:30:01Z
---
# v0.2.14
Published on: 2025-07-04T16:06:48Z
## Highlights
* Support for Llama Guard 4
* Added Milvus support to vector-stores API
* Documentation and zero-to-hero updates for latest APIs
---
# v0.2.13
Published on: 2025-06-28T04:28:11Z
## Highlights
* search_mode support in OpenAI vector store API
* Security fixes
---
# v0.2.12
Published on: 2025-06-20T22:52:12Z
## Highlights
* Filter support in file search
* Support auth attributes in inference and response stores
---
# v0.2.11
Published on: 2025-06-17T20:26:26Z
## Highlights
* OpenAI-compatible vector store APIs
* Hybrid Search in Sqlite-vec
* File search tool in Responses API
* Pagination in inference and response stores
* Added `suffix` to completions API for fill-in-the-middle tasks
---
# v0.2.10.1
Published on: 2025-06-06T20:11:02Z
## Highlights
* ChromaDB provider fix
---
# v0.2.10
Published on: 2025-06-05T23:21:45Z
## Highlights
* OpenAI-compatible embeddings API
* OpenAI-compatible Files API
* Postgres support in starter distro
* Enable ingestion of precomputed embeddings
* Full multi-turn support in Responses API
* Fine-grained access control policy
---
# v0.2.9
Published on: 2025-05-30T20:01:56Z
## Highlights
* Added initial streaming support in Responses API
* UI view for Responses
* Postgres inference store support
---
# v0.2.8
Published on: 2025-05-27T21:03:47Z
# Release v0.2.8
## Highlights
* Server-side MCP with auth firewalls now works in the Stack - both for Agents and Responses
* Get chat completions APIs and UI to show chat completions
* Enable keyword search for sqlite-vec
---
# v0.2.7
Published on: 2025-05-16T20:38:10Z
## Highlights
This is a small update. But a couple highlights:
* feat: function tools in OpenAI Responses by @bbrowning in https://github.com/meta-llama/llama-stack/pull/2094, getting closer to ready. Streaming is the next missing piece.
* feat: Adding support for customizing chunk context in RAG insertion and querying by @franciscojavierarceo in https://github.com/meta-llama/llama-stack/pull/2134
* feat: scaffolding for Llama Stack UI by @ehhuang in https://github.com/meta-llama/llama-stack/pull/2149, more to come in the coming releases.
---
# v0.2.6
Published on: 2025-05-12T18:06:52Z
---
# v0.2.5
Published on: 2025-05-04T20:16:49Z
---
# v0.2.4
Published on: 2025-04-29T17:26:01Z
## Highlights
* One-liner to install and run Llama Stack yay! by @reluctantfuturist in https://github.com/meta-llama/llama-stack/pull/1383
* support for NVIDIA NeMo datastore by @raspawar in https://github.com/meta-llama/llama-stack/pull/1852
* (yuge!) Kubernetes authentication by @leseb in https://github.com/meta-llama/llama-stack/pull/1778
* (yuge!) OpenAI Responses API by @bbrowning in https://github.com/meta-llama/llama-stack/pull/1989
* add api.llama provider, llama-guard-4 model by @ashwinb in https://github.com/meta-llama/llama-stack/pull/2058
---
# v0.2.3
Published on: 2025-04-25T22:46:21Z
## Highlights
* OpenAI compatible inference endpoints and client-SDK support. `client.chat.completions.create()` now works.
* significant improvements and functionality added to the nVIDIA distribution
* many improvements to the test verification suite.
* new inference providers: Ramalama, IBM WatsonX
* many improvements to the Playground UI
---
# v0.2.2
Published on: 2025-04-13T01:19:49Z
## Main changes
- Bring Your Own Provider (@leseb) - use out-of-tree provider code to execute the distribution server
- OpenAI compatible inference API in progress (@bbrowning)
- Provider verifications (@ehhuang)
- Many updates and fixes to playground
- Several llama4 related fixes
---
# v0.2.1
Published on: 2025-04-05T23:13:00Z
---
# v0.2.0
Published on: 2025-04-05T19:04:29Z
## Llama 4 Support
Checkout more at https://www.llama.com
---
# v0.1.9
Published on: 2025-03-29T00:52:23Z
### Build and Test Agents
* Agents: Entire document context with attachments
* RAG: Documentation with sqlite-vec faiss comparison
* Getting started: Fixes to getting started notebook.
### Agent Evals and Model Customization
* (**New**) Post-training: Add nemo customizer
### Better Engineering
* Moved sqlite-vec to non-blocking calls
* Don't return a payload on file delete
---
# v0.1.8
Published on: 2025-03-24T01:28:50Z
# v0.1.8 Release Notes
### Build and Test Agents
* Safety: Integrated NVIDIA as a safety provider.
* VectorDB: Added Qdrant as an inline provider.
* Agents: Added support for multiple tool groups in agents.
* Agents: Simplified imports for Agents in client package
### Agent Evals and Model Customization
* Introduced DocVQA and IfEval benchmarks.
### Deploying and Monitoring Agents
* Introduced a Containerfile and image workflow for the Playground.
* Implemented support for Bearer (API Key) authentication.
* Added attribute-based access control for resources.
* Fixes on docker deployments: use --pull always and standardized the default port to 8321
* Deprecated: /v1/inspect/providers use /v1/providers/ instead
### Better Engineering
* Consolidated scripts under the ./scripts directory.
* Addressed mypy violations in various modules.
* Added Dependabot scans for Python dependencies.
* Implemented a scheduled workflow to update the changelog automatically.
* Enforced concurrency to reduce CI loads.
### New Contributors
* @cmodi-meta made their first contribution in https://github.com/meta-llama/llama-stack/pull/1650
* @jeffmaury made their first contribution in https://github.com/meta-llama/llama-stack/pull/1671
* @derekhiggins made their first contribution in https://github.com/meta-llama/llama-stack/pull/1698
* @Bobbins228 made their first contribution in https://github.com/meta-llama/llama-stack/pull/1745
**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.7...v0.1.8
---
# v0.1.7
Published on: 2025-03-14T22:30:51Z
## 0.1.7 Release Notes
### Build and Test Agents
* Inference: ImageType is now refactored to LlamaStackImageType
* Inference: Added tests to measure TTFT
* Inference: Bring back usage metrics
* Agents: Added endpoint for get agent, list agents and list sessions
* Agents: Automated conversion of type hints in client tool for lite llm format
* Agents: Deprecated ToolResponseMessage in agent.resume API
* Added Provider API for listing and inspecting provider info
### Agent Evals and Model Customization
* Eval: Added new eval benchmarks Math 500 and BFCL v3
* Deploy and Monitoring of Agents
* Telemetry: Fix tracing to work across coroutines
### Better Engineering
* Display code coverage for unit tests
* Updated call sites (inference, tool calls, agents) to move to async non blocking calls
* Unit tests also run on Python 3.11, 3.12, and 3.13
* Added ollama inference to Integration tests CI
* Improved documentation across examples, testing, CLI, updated providers table
---
# v0.1.6
Published on: 2025-03-08T04:35:08Z
## 0.1.6 Release Notes
### Build and Test Agents
* Inference: Fixed support for inline vllm provider
* (**New**) Agent: Build & Monitor Agent Workflows with Llama Stack + Anthropic's Best Practice [Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Agent_Workflows.ipynb)
* (**New**) Agent: Revamped agent [documentation](https://llama-stack.readthedocs.io/en/latest/building_applications/agent.html) with more details and examples
* Agent: Unify tools and Python SDK Agents API
* Agent: AsyncAgent Python SDK wrapper supporting async client tool calls
* Agent: Support python functions without @client_tool decorator as client tools
* Agent: deprecation for allow_resume_turn flag, and remove need to specify tool_prompt_format
* VectorIO: MilvusDB support added
### Agent Evals and Model Customization
* (**New**) Agent: Llama Stack RAG Lifecycle [Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb)
* Eval: Documentation for eval, scoring, adding new benchmarks
* Eval: Distribution template to run benchmarks on llama & non-llama models
* Eval: Ability to register new custom LLM-as-judge scoring functions
* (**New**) Looking for contributors for open benchmarks. See [documentation](https://llama-stack.readthedocs.io/en/latest/references/evals_reference/index.html#open-benchmark-contributing-guide) for details.
### Deploy and Monitoring of Agents
* Better support for different log levels across all components for better monitoring
### Better Engineering
* Enhance OpenAPI spec to include Error types across all APIs
* Moved all tests to /tests and created unit tests to run on each PR
* Removed all dependencies on llama-models repo
---
# v0.1.5.1
Published on: 2025-02-28T22:37:44Z
## 0.1.5.1 Release Notes
* Fixes for security risk in https://github.com/meta-llama/llama-stack/pull/1327 and https://github.com/meta-llama/llama-stack/pull/1328
**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.5...v0.1.5.1
---
# v0.1.5
Published on: 2025-02-28T18:14:01Z
## 0.1.5 Release Notes
### Build Agents
* Inference: Support more non-llama models (openai, anthropic, gemini)
* Inference: Can use the provider's model name in addition to the HF alias
* Inference: Fixed issues with calling tools that weren't specified in the prompt
* RAG: Improved system prompt for RAG and no more need for hard-coded rag-tool calling
* Embeddings: Added support for Nemo retriever embedding models
* Tools: Added support for MCP tools in Ollama Distribution
* Distributions: Added new Groq distribution
### Customize Models
* Save post-trained checkpoint in SafeTensor format to allow Ollama inference provider to use the post-trained model
### Monitor agents
* More comprehensive logging of agent steps including client tools
* Telemetry inputs/outputs are now structured and queryable
* Ability to retrieve agents session, turn, step by ids
### Better Engineering
* Moved executorch Swift code out of this repo into the llama-stack-client-swift repo, similar to kotlin
* Move most logging to use logger instead of prints
* Completed text /chat-completion and /completion tests
---
# v0.1.4
Published on: 2025-02-25T00:02:43Z
## v0.1.4 Release Notes
Here are the key changes coming as part of this release:
### Build and Test Agents
* Inference: Added support for non-llama models
* Inference: Added option to list all downloaded models and remove models
* Agent: Introduce new api agents.resume_turn to include client side tool execution in the same turn
* Agent: AgentConfig introduces new variable “tool_config” that allows for better tool configuration and system prompt overrides
* Agent: Added logging for agent step start and completion times
* Agent: Added support for logging for tool execution metadata
* Embedding: Updated /inference/embeddings to support asymmetric models, truncation and variable sized outputs
* Embedding: Updated embedding models for Ollama, Together, and Fireworks with available defaults
* VectorIO: Improved performance of sqlite-vec using chunked writes
### Agent Evals and Model Customization
* Deprecated api /eval-tasks. Use /eval/benchmark instead
* Added CPU training support for TorchTune
### Deploy and Monitoring of Agents
* Consistent view of client and server tool calls in telemetry
### Better Engineering
* Made tests more data-driven for consistent evaluation
* Fixed documentation links and improved API reference generation
* Various small fixes for build scripts and system reliability
---
# v0.1.3
Published on: 2025-02-14T20:24:32Z
## v0.1.3 Release
Here are some key changes that are coming as part of this release.
### Build and Test Agents
Streamlined the initial development experience
- Added support for llama stack run --image-type venv
- Enhanced vector store options with new sqlite-vec provider and improved Qdrant integration
- vLLM improvements for tool calling and logprobs
- Better handling of sporadic code_interpreter tool calls
### Agent Evals
Better benchmarking and Agent performance assessment
- Renamed eval API /eval-task to /benchmarks
- Improved documentation and notebooks for RAG and evals
### Deploy and Monitoring of Agents
Improved production readiness
- Added usage metrics collection for chat completions
- CLI improvements for provider information
- Improved error handling and system reliability
- Better model endpoint handling and accessibility
- Improved signal handling on distro server
### Better Engineering
Infrastructure and code quality improvements
- Faster text-based chat completion tests
- Improved testing for non-streaming agent apis
- Standardized import formatting with ruff linter
- Added conventional commits standard
- Fixed documentation parsing issues
---
# v0.1.2
Published on: 2025-02-07T22:06:49Z
# TL;DR
- Several stabilizations to development flows after the switch to `uv`
- Migrated CI workflows to new OSS repo - [llama-stack-ops](https://github.com/meta-llama/llama-stack-ops)
- Added automated rebuilds for ReadTheDocs
- Llama Stack server supports HTTPS
- Added system prompt overrides support
- Several bug fixes and improvements to documentation (check out Kubernetes deployment guide by @terrytangyuan )
---
# v0.1.1
Published on: 2025-02-02T02:29:24Z
A bunch of small / big improvements everywhere including support for Windows, switching to `uv` and many provider improvements.
---
# v0.1.0
Published on: 2025-01-24T17:47:47Z
We are excited to announce a stable API release of Llama Stack, which enables developers to build RAG applications and Agents using tools and safety shields, monitor and those agents with telemetry, and evaluate the agent with scoring functions.
## Context
GenAI application developers need more than just an LLM - they need to integrate tools, connect with their data sources, establish guardrails, and ground the LLM responses effectively. Currently, developers must piece together various tools and APIs, complicating the development lifecycle and increasing costs. The result is that developers are spending more time on these integrations rather than focusing on the application logic itself. The bespoke coupling of components also makes it challenging to adopt state-of-the-art solutions in the rapidly evolving GenAI space. This is particularly difficult for open models like Llama, as best practices are not widely established in the open.
Llama Stack was created to provide developers with a comprehensive and coherent interface that simplifies AI application development and codifies best practices across the Llama ecosystem. Since our launch in September 2024, we have seen a huge uptick in interest in Llama Stack APIs from both AI developers and partners building AI services with Llama models. Partners like Nvidia, Fireworks, and Ollama have collaborated with us to develop implementations across various APIs, including inference, memory, and safety.
With Llama Stack, you can easily build a RAG agent which can also search the web, do complex math, and perform custom tool calling. You can use telemetry to inspect those traces, and convert telemetry into evals datasets. And with Llama Stack's plugin architecture and prepackaged distributions, you can choose to run your agent anywhere - in the cloud with our partners, deploy your own environment using virtualenv or Docker, operate locally with Ollama, or even run on mobile devices with our SDKs. Llama Stack offers unprecedented flexibility while also simplifying the developer experience.
## Release
After iterating on the APIs for the last 3 months, today we're launching a stable release (V1) of the Llama Stack APIs and the corresponding llama-stack server and client packages (v0.1.0). We now have automated tests for providers. These tests make sure that all provider implementations are verified. Developers can now easily and reliably select distributions or providers based on their specific requirements.
There are example standalone apps in llama-stack-apps.
## Key Features of this release
- **Unified API Layer**
- Inference: Run LLM models
- RAG: Store and retrieve knowledge for RAG
- Agents: Build multi-step agentic workflows
- Tools: Register tools that can be called by the agent
- Safety: Apply content filtering and safety policies
- Evaluation: Test model and agent quality
- Telemetry: Collect and analyze usage data and complex agentic traces
- Post Training ( Coming Soon ): Fine tune models for specific use cases
- **Rich Provider Ecosystem**
- Local Development: Meta's Reference, Ollama
- Cloud: Fireworks, Together, Nvidia, AWS Bedrock, Groq, Cerebras
- On-premises: Nvidia NIM, vLLM, TGI, Dell-TGI
- On-device: iOS and Android support
- **Built for Production**
- Pre-packaged distributions for common deployment scenarios
- Backwards compatibility across model versions
- Comprehensive evaluation capabilities
- Full observability and monitoring
- **Multiple developer interfaces**
- CLI: Command line interface
- Python SDK
- Swift iOS SDK
- Kotlin Android SDK
- **Sample llama stack applications**
- Python
- iOS
- Android
---
# v0.1.0rc12
Published on: 2025-01-22T22:24:01Z
---
# v0.0.63
Published on: 2024-12-18T07:17:43Z
A small but important bug-fix release to update the URL datatype for the client-SDKs. The issue affected multimodal agentic turns especially.
**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.0.62...v0.0.63
---

View file

@ -11,14 +11,17 @@ You can install the dependencies by running:
```bash
cd llama-stack
uv venv --python 3.12
uv sync --group dev
uv pip install -e .
source .venv/bin/activate
```
```{note}
You can use a specific version of Python with `uv` by adding the `--python <version>` flag (e.g. `--python 3.12`).
Otherwise, `uv` will automatically select a Python version according to the `requires-python` section of the `pyproject.toml`.
If you are making changes to Llama Stack, it is essential that you use Python 3.12 as shown above.
Llama Stack can work with Python 3.13 but the pre-commit hooks used to validate code changes only work with Python 3.12.
If you don't specify a Python version, `uv` will automatically select a Python version according to the `requires-python`
section of the `pyproject.toml`, which is fine for running Llama Stack but not for committing changes.
For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/).
```
@ -42,15 +45,32 @@ uv run --env-file .env -- pytest -v tests/integration/inference/test_text_infere
We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running:
```bash
uv pip install pre-commit==4.3.0
uv run pre-commit install
```
After that, pre-commit hooks will run automatically before each commit.
Note that the only version of pre-commit that works with the Llama Stack continuous integration is `4.3.0`, so it is essential that you pull
that specific version as shown above. Once you have run these commands, pre-commit hooks will run automatically before each commit.
Alternatively, if you don't want to install the pre-commit hooks, you can run the checks manually by running:
Alternatively, if you don't want to install the pre-commit hooks (or if you want to check if your changes are ready before committing),
you can run the checks manually by running:
```bash
uv run pre-commit run --all-files
uv run pre-commit run --all-files -v
```
The `-v` (verbose) parameter is optional but often helpful for getting more information about any issues that the pre-commit checks identify.
To run the expanded mypy configuration that CI enforces, use:
```bash
uv run pre-commit run mypy-full --hook-stage manual --all-files
```
or invoke mypy directly with all optional dependencies:
```bash
uv run --group dev --group type_checking mypy
```
```{caution}
@ -83,6 +103,7 @@ If you are new to the project, start by looking at the issues tagged with "good
leave a comment on the issue and a triager will assign it to you.
Please avoid picking up too many issues at once. This helps you stay focused and ensures that others in the community also have opportunities to contribute.
- Try to work on only 1-2 issues at a time, especially if you're still getting familiar with the codebase.
- Before taking an issue, check if it's already assigned or being actively discussed.
- If you're blocked or can't continue with an issue, feel free to unassign yourself or leave a comment so others can step in.
@ -158,9 +179,9 @@ under the LICENSE file in the root directory of this source tree.
Some tips about common tasks you work on while contributing to Llama Stack:
### Using `llama stack build`
### Installing dependencies of distributions
Building a stack image will use the production version of the `llama-stack` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_STACK_CLIENT_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands.
When installing dependencies for a distribution, you can use `llama stack list-deps` to view and install the required packages.
Example:
```bash
@ -168,7 +189,12 @@ cd work/
git clone https://github.com/llamastack/llama-stack.git
git clone https://github.com/llamastack/llama-stack-client-python.git
cd llama-stack
LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama stack build --distro <...>
# Show dependencies for a distribution
llama stack list-deps <distro-name>
# Install dependencies
llama stack list-deps <distro-name> | xargs -L1 uv pip install
```
### Updating distribution configurations
@ -191,6 +217,7 @@ If you are making changes to the documentation at [https://llamastack.github.io/
```bash
# This rebuilds the documentation pages and the OpenAPI spec.
cd docs/
npm install
npm run gen-api-docs all
npm run build
@ -204,7 +231,7 @@ npm run serve
If you modify or add new API endpoints, update the API documentation accordingly. You can do this by running the following command:
```bash
uv run ./docs/openapi_generator/run_openapi_generator.sh
uv run ./scripts/run_openapi_generator.sh
```
The generated API schema will be available in `docs/static/`. Make sure to review the changes before committing.

View file

@ -1,11 +1,11 @@
include pyproject.toml
include llama_stack/models/llama/llama3/tokenizer.model
include llama_stack/models/llama/llama4/tokenizer.model
include llama_stack/core/*.sh
include llama_stack/cli/scripts/*.sh
include llama_stack/distributions/*/*.yaml
exclude llama_stack/distributions/ci-tests
include src/llama_stack/models/llama/llama3/tokenizer.model
include src/llama_stack/models/llama/llama4/tokenizer.model
include src/llama_stack/core/*.sh
include src/llama_stack/cli/scripts/*.sh
include src/llama_stack/distributions/*/*.yaml
exclude src/llama_stack/distributions/ci-tests
include tests/integration/test_cases/inference/*.json
include llama_stack/models/llama/*/*.md
include llama_stack/tests/integration/*.jpg
prune llama_stack/distributions/ci-tests
include src/llama_stack/models/llama/*/*.md
include src/llama_stack/tests/integration/*.jpg
prune src/llama_stack/distributions/ci-tests

134
README.md
View file

@ -10,93 +10,19 @@
[**Quick Start**](https://llamastack.github.io/docs/getting_started/quickstart) | [**Documentation**](https://llamastack.github.io/docs) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)
### ✨🎉 Llama 4 Support 🎉✨
We released [Version 0.2.0](https://github.com/meta-llama/llama-stack/releases/tag/v0.2.0) with support for the Llama 4 herd of models released by Meta.
<details>
<summary>👋 Click here to see how to run Llama 4 models on Llama Stack </summary>
\
*Note you need 8xH100 GPU-host to run these models*
```bash
pip install -U llama_stack
MODEL="Llama-4-Scout-17B-16E-Instruct"
# get meta url from llama.com
llama model download --source meta --model-id $MODEL --meta-url <META_URL>
# start a llama stack server
INFERENCE_MODEL=meta-llama/$MODEL llama stack build --run --template meta-reference-gpu
# install client to interact with the server
pip install llama-stack-client
```
### CLI
```bash
# Run a chat completion
MODEL="Llama-4-Scout-17B-16E-Instruct"
llama-stack-client --endpoint http://localhost:8321 \
inference chat-completion \
--model-id meta-llama/$MODEL \
--message "write a haiku for meta's llama 4 models"
OpenAIChatCompletion(
...
choices=[
OpenAIChatCompletionChoice(
finish_reason='stop',
index=0,
message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(
role='assistant',
content='...**Silent minds awaken,** \n**Whispers of billions of words,** \n**Reasoning breaks the night.** \n\n— \n*This haiku blends the essence of LLaMA 4\'s capabilities with nature-inspired metaphor, evoking its vast training data and transformative potential.*',
...
),
...
)
],
...
)
```
### Python SDK
```python
from llama_stack_client import LlamaStackClient
client = LlamaStackClient(base_url=f"http://localhost:8321")
model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
prompt = "Write a haiku about coding"
print(f"User> {prompt}")
response = client.chat.completions.create(
model=model_id,
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": prompt},
],
)
print(f"Assistant> {response.choices[0].message.content}")
```
As more providers start supporting Llama 4, you can use them in Llama Stack as well. We are adding to the list. Stay tuned!
</details>
### 🚀 One-Line Installer 🚀
To try Llama Stack locally, run:
```bash
curl -LsSf https://github.com/meta-llama/llama-stack/raw/main/scripts/install.sh | bash
curl -LsSf https://github.com/llamastack/llama-stack/raw/main/scripts/install.sh | bash
```
### Overview
Llama Stack standardizes the core building blocks that simplify AI application development. It codifies best practices across the Llama ecosystem. More specifically, it provides
- **Unified API layer** for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.
- **Unified API layer** for Inference, RAG, Agents, Tools, Safety, Evals.
- **Plugin architecture** to support the rich ecosystem of different API implementations in various environments, including local development, on-premises, cloud, and mobile.
- **Prepackaged verified distributions** which offer a one-stop solution for developers to get started quickly and reliably in any environment.
- **Multiple developer interfaces** like CLI and SDKs for Python, Typescript, iOS, and Android.
@ -122,34 +48,34 @@ By reducing friction and complexity, Llama Stack empowers developers to focus on
Here is a list of the various API providers and available distributions that can help developers get started easily with Llama Stack.
Please checkout for [full list](https://llamastack.github.io/docs/providers)
| API Provider Builder | Environments | Agents | Inference | VectorIO | Safety | Telemetry | Post Training | Eval | DatasetIO |
|:--------------------:|:------------:|:------:|:---------:|:--------:|:------:|:---------:|:-------------:|:----:|:--------:|
| Meta Reference | Single Node | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| SambaNova | Hosted | | ✅ | | ✅ | | | | |
| Cerebras | Hosted | | ✅ | | | | | | |
| Fireworks | Hosted | ✅ | ✅ | ✅ | | | | | |
| AWS Bedrock | Hosted | | ✅ | | ✅ | | | | |
| Together | Hosted | ✅ | ✅ | | ✅ | | | | |
| Groq | Hosted | | ✅ | | | | | | |
| Ollama | Single Node | | ✅ | | | | | | |
| TGI | Hosted/Single Node | | ✅ | | | | | | |
| NVIDIA NIM | Hosted/Single Node | | ✅ | | ✅ | | | | |
| ChromaDB | Hosted/Single Node | | | ✅ | | | | | |
| Milvus | Hosted/Single Node | | | ✅ | | | | | |
| Qdrant | Hosted/Single Node | | | ✅ | | | | | |
| Weaviate | Hosted/Single Node | | | ✅ | | | | | |
| SQLite-vec | Single Node | | | ✅ | | | | | |
| PG Vector | Single Node | | | ✅ | | | | | |
| PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | | | | |
| vLLM | Single Node | | ✅ | | | | | | |
| OpenAI | Hosted | | ✅ | | | | | | |
| Anthropic | Hosted | | ✅ | | | | | | |
| Gemini | Hosted | | ✅ | | | | | | |
| WatsonX | Hosted | | ✅ | | | | | | |
| HuggingFace | Single Node | | | | | | ✅ | | ✅ |
| TorchTune | Single Node | | | | | | ✅ | | |
| NVIDIA NEMO | Hosted | | ✅ | ✅ | | | ✅ | ✅ | ✅ |
| NVIDIA | Hosted | | | | | | ✅ | ✅ | ✅ |
| API Provider Builder | Environments | Agents | Inference | VectorIO | Safety | Post Training | Eval | DatasetIO |
|:--------------------:|:------------:|:------:|:---------:|:--------:|:------:|:-------------:|:----:|:--------:|
| Meta Reference | Single Node | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| SambaNova | Hosted | | ✅ | | ✅ | | | |
| Cerebras | Hosted | | ✅ | | | | | |
| Fireworks | Hosted | ✅ | ✅ | ✅ | | | | |
| AWS Bedrock | Hosted | | ✅ | | ✅ | | | |
| Together | Hosted | ✅ | ✅ | | ✅ | | | |
| Groq | Hosted | | ✅ | | | | | |
| Ollama | Single Node | | ✅ | | | | | |
| TGI | Hosted/Single Node | | ✅ | | | | | |
| NVIDIA NIM | Hosted/Single Node | | ✅ | | ✅ | | | |
| ChromaDB | Hosted/Single Node | | | ✅ | | | | |
| Milvus | Hosted/Single Node | | | ✅ | | | | |
| Qdrant | Hosted/Single Node | | | ✅ | | | | |
| Weaviate | Hosted/Single Node | | | ✅ | | | | |
| SQLite-vec | Single Node | | | ✅ | | | | |
| PG Vector | Single Node | | | ✅ | | | | |
| PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | | | |
| vLLM | Single Node | | ✅ | | | | | |
| OpenAI | Hosted | | ✅ | | | | | |
| Anthropic | Hosted | | ✅ | | | | | |
| Gemini | Hosted | | ✅ | | | | | |
| WatsonX | Hosted | | ✅ | | | | | |
| HuggingFace | Single Node | | | | | ✅ | | ✅ |
| TorchTune | Single Node | | | | | ✅ | | |
| NVIDIA NEMO | Hosted | | ✅ | ✅ | | ✅ | ✅ | ✅ |
| NVIDIA | Hosted | | | | | ✅ | ✅ | ✅ |
> **Note**: Additional providers are available through external packages. See [External Providers](https://llamastack.github.io/docs/providers/external) documentation.

View file

@ -44,14 +44,6 @@ data:
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
files:
- provider_id: meta-reference-files
provider_type: inline::localfs
config:
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
metadata_store:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
@ -98,25 +90,42 @@ data:
- provider_id: model-context-protocol
provider_type: remote::model-context-protocol
config: {}
metadata_store:
type: postgres
storage:
backends:
kv_default:
type: kv_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
table_name: llamastack_kvstore
inference_store:
type: postgres
table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
sql_default:
type: sql_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
stores:
metadata:
backend: kv_default
namespace: registry
inference:
backend: sql_default
table_name: inference_store
max_write_queue_size: 10000
num_writers: 4
conversations:
backend: sql_default
table_name: openai_conversations
prompts:
backend: kv_default
namespace: prompts
models:
- metadata:
embedding_dimension: 384
model_id: all-MiniLM-L6-v2
embedding_dimension: 768
model_id: nomic-embed-text-v1.5
provider_id: sentence-transformers
model_type: embedding
- model_id: ${env.INFERENCE_MODEL}
@ -137,5 +146,4 @@ data:
port: 8323
kind: ConfigMap
metadata:
creationTimestamp: null
name: llama-stack-config

View file

@ -6,7 +6,6 @@ apis:
- inference
- files
- safety
- telemetry
- tool_runtime
- vector_io
providers:
@ -27,28 +26,16 @@ providers:
config:
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
metadata_store:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
table_name: files_metadata
backend: sql_default
vector_io:
- provider_id: ${env.ENABLE_CHROMADB:+chromadb}
provider_type: remote::chromadb
config:
url: ${env.CHROMADB_URL:=}
kvstore:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
files:
- provider_id: meta-reference-files
provider_type: inline::localfs
config:
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
metadata_store:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
persistence:
namespace: vector_io::chroma_remote
backend: kv_default
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
@ -58,26 +45,15 @@ providers:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
responses_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
sinks: ${env.TELEMETRY_SINKS:=console}
persistence:
agent_state:
namespace: agents
backend: kv_default
responses:
table_name: responses
backend: sql_default
max_write_queue_size: 10000
num_writers: 4
tool_runtime:
- provider_id: brave-search
provider_type: remote::brave-search
@ -95,40 +71,65 @@ providers:
- provider_id: model-context-protocol
provider_type: remote::model-context-protocol
config: {}
metadata_store:
type: postgres
storage:
backends:
kv_default:
type: kv_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
table_name: llamastack_kvstore
inference_store:
type: postgres
table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
sql_default:
type: sql_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
models:
- metadata:
embedding_dimension: 384
model_id: all-MiniLM-L6-v2
stores:
metadata:
namespace: registry
backend: kv_default
inference:
table_name: inference_store
backend: sql_default
max_write_queue_size: 10000
num_writers: 4
conversations:
table_name: openai_conversations
backend: sql_default
prompts:
namespace: prompts
backend: kv_default
registered_resources:
models:
- metadata:
embedding_dimension: 768
model_id: nomic-embed-text-v1.5
provider_id: sentence-transformers
model_type: embedding
- model_id: ${env.INFERENCE_MODEL}
- model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
model_type: llm
shields:
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
shields:
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
- toolgroup_id: builtin::rag
- toolgroup_id: builtin::rag
provider_id: rag-runtime
server:
port: 8323
telemetry:
enabled: true
vector_stores:
default_provider_id: chromadb
default_embedding_model:
provider_id: sentence-transformers
model_id: nomic-ai/nomic-embed-text-v1.5

View file

@ -0,0 +1,11 @@
These are the source-of-truth configuration files that Stainless uses to generate the client SDKs.
- `openapi.yml`: this is the OpenAPI specification for the Llama Stack API.
- `config.yml`: this is the Stainless _configuration_ which instructs Stainless how to generate the client SDKs.
A small side note: notice the `.yml` suffixes; Stainless typically uses that suffix for its configuration files.
These files go hand-in-hand. Both `openapi.yml` and `config.yml` are generated by `scripts/run_openapi_generator.sh`:
- `openapi.yml` comes from the FastAPI-based generator.
- `config.yml` is rendered from `scripts/openapi_generator/stainless_config/config_data.py` so the Stainless config stays in lock-step with the spec.

View file

@ -0,0 +1,490 @@
# yaml-language-server: $schema=https://app.stainlessapi.com/config-internal.schema.json
organization:
name: llama-stack-client
docs: https://llama-stack.readthedocs.io/en/latest/
contact: llamastack@meta.com
security:
- {}
- BearerAuth: []
security_schemes:
BearerAuth:
type: http
scheme: bearer
targets:
node:
package_name: llama-stack-client
production_repo: llamastack/llama-stack-client-typescript
publish:
npm: false
python:
package_name: llama_stack_client
production_repo: llamastack/llama-stack-client-python
options:
use_uv: true
publish:
pypi: true
project_name: llama_stack_client
kotlin:
reverse_domain: com.llama_stack_client.api
production_repo: null
publish:
maven: false
go:
package_name: llama-stack-client
production_repo: llamastack/llama-stack-client-go
options:
enable_v2: true
back_compat_use_shared_package: false
client_settings:
default_env_prefix: LLAMA_STACK_CLIENT
opts:
api_key:
type: string
read_env: LLAMA_STACK_CLIENT_API_KEY
auth:
security_scheme: BearerAuth
nullable: true
environments:
production: http://any-hosted-llama-stack.com
pagination:
- name: datasets_iterrows
type: offset
request:
dataset_id:
type: string
start_index:
type: integer
x-stainless-pagination-property:
purpose: offset_count_param
limit:
type: integer
response:
data:
type: array
items:
type: object
next_index:
type: integer
x-stainless-pagination-property:
purpose: offset_count_start_field
- name: openai_cursor_page
type: cursor
request:
limit:
type: integer
after:
type: string
x-stainless-pagination-property:
purpose: next_cursor_param
response:
data:
type: array
items: {}
has_more:
type: boolean
last_id:
type: string
x-stainless-pagination-property:
purpose: next_cursor_field
settings:
license: MIT
unwrap_response_fields:
- data
file_header: 'Copyright (c) Meta Platforms, Inc. and affiliates.
All rights reserved.
This source code is licensed under the terms described in the LICENSE file in
the root directory of this source tree.
'
openapi:
transformations:
- command: mergeObject
reason: Better return_type using enum
args:
target:
- $.components.schemas
object:
ReturnType:
additionalProperties: false
properties:
type:
enum:
- string
- number
- boolean
- array
- object
- json
- union
- chat_completion_input
- completion_input
- agent_turn_input
required:
- type
type: object
- command: replaceProperties
reason: Replace return type properties with better model (see above)
args:
filter:
only:
- $.components.schemas.ScoringFn.properties.return_type
- $.components.schemas.RegisterScoringFunctionRequest.properties.return_type
value:
$ref: '#/components/schemas/ReturnType'
- command: oneOfToAnyOf
reason: Prism (mock server) doesn't like one of our requests as it technically
matches multiple variants
readme:
example_requests:
default:
type: request
endpoint: post /v1/chat/completions
params: {}
headline:
type: request
endpoint: get /v1/models
params: {}
pagination:
type: request
endpoint: post /v1/chat/completions
params: {}
resources:
$shared:
models:
interleaved_content_item: InterleavedContentItem
interleaved_content: InterleavedContent
param_type: ParamType
safety_violation: SafetyViolation
sampling_params: SamplingParams
scoring_result: ScoringResult
system_message: SystemMessage
toolgroups:
models:
tool_group: ToolGroup
list_tool_groups_response: ListToolGroupsResponse
methods:
register: post /v1/toolgroups
get: get /v1/toolgroups/{toolgroup_id}
list: get /v1/toolgroups
unregister: delete /v1/toolgroups/{toolgroup_id}
tools:
methods:
get: get /v1/tools/{tool_name}
list:
paginated: false
endpoint: get /v1/tools
tool_runtime:
models:
tool_def: ToolDef
tool_invocation_result: ToolInvocationResult
methods:
list_tools:
paginated: false
endpoint: get /v1/tool-runtime/list-tools
invoke_tool: post /v1/tool-runtime/invoke
responses:
models:
response_object_stream: OpenAIResponseObjectStream
response_object: OpenAIResponseObject
methods:
create:
type: http
streaming:
stream_event_model: responses.response_object_stream
param_discriminator: stream
endpoint: post /v1/responses
retrieve: get /v1/responses/{response_id}
list:
type: http
endpoint: get /v1/responses
delete:
type: http
endpoint: delete /v1/responses/{response_id}
subresources:
input_items:
methods:
list:
type: http
paginated: false
endpoint: get /v1/responses/{response_id}/input_items
prompts:
models:
prompt: Prompt
list_prompts_response: ListPromptsResponse
methods:
create: post /v1/prompts
list:
paginated: false
endpoint: get /v1/prompts
retrieve: get /v1/prompts/{prompt_id}
update: post /v1/prompts/{prompt_id}
delete: delete /v1/prompts/{prompt_id}
set_default_version: post /v1/prompts/{prompt_id}/set-default-version
subresources:
versions:
methods:
list:
paginated: false
endpoint: get /v1/prompts/{prompt_id}/versions
conversations:
models:
conversation_object: Conversation
methods:
create:
type: http
endpoint: post /v1/conversations
retrieve: get /v1/conversations/{conversation_id}
update:
type: http
endpoint: post /v1/conversations/{conversation_id}
delete:
type: http
endpoint: delete /v1/conversations/{conversation_id}
subresources:
items:
methods:
get:
type: http
endpoint: get /v1/conversations/{conversation_id}/items/{item_id}
list:
type: http
endpoint: get /v1/conversations/{conversation_id}/items
create:
type: http
endpoint: post /v1/conversations/{conversation_id}/items
delete:
type: http
endpoint: delete /v1/conversations/{conversation_id}/items/{item_id}
inspect:
models:
healthInfo: HealthInfo
providerInfo: ProviderInfo
routeInfo: RouteInfo
versionInfo: VersionInfo
methods:
health: get /v1/health
version: get /v1/version
embeddings:
models:
create_embeddings_response: OpenAIEmbeddingsResponse
methods:
create: post /v1/embeddings
chat:
models:
chat_completion_chunk: OpenAIChatCompletionChunk
subresources:
completions:
methods:
create:
type: http
streaming:
stream_event_model: chat.chat_completion_chunk
param_discriminator: stream
endpoint: post /v1/chat/completions
list:
type: http
paginated: false
endpoint: get /v1/chat/completions
retrieve:
type: http
endpoint: get /v1/chat/completions/{completion_id}
completions:
methods:
create:
type: http
streaming:
param_discriminator: stream
endpoint: post /v1/completions
vector_io:
models:
queryChunksResponse: QueryChunksResponse
methods:
insert: post /v1/vector-io/insert
query: post /v1/vector-io/query
vector_stores:
models:
vector_store: VectorStoreObject
list_vector_stores_response: VectorStoreListResponse
vector_store_delete_response: VectorStoreDeleteResponse
vector_store_search_response: VectorStoreSearchResponsePage
methods:
create: post /v1/vector_stores
list: get /v1/vector_stores
retrieve: get /v1/vector_stores/{vector_store_id}
update: post /v1/vector_stores/{vector_store_id}
delete: delete /v1/vector_stores/{vector_store_id}
search: post /v1/vector_stores/{vector_store_id}/search
subresources:
files:
models:
vector_store_file: VectorStoreFileObject
methods:
list: get /v1/vector_stores/{vector_store_id}/files
retrieve: get /v1/vector_stores/{vector_store_id}/files/{file_id}
update: post /v1/vector_stores/{vector_store_id}/files/{file_id}
delete: delete /v1/vector_stores/{vector_store_id}/files/{file_id}
create: post /v1/vector_stores/{vector_store_id}/files
content: get /v1/vector_stores/{vector_store_id}/files/{file_id}/content
file_batches:
models:
vector_store_file_batches: VectorStoreFileBatchObject
list_vector_store_files_in_batch_response: VectorStoreFilesListInBatchResponse
methods:
create: post /v1/vector_stores/{vector_store_id}/file_batches
retrieve: get /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}
list_files: get /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/files
cancel: post /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/cancel
models:
models:
model: OpenAIModel
list_models_response: OpenAIListModelsResponse
methods:
list:
paginated: false
endpoint: get /v1/models
retrieve: get /v1/models/{model_id}
register: post /v1/models
unregister: delete /v1/models/{model_id}
subresources:
openai:
methods:
list:
paginated: false
endpoint: get /v1/models
providers:
models:
list_providers_response: ListProvidersResponse
methods:
list:
paginated: false
endpoint: get /v1/providers
retrieve: get /v1/providers/{provider_id}
routes:
models:
list_routes_response: ListRoutesResponse
methods:
list:
paginated: false
endpoint: get /v1/inspect/routes
moderations:
models:
create_response: ModerationObject
methods:
create: post /v1/moderations
safety:
models:
run_shield_response: RunShieldResponse
methods:
run_shield: post /v1/safety/run-shield
shields:
models:
shield: Shield
list_shields_response: ListShieldsResponse
methods:
retrieve: get /v1/shields/{identifier}
list:
paginated: false
endpoint: get /v1/shields
register: post /v1/shields
delete: delete /v1/shields/{identifier}
scoring:
methods:
score: post /v1/scoring/score
score_batch: post /v1/scoring/score-batch
scoring_functions:
models:
scoring_fn: ScoringFn
scoring_fn_params: ScoringFnParams
list_scoring_functions_response: ListScoringFunctionsResponse
methods:
retrieve: get /v1/scoring-functions/{scoring_fn_id}
list:
paginated: false
endpoint: get /v1/scoring-functions
register: post /v1/scoring-functions
unregister: delete /v1/scoring-functions/{scoring_fn_id}
files:
models:
file: OpenAIFileObject
list_files_response: ListOpenAIFileResponse
delete_file_response: OpenAIFileDeleteResponse
methods:
create: post /v1/files
list: get /v1/files
retrieve: get /v1/files/{file_id}
delete: delete /v1/files/{file_id}
content: get /v1/files/{file_id}/content
batches:
methods:
create: post /v1/batches
list: get /v1/batches
retrieve: get /v1/batches/{batch_id}
cancel: post /v1/batches/{batch_id}/cancel
alpha:
subresources:
inference:
methods:
rerank: post /v1alpha/inference/rerank
post_training:
models:
algorithm_config: AlgorithmConfig
post_training_job: PostTrainingJob
list_post_training_jobs_response: ListPostTrainingJobsResponse
methods:
preference_optimize: post /v1alpha/post-training/preference-optimize
supervised_fine_tune: post /v1alpha/post-training/supervised-fine-tune
subresources:
job:
methods:
artifacts: get /v1alpha/post-training/job/artifacts
cancel: post /v1alpha/post-training/job/cancel
status: get /v1alpha/post-training/job/status
list:
paginated: false
endpoint: get /v1alpha/post-training/jobs
benchmarks:
models:
benchmark: Benchmark
list_benchmarks_response: ListBenchmarksResponse
methods:
retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}
list:
paginated: false
endpoint: get /v1alpha/eval/benchmarks
register: post /v1alpha/eval/benchmarks
unregister: delete /v1alpha/eval/benchmarks/{benchmark_id}
eval:
models:
evaluate_response: EvaluateResponse
benchmark_config: BenchmarkConfig
job: Job
methods:
evaluate_rows: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations
run_eval: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs
evaluate_rows_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations
run_eval_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs
subresources:
jobs:
methods:
cancel: delete /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}
status: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}
retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result
beta:
subresources:
datasets:
models:
list_datasets_response: ListDatasetsResponse
methods:
register: post /v1beta/datasets
retrieve: get /v1beta/datasets/{dataset_id}
list:
paginated: false
endpoint: get /v1beta/datasets
unregister: delete /v1beta/datasets/{dataset_id}
iterrows: get /v1beta/datasetio/iterrows/{dataset_id}
appendrows: post /v1beta/datasetio/append-rows/{dataset_id}

File diff suppressed because it is too large Load diff

152
containers/Containerfile Normal file
View file

@ -0,0 +1,152 @@
# syntax=docker/dockerfile:1.6
#
# This Dockerfile is used to build the Llama Stack container image.
# Example:
# docker build \
# -f containers/Containerfile \
# --build-arg DISTRO_NAME=starter \
# --tag llama-stack:starter .
ARG BASE_IMAGE=python:3.12-slim
FROM ${BASE_IMAGE}
ARG INSTALL_MODE="pypi"
ARG LLAMA_STACK_DIR="/workspace"
ARG LLAMA_STACK_CLIENT_DIR=""
ARG PYPI_VERSION=""
ARG TEST_PYPI_VERSION=""
ARG KEEP_WORKSPACE=""
ARG DISTRO_NAME="starter"
ARG RUN_CONFIG_PATH=""
ARG UV_HTTP_TIMEOUT=500
ARG UV_EXTRA_INDEX_URL=""
ARG UV_INDEX_STRATEGY=""
ENV UV_HTTP_TIMEOUT=${UV_HTTP_TIMEOUT}
ENV PYTHONDONTWRITEBYTECODE=1
ENV PIP_DISABLE_PIP_VERSION_CHECK=1
WORKDIR /app
RUN set -eux; \
if command -v dnf >/dev/null 2>&1; then \
dnf -y update && \
dnf install -y iputils git net-tools wget \
vim-minimal python3.12 python3.12-pip python3.12-wheel \
python3.12-setuptools python3.12-devel gcc gcc-c++ make && \
ln -sf /usr/bin/pip3.12 /usr/local/bin/pip && \
ln -sf /usr/bin/python3.12 /usr/local/bin/python && \
dnf clean all; \
elif command -v apt-get >/dev/null 2>&1; then \
apt-get update && \
apt-get install -y --no-install-recommends \
iputils-ping net-tools iproute2 dnsutils telnet \
curl wget git procps psmisc lsof traceroute bubblewrap \
gcc g++ && \
rm -rf /var/lib/apt/lists/*; \
else \
echo "Unsupported base image: expected dnf or apt-get" >&2; \
exit 1; \
fi
RUN pip install --no-cache uv
ENV UV_SYSTEM_PYTHON=1
ENV INSTALL_MODE=${INSTALL_MODE}
ENV LLAMA_STACK_DIR=${LLAMA_STACK_DIR}
ENV LLAMA_STACK_CLIENT_DIR=${LLAMA_STACK_CLIENT_DIR}
ENV PYPI_VERSION=${PYPI_VERSION}
ENV TEST_PYPI_VERSION=${TEST_PYPI_VERSION}
ENV KEEP_WORKSPACE=${KEEP_WORKSPACE}
ENV DISTRO_NAME=${DISTRO_NAME}
ENV RUN_CONFIG_PATH=${RUN_CONFIG_PATH}
# Copy the repository so editable installs and run configurations are available.
COPY . /workspace
# Install the client package if it is provided
# NOTE: this is installed before llama-stack since llama-stack depends on llama-stack-client-python
# Unset UV index env vars to ensure we only use PyPI for the client
RUN set -eux; \
unset UV_EXTRA_INDEX_URL UV_INDEX_STRATEGY; \
if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then \
if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ]; then \
echo "LLAMA_STACK_CLIENT_DIR is set but $LLAMA_STACK_CLIENT_DIR does not exist" >&2; \
exit 1; \
fi; \
uv pip install --no-cache -e "$LLAMA_STACK_CLIENT_DIR"; \
fi;
# Install llama-stack
# Use UV_EXTRA_INDEX_URL inline only for editable install with RC dependencies
RUN set -eux; \
SAVED_UV_EXTRA_INDEX_URL="${UV_EXTRA_INDEX_URL:-}"; \
SAVED_UV_INDEX_STRATEGY="${UV_INDEX_STRATEGY:-}"; \
unset UV_EXTRA_INDEX_URL UV_INDEX_STRATEGY; \
if [ "$INSTALL_MODE" = "editable" ]; then \
if [ ! -d "$LLAMA_STACK_DIR" ]; then \
echo "INSTALL_MODE=editable requires LLAMA_STACK_DIR to point to a directory inside the build context" >&2; \
exit 1; \
fi; \
if [ -n "$SAVED_UV_EXTRA_INDEX_URL" ] && [ -n "$SAVED_UV_INDEX_STRATEGY" ]; then \
UV_EXTRA_INDEX_URL="$SAVED_UV_EXTRA_INDEX_URL" UV_INDEX_STRATEGY="$SAVED_UV_INDEX_STRATEGY" \
uv pip install --no-cache -e "$LLAMA_STACK_DIR"; \
else \
uv pip install --no-cache -e "$LLAMA_STACK_DIR"; \
fi; \
elif [ "$INSTALL_MODE" = "test-pypi" ]; then \
uv pip install --no-cache fastapi libcst; \
if [ -n "$TEST_PYPI_VERSION" ]; then \
uv pip install --no-cache --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match "llama-stack==$TEST_PYPI_VERSION"; \
else \
uv pip install --no-cache --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match llama-stack; \
fi; \
else \
if [ -n "$PYPI_VERSION" ]; then \
uv pip install --no-cache "llama-stack==$PYPI_VERSION"; \
else \
uv pip install --no-cache llama-stack; \
fi; \
fi;
# Install the dependencies for the distribution
# Explicitly unset UV index env vars to ensure we only use PyPI for distribution deps
RUN set -eux; \
unset UV_EXTRA_INDEX_URL UV_INDEX_STRATEGY; \
if [ -z "$DISTRO_NAME" ]; then \
echo "DISTRO_NAME must be provided" >&2; \
exit 1; \
fi; \
deps="$(llama stack list-deps "$DISTRO_NAME")"; \
if [ -n "$deps" ]; then \
printf '%s\n' "$deps" | xargs -L1 uv pip install --no-cache; \
fi
# Cleanup
RUN set -eux; \
pip uninstall -y uv; \
should_remove=1; \
if [ -n "$KEEP_WORKSPACE" ]; then should_remove=0; fi; \
if [ "$INSTALL_MODE" = "editable" ]; then should_remove=0; fi; \
case "$RUN_CONFIG_PATH" in \
/workspace*) should_remove=0 ;; \
esac; \
if [ "$should_remove" -eq 1 ] && [ -d /workspace ]; then rm -rf /workspace; fi
RUN cat <<'EOF' >/usr/local/bin/llama-stack-entrypoint.sh
#!/bin/sh
set -e
if [ -n "$RUN_CONFIG_PATH" ] && [ -f "$RUN_CONFIG_PATH" ]; then
exec llama stack run "$RUN_CONFIG_PATH" "$@"
fi
if [ -n "$DISTRO_NAME" ]; then
exec llama stack run "$DISTRO_NAME" "$@"
fi
exec llama stack run "$@"
EOF
RUN chmod +x /usr/local/bin/llama-stack-entrypoint.sh
RUN mkdir -p /.llama /.cache && chmod -R g+rw /app /.llama /.cache
ENTRYPOINT ["/usr/local/bin/llama-stack-entrypoint.sh"]

View file

@ -13,6 +13,42 @@ npm run serve
```
You can open up the docs in your browser at http://localhost:3000
## File Import System
This documentation uses `remark-code-import` to import files directly from the repository, eliminating copy-paste maintenance. Files are automatically embedded at build time.
### Importing Code Files
To import Python code (or any code files) with syntax highlighting, use this syntax in `.mdx` files:
```markdown
```python file=./demo_script.py title="demo_script.py"
```
```
This automatically imports the file content and displays it as a formatted code block with Python syntax highlighting.
**Note:** Paths are relative to the current `.mdx` file location, not the repository root.
### Importing Markdown Files as Content
For importing and rendering markdown files (like CONTRIBUTING.md), use the raw-loader approach:
```jsx
import Contributing from '!!raw-loader!../../../CONTRIBUTING.md';
import ReactMarkdown from 'react-markdown';
<ReactMarkdown>{Contributing}</ReactMarkdown>
```
**Requirements:**
- Install dependencies: `npm install --save-dev raw-loader react-markdown`
**Path Resolution:**
- For `remark-code-import`: Paths are relative to the current `.mdx` file location
- For `raw-loader`: Paths are relative to the current `.mdx` file location
- Use `../` to navigate up directories as needed
## Content
Try out Llama Stack's capabilities through our detailed Jupyter notebooks:

View file

@ -51,8 +51,8 @@ device: cpu
You can access the HuggingFace trainer via the `starter` distribution:
```bash
llama stack build --distro starter --image-type venv
llama stack run ~/.llama/distributions/starter/starter-run.yaml
llama stack list-deps starter | xargs -L1 uv pip install
llama stack run starter
```
### Usage Example

View file

@ -0,0 +1,62 @@
---
title: Deprecated APIs
description: Legacy APIs that are being phased out
sidebar_label: Deprecated
sidebar_position: 1
---
# Deprecated APIs
This section contains APIs that are being phased out in favor of newer, more standardized implementations. These APIs are maintained for backward compatibility but are not recommended for new projects.
:::warning Deprecation Notice
These APIs are deprecated and will be removed in future versions. Please migrate to the recommended alternatives listed below.
:::
## Migration Guide
When using deprecated APIs, please refer to the migration guides provided for each API to understand how to transition to the supported alternatives.
## Deprecated API List
### Legacy Inference APIs
Some older inference endpoints that have been superseded by the standardized Inference API.
**Migration Path:** Use the [Inference API](../api/) instead.
### Legacy Vector Operations
Older vector database operations that have been replaced by the Vector IO API.
**Migration Path:** Use the [Vector IO API](../api/) instead.
### Legacy File Operations
Older file management endpoints that have been replaced by the Files API.
**Migration Path:** Use the [Files API](../api/) instead.
## Support Timeline
Deprecated APIs will be supported according to the following timeline:
- **Current Version**: Full support with deprecation warnings
- **Next Major Version**: Limited support with migration notices
- **Following Major Version**: Removal of deprecated APIs
## Getting Help
If you need assistance migrating from deprecated APIs:
1. Check the specific migration guides for each API
2. Review the [API Reference](../api/) for current alternatives
3. Consult the [Community Forums](https://github.com/llamastack/llama-stack/discussions) for migration support
4. Open an issue on GitHub for specific migration questions
## Contributing
If you find issues with deprecated APIs or have suggestions for improving the migration process, please contribute by:
1. Opening an issue describing the problem
2. Submitting a pull request with improvements
3. Updating migration documentation
For more information on contributing, see our [Contributing Guide](../contributing/).

View file

@ -0,0 +1,128 @@
---
title: Experimental APIs
description: APIs in development with limited support
sidebar_label: Experimental
sidebar_position: 1
---
# Experimental APIs
This section contains APIs that are currently in development and may have limited support or stability. These APIs are available for testing and feedback but should not be used in production environments.
:::warning Experimental Notice
These APIs are experimental and may change without notice. Use with caution and provide feedback to help improve them.
:::
## Current Experimental APIs
### Batch Inference API
Run inference on a dataset of inputs in batch mode for improved efficiency.
**Status:** In Development
**Provider Support:** Limited
**Use Case:** Large-scale inference operations
**Features:**
- Batch processing of multiple inputs
- Optimized resource utilization
- Progress tracking and monitoring
### Batch Agents API
Run agentic workflows on a dataset of inputs in batch mode.
**Status:** In Development
**Provider Support:** Limited
**Use Case:** Large-scale agent operations
**Features:**
- Batch agent execution
- Parallel processing capabilities
- Result aggregation and analysis
### Synthetic Data Generation API
Generate synthetic data for model development and testing.
**Status:** Early Development
**Provider Support:** Very Limited
**Use Case:** Training data augmentation
**Features:**
- Automated data generation
- Quality control mechanisms
- Customizable generation parameters
### Batches API (OpenAI-compatible)
OpenAI-compatible batch management for inference operations.
**Status:** In Development
**Provider Support:** Limited
**Use Case:** OpenAI batch processing compatibility
**Features:**
- OpenAI batch API compatibility
- Job scheduling and management
- Status tracking and monitoring
## Getting Started with Experimental APIs
### Prerequisites
- Llama Stack server running with experimental features enabled
- Appropriate provider configurations
- Understanding of API limitations
### Configuration
Experimental APIs may require special configuration flags or provider settings. Check the specific API documentation for setup requirements.
### Usage Guidelines
1. **Testing Only**: Use experimental APIs for testing and development only
2. **Monitor Changes**: Watch for updates and breaking changes
3. **Provide Feedback**: Report issues and suggest improvements
4. **Backup Data**: Always backup important data when using experimental features
## Feedback and Contribution
We encourage feedback on experimental APIs to help improve them:
### Reporting Issues
- Use GitHub issues with the "experimental" label
- Include detailed error messages and reproduction steps
- Specify the API version and provider being used
### Feature Requests
- Submit feature requests through GitHub discussions
- Provide use cases and expected behavior
- Consider contributing implementations
### Testing
- Test experimental APIs in your environment
- Report performance issues and optimization opportunities
- Share success stories and use cases
## Migration to Stable APIs
As experimental APIs mature, they will be moved to the stable API section. When this happens:
1. **Announcement**: We'll announce the promotion in release notes
2. **Migration Guide**: Detailed migration instructions will be provided
3. **Deprecation Timeline**: Experimental versions will be deprecated with notice
4. **Support**: Full support will be available for stable versions
## Provider Support
Experimental APIs may have limited provider support. Check the specific API documentation for:
- Supported providers
- Configuration requirements
- Known limitations
- Performance characteristics
## Roadmap
Experimental APIs are part of our ongoing development roadmap:
- **Q1 2024**: Batch Inference API stabilization
- **Q2 2024**: Batch Agents API improvements
- **Q3 2024**: Synthetic Data Generation API expansion
- **Q4 2024**: Batches API full OpenAI compatibility
For the latest updates, follow our [GitHub releases](https://github.com/llamastack/llama-stack/releases) and [roadmap discussions](https://github.com/llamastack/llama-stack/discussions).

View file

@ -0,0 +1,287 @@
---
title: OpenAI API Compatibility
description: OpenAI-compatible APIs and features in Llama Stack
sidebar_label: OpenAI Compatibility
sidebar_position: 1
---
# OpenAI API Compatibility
Llama Stack provides comprehensive OpenAI API compatibility, allowing you to use existing OpenAI API clients and tools with Llama Stack providers. This compatibility layer ensures seamless migration and interoperability.
## Overview
OpenAI API compatibility in Llama Stack includes:
- **OpenAI-compatible endpoints** for all major APIs
- **Request/response format compatibility** with OpenAI standards
- **Authentication and authorization** using OpenAI-style API keys
- **Error handling** with OpenAI-compatible error codes and messages
- **Rate limiting** and usage tracking compatible with OpenAI patterns
## Supported OpenAI APIs
### Chat Completions API
OpenAI-compatible chat completions for conversational AI applications.
**Endpoint:** `/v1/chat/completions`
**Compatibility:** Full OpenAI API compatibility
**Providers:** All inference providers
**Features:**
- Message-based conversations
- System prompts and user messages
- Function calling support
- Streaming responses
- Temperature and other parameter controls
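As a minimal sketch of these features, the snippet below streams a chat completion through the standard `openai` Python client pointed at a Llama Stack server; the base URL, API key, and model name are placeholders you should replace with values from your own deployment.
```python
from openai import OpenAI

# Placeholder endpoint, key, and model; adjust to your Llama Stack deployment.
client = OpenAI(api_key="your-llama-stack-key", base_url="http://localhost:8321/v1")

stream = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Summarize what Llama Stack does in one sentence."},
    ],
    stream=True,
    temperature=0.7,
)

# Print tokens as they arrive.
for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
```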
### Completions API
OpenAI-compatible text completions for general text generation.
**Endpoint:** `/v1/completions`
**Compatibility:** Full OpenAI API compatibility
**Providers:** All inference providers
**Features:**
- Text completion generation
- Prompt engineering support
- Customizable parameters
- Batch processing capabilities
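The sketch below shows a plain text completion with the same client setup; the model name is a placeholder for whatever LLM your server actually exposes.
```python
from openai import OpenAI

# Placeholder endpoint, key, and model; adjust to your Llama Stack deployment.
client = OpenAI(api_key="your-llama-stack-key", base_url="http://localhost:8321/v1")

response = client.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    prompt="Write a one-line tagline for an open-source AI stack:",
    max_tokens=32,
)
print(response.choices[0].text)
```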
### Embeddings API
OpenAI-compatible embeddings for vector operations.
**Endpoint:** `/v1/embeddings`
**Compatibility:** Full OpenAI API compatibility
**Providers:** All embedding providers
**Features:**
- Text embedding generation
- Multiple embedding models
- Batch embedding processing
- Vector similarity operations
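For illustration, here is a minimal embedding request; `nomic-embed-text-v1.5` is just an example of an embedding model id a server might register, so substitute one that your deployment exposes.
```python
from openai import OpenAI

# Placeholder endpoint and key; adjust to your Llama Stack deployment.
client = OpenAI(api_key="your-llama-stack-key", base_url="http://localhost:8321/v1")

result = client.embeddings.create(
    model="nomic-embed-text-v1.5",  # example id; use a model registered on your server
    input=["Llama Stack standardizes AI building blocks.", "Vector search powers RAG."],
)

# Each item carries the embedding vector for the corresponding input string.
for item in result.data:
    print(item.index, len(item.embedding))
```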
### Files API
OpenAI-compatible file management for document processing.
**Endpoint:** `/v1/files`
**Compatibility:** Full OpenAI API compatibility
**Providers:** Local Filesystem, S3
**Features:**
- File upload and management
- Document processing
- File metadata tracking
- Secure file access
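A minimal upload-and-list sketch, assuming a local `handbook.pdf` (a hypothetical document) and the usual `assistants` purpose for files that feed RAG workflows:
```python
from openai import OpenAI

# Placeholder endpoint and key; adjust to your Llama Stack deployment.
client = OpenAI(api_key="your-llama-stack-key", base_url="http://localhost:8321/v1")

# Upload a local document (hypothetical file name).
with open("handbook.pdf", "rb") as f:
    uploaded = client.files.create(file=f, purpose="assistants")
print(uploaded.id, uploaded.filename)

# List files through the same OpenAI-compatible endpoint.
for item in client.files.list():
    print(item.id, item.purpose)
```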
### Vector Store Files API
OpenAI-compatible vector store file operations for RAG applications.
**Endpoint:** `/v1/vector_stores/{vector_store_id}/files`
**Compatibility:** Full OpenAI API compatibility
**Providers:** FAISS, SQLite-vec, Milvus, ChromaDB, Qdrant, Weaviate, Postgres (PGVector)
**Features:**
- Automatic document processing
- Vector store integration
- File chunking and indexing
- Search and retrieval operations
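As a rough sketch (assuming a recent `openai` Python client where vector stores are a top-level resource, plus a hypothetical `handbook.pdf`), the flow is: create a vector store, attach a file, then search it.
```python
from openai import OpenAI

# Placeholder endpoint and key; adjust to your Llama Stack deployment.
client = OpenAI(api_key="your-llama-stack-key", base_url="http://localhost:8321/v1")

store = client.vector_stores.create(name="docs")

with open("handbook.pdf", "rb") as f:  # hypothetical document
    uploaded = client.files.create(file=f, purpose="assistants")

# Attach the file; the provider chunks and indexes it automatically.
client.vector_stores.files.create(vector_store_id=store.id, file_id=uploaded.id)

results = client.vector_stores.search(vector_store_id=store.id, query="vacation policy")
for hit in results.data:
    print(hit.filename, hit.score)
```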
### Batches API
OpenAI-compatible batch processing for large-scale operations.
**Endpoint:** `/v1/batches`
**Compatibility:** OpenAI API compatibility (experimental)
**Providers:** Limited support
**Features:**
- Batch job creation and management
- Progress tracking
- Result retrieval
- Error handling
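Since provider support is still limited, treat the following as a sketch rather than a guaranteed workflow: upload a JSONL file of chat completion requests (the file name here is hypothetical), submit it as a batch, and poll its status.
```python
from openai import OpenAI

# Placeholder endpoint and key; adjust to your Llama Stack deployment.
client = OpenAI(api_key="your-llama-stack-key", base_url="http://localhost:8321/v1")

# requests.jsonl is a hypothetical file of OpenAI-style batch request lines.
with open("requests.jsonl", "rb") as f:
    batch_input = client.files.create(file=f, purpose="batch")

batch = client.batches.create(
    input_file_id=batch_input.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
)
print(batch.id, batch.status)

# Poll until the batch finishes, then download the output file.
batch = client.batches.retrieve(batch.id)
```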
## Migration from OpenAI
### Step 1: Update API Endpoint
Change your API endpoint from OpenAI to your Llama Stack server:
```python
# Before (OpenAI)
import openai
client = openai.OpenAI(api_key="your-openai-key")
# After (Llama Stack)
import openai
client = openai.OpenAI(
api_key="your-llama-stack-key",
base_url="http://localhost:8000/v1" # Your Llama Stack server
)
```
### Step 2: Configure Providers
Set up your preferred providers in the Llama Stack configuration:
```yaml
# stack-config.yaml
inference:
providers:
- name: "meta-reference"
type: "inline"
model: "llama-3.1-8b"
```
### Step 3: Test Compatibility
Verify that your existing code works with Llama Stack:
```python
# Test chat completions
response = client.chat.completions.create(
model="llama-3.1-8b",
messages=[
{"role": "user", "content": "Hello, world!"}
]
)
print(response.choices[0].message.content)
```
## Provider-Specific Features
### Meta Reference Provider
- Full OpenAI API compatibility
- Local model execution
- Custom model support
### Remote Providers
- OpenAI API compatibility
- Cloud-based execution
- Scalable infrastructure
### Vector Store Providers
- OpenAI vector store API compatibility
- Automatic document processing
- Advanced search capabilities
## Authentication
Llama Stack supports OpenAI-style authentication:
### API Key Authentication
```python
client = openai.OpenAI(
api_key="your-api-key",
base_url="http://localhost:8000/v1"
)
```
### Environment Variables
```bash
export OPENAI_API_KEY="your-api-key"
export OPENAI_BASE_URL="http://localhost:8000/v1"
```
## Error Handling
Llama Stack provides OpenAI-compatible error responses:
```python
try:
response = client.chat.completions.create(...)
except openai.APIError as e:
print(f"API Error: {e}")
except openai.RateLimitError as e:
print(f"Rate Limit Error: {e}")
except openai.APIConnectionError as e:
print(f"Connection Error: {e}")
```
## Rate Limiting
OpenAI-compatible rate limiting is supported:
- **Requests per minute** limits
- **Tokens per minute** limits
- **Concurrent request** limits
- **Usage tracking** and monitoring
## Monitoring and Observability
Track your API usage with OpenAI-compatible monitoring:
- **Request/response logging**
- **Usage metrics** and analytics
- **Performance monitoring**
- **Error tracking** and alerting
## Best Practices
### 1. Provider Selection
Choose providers based on your requirements:
- **Local development**: Meta Reference, Ollama
- **Production**: Cloud providers (Fireworks, Together, NVIDIA)
- **Specialized use cases**: Custom providers
### 2. Model Configuration
Configure models for optimal performance:
- **Model selection** based on task requirements
- **Parameter tuning** for specific use cases
- **Resource allocation** for performance
### 3. Error Handling
Implement robust error handling:
- **Retry logic** for transient failures
- **Fallback providers** for high availability
- **Monitoring** and alerting for issues
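As an illustration of the retry item above, here is a minimal backoff sketch built on the same exception types shown earlier; the model name and attempt count are arbitrary placeholders.
```python
import time

import openai
from openai import OpenAI

# Placeholder endpoint and key; adjust to your Llama Stack deployment.
client = OpenAI(api_key="your-llama-stack-key", base_url="http://localhost:8321/v1")

def chat_with_retries(messages, model="meta-llama/Llama-3.1-8B-Instruct", max_attempts=3):
    """Retry transient failures (rate limits, connection errors) with exponential backoff."""
    for attempt in range(max_attempts):
        try:
            return client.chat.completions.create(model=model, messages=messages)
        except (openai.RateLimitError, openai.APIConnectionError):
            if attempt == max_attempts - 1:
                raise
            time.sleep(2 ** attempt)  # 1s, 2s, 4s between attempts
```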
### 4. Security
Follow security best practices:
- **API key management** and rotation
- **Access control** and authorization
- **Data privacy** and compliance
## Implementation Examples
For detailed code examples and implementation guides, see our [OpenAI Implementation Guide](../providers/openai.mdx).
## Known Limitations
### Responses API Limitations
The Responses API is still in active development. For detailed information about current limitations and implementation status, see our [OpenAI Responses API Limitations](../providers/openai_responses_limitations.mdx).
## Troubleshooting
### Common Issues
**Connection Errors**
- Verify server is running
- Check network connectivity
- Validate API endpoint URL
**Authentication Errors**
- Verify API key is correct
- Check key permissions
- Ensure proper authentication headers
**Model Errors**
- Verify model is available
- Check provider configuration
- Validate model parameters
### Getting Help
For OpenAI compatibility issues:
1. **Check Documentation**: Review provider-specific documentation
2. **Community Support**: Ask questions in GitHub discussions
3. **Issue Reporting**: Open GitHub issues for bugs
4. **Professional Support**: Contact support for enterprise issues
## Roadmap
Upcoming OpenAI compatibility features:
- **Enhanced batch processing** support
- **Advanced function calling** capabilities
- **Improved error handling** and diagnostics
- **Performance optimizations** for large-scale deployments
For the latest updates, follow our [GitHub releases](https://github.com/llamastack/llama-stack/releases) and [roadmap discussions](https://github.com/llamastack/llama-stack/discussions).

144
docs/docs/api/index.mdx Normal file
View file

@ -0,0 +1,144 @@
---
title: API Reference
description: Complete reference for Llama Stack APIs
sidebar_label: Overview
sidebar_position: 1
---
# API Reference
Llama Stack provides a comprehensive set of APIs for building generative AI applications. All APIs follow OpenAI-compatible standards and can be used interchangeably across different providers.
## Core APIs
### Inference API
Run inference with Large Language Models (LLMs) and embedding models.
**Supported Providers:**
- Meta Reference (Single Node)
- Ollama (Single Node)
- Fireworks (Hosted)
- Together (Hosted)
- NVIDIA NIM (Hosted and Single Node)
- vLLM (Hosted and Single Node)
- TGI (Hosted and Single Node)
- AWS Bedrock (Hosted)
- Cerebras (Hosted)
- Groq (Hosted)
- SambaNova (Hosted)
- PyTorch ExecuTorch (On-device iOS, Android)
- OpenAI (Hosted)
- Anthropic (Hosted)
- Gemini (Hosted)
- WatsonX (Hosted)
### Agents API
Run multi-step agentic workflows with LLMs, including tool usage, memory (RAG), and complex reasoning.
**Supported Providers:**
- Meta Reference (Single Node)
- Fireworks (Hosted)
- Together (Hosted)
- PyTorch ExecuTorch (On-device iOS)
### Vector IO API
Perform operations on vector stores, including adding documents, searching, and deleting documents.
**Supported Providers:**
- FAISS (Single Node)
- SQLite-Vec (Single Node)
- Chroma (Hosted and Single Node)
- Milvus (Hosted and Single Node)
- Postgres (PGVector) (Hosted and Single Node)
- Weaviate (Hosted)
- Qdrant (Hosted and Single Node)
### Files API (OpenAI-compatible)
Manage file uploads, storage, and retrieval with OpenAI-compatible endpoints.
**Supported Providers:**
- Local Filesystem (Single Node)
- S3 (Hosted)
### Vector Store Files API (OpenAI-compatible)
Integrate file operations with vector stores for automatic document processing and search.
**Supported Providers:**
- FAISS (Single Node)
- SQLite-vec (Single Node)
- Milvus (Single Node)
- ChromaDB (Hosted and Single Node)
- Qdrant (Hosted and Single Node)
- Weaviate (Hosted)
- Postgres (PGVector) (Hosted and Single Node)
### Safety API
Apply safety policies to outputs at a systems level, not just model level.
**Supported Providers:**
- Llama Guard (Depends on Inference Provider)
- Prompt Guard (Single Node)
- Code Scanner (Single Node)
- AWS Bedrock (Hosted)
### Post Training API
Fine-tune models for specific use cases and domains.
**Supported Providers:**
- Meta Reference (Single Node)
- HuggingFace (Single Node)
- TorchTune (Single Node)
- NVIDIA NEMO (Hosted)
### Eval API
Generate outputs and perform scoring to evaluate system performance.
**Supported Providers:**
- Meta Reference (Single Node)
- NVIDIA NEMO (Hosted)
### Telemetry API
Collect telemetry data from the system for monitoring and observability.
**Supported Providers:**
- Meta Reference (Single Node)
### Tool Runtime API
Interact with various tools and protocols to extend LLM capabilities.
**Supported Providers:**
- Brave Search (Hosted)
- RAG Runtime (Single Node)
## API Compatibility
All Llama Stack APIs are designed to be OpenAI-compatible, allowing you to:
- Use existing OpenAI API clients and tools
- Migrate from OpenAI to other providers seamlessly
- Maintain consistent API contracts across different environments
## Getting Started
To get started with Llama Stack APIs:
1. **Choose a Distribution**: Select a pre-configured distribution that matches your environment
2. **Configure Providers**: Set up the providers you want to use for each API
3. **Start the Server**: Launch the Llama Stack server with your configuration
4. **Use the APIs**: Make requests to the API endpoints using your preferred client
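As a minimal sketch of step 4, any OpenAI-compatible client can talk to a running server; the endpoint, key, and model below are placeholders for your own deployment.
```python
from openai import OpenAI

# Placeholder endpoint, key, and model; adjust to your Llama Stack deployment.
client = OpenAI(api_key="your-llama-stack-key", base_url="http://localhost:8321/v1")

# List the models the server exposes, then run a simple chat completion.
print([m.id for m in client.models.list()])

reply = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": "Hello, Llama Stack!"}],
)
print(reply.choices[0].message.content)
```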
For detailed setup instructions, see our [Getting Started Guide](../getting_started/quickstart).
## Provider Details
For complete provider compatibility and setup instructions, see our [Providers Documentation](../providers/).
## API Stability
Llama Stack APIs are organized by stability level:
- **[Stable APIs](./index.mdx)** - Production-ready APIs with full support
- **[Experimental APIs](../api-experimental/)** - APIs in development with limited support
- **[Deprecated APIs](../api-deprecated/)** - Legacy APIs being phased out
## OpenAI Integration
For specific OpenAI API compatibility features, see our [OpenAI Compatibility Guide](../api-openai/).

View file

@ -35,9 +35,6 @@ Here are the key topics that will help you build effective AI applications:
- **[Telemetry](./telemetry.mdx)** - Monitor and analyze your agents' performance and behavior
- **[Safety](./safety.mdx)** - Implement guardrails and safety measures to ensure responsible AI behavior
### 🎮 **Interactive Development**
- **[Playground](./playground.mdx)** - Interactive environment for testing and developing applications
## Application Patterns
### 🤖 **Conversational Agents**

View file

@ -1,299 +1,87 @@
---
title: Llama Stack Playground
description: Interactive interface to explore and experiment with Llama Stack capabilities
title: Admin UI & Chat Playground
description: Web-based admin interface and chat playground for Llama Stack
sidebar_label: Playground
sidebar_position: 10
---
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Admin UI & Chat Playground
# Llama Stack Playground
The Llama Stack UI provides a comprehensive web-based admin interface for managing your Llama Stack server, with an integrated chat playground for interactive testing. This admin interface is the primary way to monitor, manage, and debug your Llama Stack applications.
:::note[Experimental Feature]
The Llama Stack Playground is currently experimental and subject to change. We welcome feedback and contributions to help improve it.
:::
## Quick Start
The Llama Stack Playground is a simple interface that aims to:
- **Showcase capabilities and concepts** of Llama Stack in an interactive environment
- **Demo end-to-end application code** to help users get started building their own applications
- **Provide a UI** to help users inspect and understand Llama Stack API providers and resources
## Key Features
### Interactive Playground Pages
The playground provides interactive pages for users to explore Llama Stack API capabilities:
#### Chatbot Interface
<video
controls
autoPlay
playsInline
muted
loop
style={{width: '100%'}}
>
<source src="https://github.com/user-attachments/assets/8d2ef802-5812-4a28-96e1-316038c84cbf" type="video/mp4" />
Your browser does not support the video tag.
</video>
<Tabs>
<TabItem value="chat" label="Chat">
**Simple Chat Interface**
- Chat directly with Llama models through an intuitive interface
- Uses the `/chat/completions` streaming API under the hood
- Real-time message streaming for responsive interactions
- Perfect for testing model capabilities and prompt engineering
</TabItem>
<TabItem value="rag" label="RAG Chat">
**Document-Aware Conversations**
- Upload documents to create memory banks
- Chat with a RAG-enabled agent that can query your documents
- Uses Llama Stack's `/agents` API to create and manage RAG sessions
- Ideal for exploring knowledge-enhanced AI applications
</TabItem>
</Tabs>
#### Evaluation Interface
<video
controls
autoPlay
playsInline
muted
loop
style={{width: '100%'}}
>
<source src="https://github.com/user-attachments/assets/6cc1659f-eba4-49ca-a0a5-7c243557b4f5" type="video/mp4" />
Your browser does not support the video tag.
</video>
<Tabs>
<TabItem value="scoring" label="Scoring Evaluations">
**Custom Dataset Evaluation**
- Upload your own evaluation datasets
- Run evaluations using available scoring functions
- Uses Llama Stack's `/scoring` API for flexible evaluation workflows
- Great for testing application performance on custom metrics
</TabItem>
<TabItem value="benchmarks" label="Benchmark Evaluations">
<video
controls
autoPlay
playsInline
muted
loop
style={{width: '100%', marginBottom: '1rem'}}
>
<source src="https://github.com/user-attachments/assets/345845c7-2a2b-4095-960a-9ae40f6a93cf" type="video/mp4" />
Your browser does not support the video tag.
</video>
**Pre-registered Evaluation Tasks**
- Evaluate models or agents on pre-defined tasks
- Uses Llama Stack's `/eval` API for comprehensive evaluation
- Combines datasets and scoring functions for standardized testing
**Setup Requirements:**
Register evaluation datasets and benchmarks first:
Launch the admin UI with:
```bash
# Register evaluation dataset
llama-stack-client datasets register \
--dataset-id "mmlu" \
--provider-id "huggingface" \
--url "https://huggingface.co/datasets/llamastack/evals" \
--metadata '{"path": "llamastack/evals", "name": "evals__mmlu__details", "split": "train"}' \
--schema '{"input_query": {"type": "string"}, "expected_answer": {"type": "string"}, "chat_completion_input": {"type": "string"}}'
# Register benchmark task
llama-stack-client benchmarks register \
--eval-task-id meta-reference-mmlu \
--provider-id meta-reference \
--dataset-id mmlu \
--scoring-functions basic::regex_parser_multiple_choice_answer
npx llama-stack-ui
```
</TabItem>
</Tabs>
Then visit `http://localhost:8322` to access the interface.
#### Inspection Interface
## Admin Interface Features
<video
controls
autoPlay
playsInline
muted
loop
style={{width: '100%'}}
>
<source src="https://github.com/user-attachments/assets/01d52b2d-92af-4e3a-b623-a9b8ba22ba99" type="video/mp4" />
Your browser does not support the video tag.
</video>
The Llama Stack UI is organized into three main sections:
<Tabs>
<TabItem value="providers" label="API Providers">
### 🎯 Create
**Chat Playground** - Interactive testing environment
- Real-time chat interface for testing agents and models
- Multi-turn conversations with tool calling support
- Agent SDK integration (will be migrated to Responses API)
- Custom system prompts and model parameter adjustment
**Provider Management**
- Inspect available Llama Stack API providers
- View provider configurations and capabilities
- Uses the `/providers` API for real-time provider information
- Essential for understanding your deployment's capabilities
### 📊 Manage
**Logs & Resource Management** - Monitor and manage your stack
- **Responses Logs**: View and analyze agent responses and interactions
- **Chat Completions Logs**: Monitor chat completion requests and responses
- **Vector Stores**: Create, manage, and monitor vector databases for RAG workflows
- **Prompts**: Full CRUD operations for prompt templates and management
- **Files**: Forthcoming file management capabilities
</TabItem>
<TabItem value="resources" label="API Resources">
## Key Capabilities for Application Development
**Resource Exploration**
- Inspect Llama Stack API resources including:
- **Models**: Available language models
- **Datasets**: Registered evaluation datasets
- **Memory Banks**: Vector databases and knowledge stores
- **Benchmarks**: Evaluation tasks and scoring functions
- **Shields**: Safety and content moderation tools
- Uses `/<resources>/list` APIs for comprehensive resource visibility
- For detailed information about resources, see [Core Concepts](/docs/concepts)
### Real-time Monitoring
- **Response Tracking**: Monitor all agent responses and tool calls
- **Completion Analysis**: View chat completion performance and patterns
- **Vector Store Activity**: Track RAG operations and document processing
- **Prompt Usage**: Analyze prompt template performance
</TabItem>
</Tabs>
### Resource Management
- **Vector Store CRUD**: Create, update, and delete vector databases
- **Prompt Library**: Organize and version control your prompts
- **File Operations**: Manage documents and assets (forthcoming)
### Interactive Testing
- **Chat Playground**: Test conversational flows before production deployment
- **Agent Prototyping**: Validate agent behaviors and tool integrations
## Development Workflow Integration
The admin UI supports your development lifecycle:
1. **Development**: Use chat playground to prototype and test features
2. **Monitoring**: Track system performance through logs and metrics
3. **Management**: Organize prompts, vector stores, and other resources
4. **Debugging**: Analyze logs to identify and resolve issues
## Architecture Notes
- **Current**: Chat playground uses Agents SDK
- **Future**: Migration to Responses API for improved performance and consistency
- **Admin Focus**: Primary emphasis on monitoring, logging, and resource management
## Getting Started
### Quick Start Guide
1. **Launch the UI**: Run `npx llama-stack-ui`
2. **Explore Logs**: Start with Responses and Chat Completions logs to understand your system activity
3. **Test in Playground**: Use the chat interface to validate your agent configurations
4. **Manage Resources**: Create vector stores and organize prompts through the UI
<Tabs>
<TabItem value="setup" label="Setup">
For detailed setup and configuration, see the [Llama Stack UI documentation](/docs/distributions/llama_stack_ui).
**1. Start the Llama Stack API Server**
## Next Steps
```bash
# Build and run a distribution (example: together)
llama stack build --distro together --image-type venv
llama stack run together
```
**2. Start the Streamlit UI**
```bash
# Launch the playground interface
uv run --with ".[ui]" streamlit run llama_stack.core/ui/app.py
```
</TabItem>
<TabItem value="usage" label="Usage Tips">
**Making the Most of the Playground:**
- **Start with Chat**: Test basic model interactions and prompt engineering
- **Explore RAG**: Upload sample documents to see knowledge-enhanced responses
- **Try Evaluations**: Use the scoring interface to understand evaluation metrics
- **Inspect Resources**: Check what providers and resources are available
- **Experiment with Settings**: Adjust parameters to see how they affect results
</TabItem>
</Tabs>
### Available Distributions
The playground works with any Llama Stack distribution. Popular options include:
<Tabs>
<TabItem value="together" label="Together AI">
```bash
llama stack build --distro together --image-type venv
llama stack run together
```
**Features:**
- Cloud-hosted models
- Fast inference
- Multiple model options
</TabItem>
<TabItem value="ollama" label="Ollama (Local)">
```bash
llama stack build --distro ollama --image-type venv
llama stack run ollama
```
**Features:**
- Local model execution
- Privacy-focused
- No internet required
</TabItem>
<TabItem value="meta-reference" label="Meta Reference">
```bash
llama stack build --distro meta-reference --image-type venv
llama stack run meta-reference
```
**Features:**
- Reference implementation
- All API features available
- Best for development
</TabItem>
</Tabs>
## Use Cases & Examples
### Educational Use Cases
- **Learning Llama Stack**: Hands-on exploration of API capabilities
- **Prompt Engineering**: Interactive testing of different prompting strategies
- **RAG Experimentation**: Understanding how document retrieval affects responses
- **Evaluation Understanding**: See how different metrics evaluate model performance
### Development Use Cases
- **Prototype Testing**: Quick validation of application concepts
- **API Exploration**: Understanding available endpoints and parameters
- **Integration Planning**: Seeing how different components work together
- **Demo Creation**: Showcasing Llama Stack capabilities to stakeholders
### Research Use Cases
- **Model Comparison**: Side-by-side testing of different models
- **Evaluation Design**: Understanding how scoring functions work
- **Safety Testing**: Exploring shield effectiveness with different inputs
- **Performance Analysis**: Measuring model behavior across different scenarios
## Best Practices
### 🚀 **Getting Started**
- Begin with simple chat interactions to understand basic functionality
- Gradually explore more advanced features like RAG and evaluations
- Use the inspection tools to understand your deployment's capabilities
### 🔧 **Development Workflow**
- Use the playground to prototype before writing application code
- Test different parameter settings interactively
- Validate evaluation approaches before implementing them programmatically
### 📊 **Evaluation & Testing**
- Start with simple scoring functions before trying complex evaluations
- Use the playground to understand evaluation results before automation
- Test safety features with various input types
### 🎯 **Production Preparation**
- Use playground insights to inform your production API usage
- Test edge cases and error conditions interactively
- Validate resource configurations before deployment
## Related Resources
- **[Getting Started Guide](../getting_started/quickstart)** - Complete setup and introduction
- **[Core Concepts](/docs/concepts)** - Understanding Llama Stack fundamentals
- **[Agents](./agent)** - Building intelligent agents
- **[RAG (Retrieval Augmented Generation)](./rag)** - Knowledge-enhanced applications
- **[Evaluations](./evals)** - Comprehensive evaluation framework
- **[API Reference](/docs/api/llama-stack-specification)** - Complete API documentation
## Next Steps
- Set up your [first agent](/docs/building_applications/agent)
- Implement [RAG functionality](/docs/building_applications/rag)
- Add [evaluation metrics](/docs/building_applications/evals)
- Configure [safety measures](/docs/building_applications/safety)

View file

@ -10,358 +10,114 @@ import TabItem from '@theme/TabItem';
# Retrieval Augmented Generation (RAG)
RAG enables your applications to reference and recall information from previous interactions or external documents.
RAG enables your applications to reference and recall information from external documents. Llama Stack makes Agentic RAG available through OpenAI's Responses API.
## Quick Start
### 1. Start the Server
In one terminal, start the Llama Stack server:
```bash
llama stack list-deps starter | xargs -L1 uv pip install
llama stack run starter
```
### 2. Connect with OpenAI Client
In another terminal, use the standard OpenAI client with the Responses API:
```python
import io, requests
from openai import OpenAI
url = "https://www.paulgraham.com/greatwork.html"
client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")
# Create vector store - auto-detects default embedding model
vs = client.vector_stores.create()
response = requests.get(url)
pseudo_file = io.BytesIO(str(response.content).encode('utf-8'))
file_id = client.files.create(file=(url, pseudo_file, "text/html"), purpose="assistants").id
client.vector_stores.files.create(vector_store_id=vs.id, file_id=file_id)
resp = client.responses.create(
model="gpt-4o",
input="How do you do great work? Use the existing knowledge_search tool.",
tools=[{"type": "file_search", "vector_store_ids": [vs.id]}],
include=["file_search_call.results"],
)
print(resp.output[-1].content[-1].text)
```
Which should give output like:
```
Doing great work is about more than just hard work and ambition; it involves combining several elements:
1. **Pursue What Excites You**: Engage in projects that are both ambitious and exciting to you. It's important to work on something you have a natural aptitude for and a deep interest in.
2. **Explore and Discover**: Great work often feels like a blend of discovery and creation. Focus on seeing possibilities and let ideas take their natural shape, rather than just executing a plan.
3. **Be Bold Yet Flexible**: Take bold steps in your work without over-planning. An adaptable approach that evolves with new ideas can often lead to breakthroughs.
4. **Work on Your Own Projects**: Develop a habit of working on projects of your own choosing, as these often lead to great achievements. These should be projects you find exciting and that challenge you intellectually.
5. **Be Earnest and Authentic**: Approach your work with earnestness and authenticity. Trying to impress others with affectation can be counterproductive, as genuine effort and intellectual honesty lead to better work outcomes.
6. **Build a Supportive Environment**: Work alongside great colleagues who inspire you and enhance your work. Surrounding yourself with motivating individuals creates a fertile environment for great work.
7. **Maintain High Morale**: High morale significantly impacts your ability to do great work. Stay optimistic and protect your mental well-being to maintain progress and momentum.
8. **Balance**: While hard work is essential, overworking can lead to diminishing returns. Balance periods of intensive work with rest to sustain productivity over time.
This approach shows that great work is less about following a strict formula and more about aligning your interests, ambition, and environment to foster creativity and innovation.
```
## Architecture Overview
Llama Stack organizes the APIs that enable RAG into three layers:
Llama Stack provides OpenAI-compatible RAG capabilities through:
1. **Lower-Level APIs**: Deal with raw storage and retrieval. These include Vector IO, KeyValue IO (coming soon) and Relational IO (also coming soon)
2. **RAG Tool**: A first-class tool as part of the [Tools API](./tools) that allows you to ingest documents (from URLs, files, etc) with various chunking strategies and query them smartly
3. **Agents API**: The top-level [Agents API](./agent) that allows you to create agents that can use the tools to answer questions, perform tasks, and more
- **Vector Stores API**: OpenAI-compatible vector storage with automatic embedding model detection
- **Files API**: Document upload and processing using OpenAI's file format
- **Responses API**: Enhanced chat completions with agentic tool calling via file search
![RAG System Architecture](/img/rag.png)
## Configuring Default Embedding Models
The RAG system uses lower-level storage for different types of data:
- **Vector IO**: For semantic search and retrieval
- **Key-Value and Relational IO**: For structured data storage
To enable automatic vector store creation without specifying embedding models, configure a default embedding model in your run.yaml like so:
:::info[Future Storage Types]
We may add more storage types like Graph IO in the future.
:::
## Setting up Vector Databases
For this guide, we will use [Ollama](https://ollama.com/) as the inference provider. Ollama is an LLM runtime that allows you to run Llama models locally.
Here's how to set up a vector database for RAG:
```python
# Create HTTP client
import os
from llama_stack_client import LlamaStackClient
client = LlamaStackClient(base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}")
# Register a vector database
vector_db_id = "my_documents"
response = client.vector_dbs.register(
vector_db_id=vector_db_id,
embedding_model="all-MiniLM-L6-v2",
embedding_dimension=384,
provider_id="faiss",
)
```

```yaml
vector_stores:
default_provider_id: faiss
default_embedding_model:
provider_id: sentence-transformers
model_id: nomic-ai/nomic-embed-text-v1.5
```
## Document Ingestion
With this configuration:
- `client.vector_stores.create()` works without requiring embedding model or provider parameters
- The system automatically uses the default vector store provider (`faiss`) when multiple providers are available
- The system automatically uses the default embedding model (`sentence-transformers/nomic-ai/nomic-embed-text-v1.5`) for any newly created vector store
- The `default_provider_id` specifies which vector storage backend to use
- The `default_embedding_model` specifies both the inference provider and model for embeddings
You can ingest documents into the vector database using two methods: directly inserting pre-chunked documents or using the RAG Tool.
## Vector Store Operations
### Direct Document Insertion
### Creating Vector Stores
<Tabs>
<TabItem value="basic" label="Basic Insertion">
You can create vector stores with automatic or explicit embedding model selection:
```python
# You can insert a pre-chunked document directly into the vector db
chunks = [
{
"content": "Your document text here",
"mime_type": "text/plain",
"metadata": {
"document_id": "doc1",
"author": "Jane Doe",
},
},
]
client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks)
```
With a default embedding model configured, `client.vector_stores.create()` with no arguments uses the default embedding model and vector store provider.
</TabItem>
<TabItem value="embeddings" label="With Precomputed Embeddings">
If you decide to precompute embeddings for your documents, you can insert them directly into the vector database by including the embedding vectors in the chunk data. This is useful if you have a separate embedding service or if you want to customize the ingestion process.
```python
chunks_with_embeddings = [
{
"content": "First chunk of text",
"mime_type": "text/plain",
"embedding": [0.1, 0.2, 0.3, ...], # Your precomputed embedding vector
"metadata": {"document_id": "doc1", "section": "introduction"},
},
{
"content": "Second chunk of text",
"mime_type": "text/plain",
"embedding": [0.2, 0.3, 0.4, ...], # Your precomputed embedding vector
"metadata": {"document_id": "doc1", "section": "methodology"},
},
]
client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks_with_embeddings)
```
:::warning[Embedding Dimensions]
When providing precomputed embeddings, ensure the embedding dimension matches the `embedding_dimension` specified when registering the vector database.
:::
</TabItem>
</Tabs>
### Document Retrieval
You can query the vector database to retrieve documents based on their embeddings.
```python
# You can then query for these chunks
chunks_response = client.vector_io.query(
vector_db_id=vector_db_id,
query="What do you know about..."
)
```
## Using the RAG Tool
:::danger[Deprecation Notice]
The RAG Tool is being deprecated in favor of directly using the OpenAI-compatible Search API. We recommend migrating to the OpenAI APIs for better compatibility and future support.
:::
A better way to ingest documents is to use the RAG Tool. This tool allows you to ingest documents from URLs, files, etc. and automatically chunks them into smaller pieces. More examples for how to format a RAGDocument can be found in the [appendix](#more-ragdocument-examples).
### OpenAI API Integration & Migration
The RAG tool has been updated to use OpenAI-compatible APIs. This provides several benefits:
- **Files API Integration**: Documents are now uploaded using OpenAI's file upload endpoints
- **Vector Stores API**: Vector storage operations use OpenAI's vector store format with configurable chunking strategies
- **Error Resilience**: When processing multiple documents, individual failures are logged but don't crash the operation. Failed documents are skipped while successful ones continue processing.
### Migration Path
We recommend migrating to the OpenAI-compatible Search API for:
1. **Better OpenAI Ecosystem Integration**: Direct compatibility with OpenAI tools and workflows including the Responses API
2. **Future-Proof**: Continued support and feature development
3. **Full OpenAI Compatibility**: Vector Stores, Files, and Search APIs are fully compatible with OpenAI's Responses API
The OpenAI APIs are used under the hood, so you can continue to use your existing RAG Tool code with minimal changes. However, we recommend updating your code to use the new OpenAI-compatible APIs for better long-term support. If any documents fail to process, they will be logged in the response but will not cause the entire operation to fail.
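As a rough sketch of what a migrated query looks like, a RAG Tool query maps to a direct search against an OpenAI-compatible vector store (assuming a vector store like the `vs` created in the Quick Start above; add `await` if you use an async client):
```python
# OpenAI-compatible equivalent of rag_tool.query (sketch)
results = client.vector_stores.search(
    vector_store_id=vs.id,
    query="What do you know about...",
    max_num_results=5,
)
for result in results.data:
    print(result.score, result.content[0].text)
```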
### RAG Tool Example
```python
from llama_stack_client import RAGDocument
urls = ["memory_optimizations.rst", "chat.rst", "llama3.rst"]
documents = [
RAGDocument(
document_id=f"num-{i}",
content=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}",
mime_type="text/plain",
metadata={},
)
for i, url in enumerate(urls)
]
client.tool_runtime.rag_tool.insert(
documents=documents,
vector_db_id=vector_db_id,
chunk_size_in_tokens=512,
)
# Query documents
results = client.tool_runtime.rag_tool.query(
vector_db_ids=[vector_db_id],
content="What do you know about...",
)
```
### Custom Context Configuration
You can configure how the RAG tool adds metadata to the context if you find it useful for your application:
```python
# Query documents with custom template
results = client.tool_runtime.rag_tool.query(
vector_db_ids=[vector_db_id],
content="What do you know about...",
query_config={
"chunk_template": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n",
},
)
```
## Building RAG-Enhanced Agents
One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example:
### Agent with Knowledge Search
```python
from llama_stack_client import Agent
# Create agent with memory
agent = Agent(
client,
model="meta-llama/Llama-3.3-70B-Instruct",
instructions="You are a helpful assistant",
tools=[
{
"name": "builtin::rag/knowledge_search",
"args": {
"vector_db_ids": [vector_db_id],
# Defaults
"query_config": {
"chunk_size_in_tokens": 512,
"chunk_overlap_in_tokens": 0,
"chunk_template": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n",
},
            },
        }
    ],
)

# (For reference) Explicit vector store creation - specify embedding model and/or provider when needed
vs = client.vector_stores.create(
    extra_body={
        "provider_id": "faiss",  # Optional: specify vector store provider
        "embedding_model": "sentence-transformers/nomic-ai/nomic-embed-text-v1.5",
        "embedding_dimension": 768,  # Optional: will be auto-detected if not provided
    }
)
session_id = agent.create_session("rag_session")
# Ask questions about documents in the vector db, and the agent will query the db to answer the question.
response = agent.create_turn(
messages=[{"role": "user", "content": "How to optimize memory in PyTorch?"}],
session_id=session_id,
)
```
:::tip[Agent Instructions]
The `instructions` field in the `AgentConfig` can be used to guide the agent's behavior. It is important to experiment with different instructions to see what works best for your use case.
:::
### Document-Aware Conversations
You can also pass documents along with the user's message and ask questions about them:
```python
# Initial document ingestion
response = agent.create_turn(
messages=[
{"role": "user", "content": "I am providing some documents for reference."}
],
documents=[
{
"content": "https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/memory_optimizations.rst",
"mime_type": "text/plain",
}
],
session_id=session_id,
)
# Query with RAG
response = agent.create_turn(
messages=[{"role": "user", "content": "What are the key topics in the documents?"}],
session_id=session_id,
)
```
### Viewing Agent Responses
You can print the response with the following:
```python
from llama_stack_client import AgentEventLogger
for log in AgentEventLogger().log(response):
log.print()
```
## Vector Database Management
### Unregistering Vector DBs
If you need to clean up and unregister vector databases, you can do so as follows:
<Tabs>
<TabItem value="single" label="Single Database">
```python
# Unregister a specified vector database
vector_db_id = "my_vector_db_id"
print(f"Unregistering vector database: {vector_db_id}")
client.vector_dbs.unregister(vector_db_id=vector_db_id)
```
</TabItem>
<TabItem value="all" label="All Databases">
```python
# Unregister all vector databases
for vector_db in client.vector_dbs.list():
    print(f"Unregistering vector database: {vector_db.identifier}")
    client.vector_dbs.unregister(vector_db_id=vector_db.identifier)
```
</TabItem>
</Tabs>
## Best Practices
### 🎯 **Document Chunking**
- Use appropriate chunk sizes (512 tokens is often a good starting point)
- Consider overlap between chunks for better context preservation
- Experiment with different chunking strategies for your content type
### 🔍 **Embedding Strategy**
- Choose embedding models that match your domain
- Consider the trade-off between embedding dimension and performance
- Test different embedding models for your specific use case
### 📊 **Query Optimization**
- Use specific, well-formed queries for better retrieval
- Experiment with different search strategies
- Consider hybrid approaches (keyword + semantic search)
### 🛡️ **Error Handling**
- Implement proper error handling for failed document processing
- Monitor ingestion success rates
- Have fallback strategies for retrieval failures
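For example, one simple pattern is to ingest documents one at a time so that a single bad document does not abort the whole batch (a sketch that reuses the `documents` list and `vector_db_id` from the RAG Tool example above):
```python
# Ingest documents individually and record failures instead of crashing (sketch)
failed = []
for doc in documents:
    try:
        client.tool_runtime.rag_tool.insert(
            documents=[doc],
            vector_db_id=vector_db_id,
            chunk_size_in_tokens=512,
        )
    except Exception as exc:  # prefer a narrower exception type in real code
        failed.append((doc.document_id, str(exc)))

if failed:
    print(f"Failed to ingest {len(failed)} documents: {failed}")
```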
## Appendix
### More RAGDocument Examples
Here are various ways to create RAGDocument objects for different content types:
```python
from llama_stack_client import RAGDocument
import base64
import requests
# File URI
RAGDocument(document_id="num-0", content={"uri": "file://path/to/file"})
# Plain text
RAGDocument(document_id="num-1", content="plain text")
# Explicit text input
RAGDocument(
document_id="num-2",
content={
"type": "text",
"text": "plain text input",
}, # for inputs that should be treated as text explicitly
)
# Image from URL
RAGDocument(
document_id="num-3",
content={
"type": "image",
"image": {"url": {"uri": "https://mywebsite.com/image.jpg"}},
},
)
# Base64 encoded image
B64_ENCODED_IMAGE = base64.b64encode(
requests.get(
"https://raw.githubusercontent.com/meta-llama/llama-stack/refs/heads/main/docs/_static/llama-stack.png"
).content
)
RAGDocument(
document_id="num-4",
content={"type": "image", "image": {"data": B64_ENCODED_IMAGE}},
)
```
For more strongly typed interaction use the typed dicts found [here](https://github.com/meta-llama/llama-stack-client-python/blob/38cd91c9e396f2be0bec1ee96a19771582ba6f17/src/llama_stack_client/types/shared_params/document.py).

View file

@ -391,5 +391,4 @@ client.shields.register(
- **[Agents](./agent)** - Integrating safety shields with intelligent agents
- **[Agent Execution Loop](./agent_execution_loop)** - Understanding safety in the execution flow
- **[Evaluations](./evals)** - Evaluating safety shield effectiveness
- **[Telemetry](./telemetry)** - Monitoring safety violations and metrics
- **[Llama Guard Documentation](https://github.com/meta-llama/PurpleLlama/tree/main/Llama-Guard3)** - Advanced safety model details

View file

@ -10,58 +10,8 @@ import TabItem from '@theme/TabItem';
# Telemetry
The Llama Stack telemetry system provides comprehensive tracing, metrics, and logging capabilities. It supports multiple sink types including OpenTelemetry, SQLite, and Console output for complete observability of your AI applications.
The Llama Stack uses OpenTelemetry to provide comprehensive tracing, metrics, and logging capabilities.
## Event Types
The telemetry system supports three main types of events:
<Tabs>
<TabItem value="unstructured" label="Unstructured Logs">
Free-form log messages with severity levels for general application logging:
```python
unstructured_log_event = UnstructuredLogEvent(
message="This is a log message",
severity=LogSeverity.INFO
)
```
</TabItem>
<TabItem value="metrics" label="Metric Events">
Numerical measurements with units for tracking performance and usage:
```python
metric_event = MetricEvent(
metric="my_metric",
value=10,
unit="count"
)
```
</TabItem>
<TabItem value="structured" label="Structured Logs">
System events like span start/end that provide structured operation tracking:
```python
structured_log_event = SpanStartPayload(
name="my_span",
parent_span_id="parent_span_id"
)
```
</TabItem>
</Tabs>
## Spans and Traces
- **Spans**: Represent individual operations with timing information and hierarchical relationships
- **Traces**: Collections of related spans that form a complete request flow across your application
This hierarchical structure allows you to understand the complete execution path of requests through your Llama Stack application.
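For example, with an OpenTelemetry tracer configured, nested spans naturally form this hierarchy (a minimal sketch; the tracer name and attributes are illustrative):
```python
from opentelemetry import trace

tracer = trace.get_tracer("my-ai-app")

# The outer span is the trace root; the inner span becomes its child
with tracer.start_as_current_span("handle_request") as request_span:
    request_span.set_attribute("user_id", "user123")
    with tracer.start_as_current_span("chat_completion") as inference_span:
        inference_span.set_attribute("model", "meta-llama/Llama-3.2-3B-Instruct")
        # ... call the inference API here ...
```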
## Automatic Metrics Generation
@ -129,21 +79,6 @@ Send events to an OpenTelemetry Collector for integration with observability pla
- Compatible with all OpenTelemetry collectors
- Supports both traces and metrics
</TabItem>
<TabItem value="sqlite" label="SQLite">
Store events in a local SQLite database for direct querying:
**Use Cases:**
- Local development and debugging
- Custom analytics and reporting
- Offline analysis of application behavior
**Features:**
- Direct SQL querying capabilities
- Persistent local storage
- No external dependencies
</TabItem>
<TabItem value="console" label="Console">
@ -174,9 +109,8 @@ telemetry:
provider_type: inline::meta-reference
config:
service_name: "llama-stack-service"
    sinks: ['console', 'otel_trace', 'otel_metric']
    otel_exporter_otlp_endpoint: "http://localhost:4318"
```
### Environment Variables
@ -185,7 +119,7 @@ Configure telemetry behavior using environment variables:
- **`OTEL_EXPORTER_OTLP_ENDPOINT`**: OpenTelemetry Collector endpoint (default: `http://localhost:4318`)
- **`OTEL_SERVICE_NAME`**: Service name for telemetry (default: empty string)
- **`TELEMETRY_SINKS`**: Comma-separated list of sinks (default: `[]`)
### Quick Setup: Complete Telemetry Stack
@ -248,37 +182,10 @@ Forward metrics to other observability systems:
</TabItem>
</Tabs>
## SQLite Querying
The `sqlite` sink allows you to query traces without an external system. This is particularly useful for development and custom analytics.
### Example Queries
```sql
-- Query recent traces
SELECT * FROM traces WHERE timestamp > datetime('now', '-1 hour');
-- Analyze span durations
SELECT name, AVG(duration_ms) as avg_duration
FROM spans
GROUP BY name
ORDER BY avg_duration DESC;
-- Find slow operations
SELECT * FROM spans
WHERE duration_ms > 1000
ORDER BY duration_ms DESC;
```
:::tip[Advanced Analytics]
Refer to the [Getting Started notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb) for more examples on querying traces and spans programmatically.
:::
## Best Practices
### 🔍 **Monitoring Strategy**
- Use OpenTelemetry for production environments
- Combine multiple sinks for development (console + SQLite)
- Set up alerts on key metrics like token usage and error rates
### 📊 **Metrics Analysis**
@ -293,45 +200,8 @@ Refer to the [Getting Started notebook](https://github.com/meta-llama/llama-stac
### 🔧 **Configuration Management**
- Use environment variables for flexible deployment
- Configure appropriate retention policies for SQLite
- Ensure proper network access to OpenTelemetry collectors
## Integration Examples
### Basic Telemetry Setup
```python
from llama_stack_client import LlamaStackClient
# Client with telemetry headers
client = LlamaStackClient(
base_url="http://localhost:8000",
extra_headers={
"X-Telemetry-Service": "my-ai-app",
"X-Telemetry-Version": "1.0.0"
}
)
# All API calls will be automatically traced
response = client.chat.completions.create(
model="meta-llama/Llama-3.2-3B-Instruct",
messages=[{"role": "user", "content": "Hello!"}]
)
```
### Custom Telemetry Context
```python
# Assumes the OpenTelemetry SDK is configured elsewhere in your application
from opentelemetry import trace

tracer = trace.get_tracer(__name__)

# Add custom span attributes for better tracking
with tracer.start_as_current_span("custom_operation") as span:
span.set_attribute("user_id", "user123")
span.set_attribute("operation_type", "chat_completion")
response = client.chat.completions.create(
model="meta-llama/Llama-3.2-3B-Instruct",
messages=[{"role": "user", "content": "Hello!"}]
)
```
## Related Resources

View file

@ -104,23 +104,19 @@ client.toolgroups.register(
)
```
Note that most of the more useful MCP servers need you to authenticate with them. Many of them use OAuth2.0 for authentication. You can provide authorization headers to send to the MCP server using the "Provider Data" abstraction provided by Llama Stack. When making an agent call,
Note that most of the more useful MCP servers need you to authenticate with them. Many of them use OAuth2.0 for authentication. You can provide the authorization token when creating the Agent:
```python
agent = Agent(
    ...,
    tools=[
        {
            "type": "mcp",
            "server_url": "https://mcp.deepwiki.com/sse",
            "server_label": "mcp::deepwiki",
            "authorization": "<your_access_token>",  # OAuth token (without "Bearer " prefix)
        }
    ],
)
agent.create_turn(...)
```

View file

@ -62,6 +62,10 @@ The new `/v2` API must be introduced alongside the existing `/v1` API and run in
When a `/v2` API is introduced, a clear and generous deprecation policy for the `/v1` API must be published simultaneously. This policy must outline the timeline for the eventual removal of the `/v1` API, giving users ample time to migrate.
### Deprecated APIs
Deprecated APIs are those that are no longer actively maintained or supported. Deprecated APIs are marked with the flag `deprecated = True` in the OpenAPI spec. These APIs will be removed in a future release.
### API Stability vs. Provider Stability
The leveling introduced in this document relates to the stability of the API and not specifically the providers within the API.

View file

@ -58,7 +58,7 @@ External APIs must expose a `available_providers()` function in their module tha
```python
# llama_stack_api_weather/api.py
from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec
from llama_stack_api import Api, InlineProviderSpec, ProviderSpec
def available_providers() -> list[ProviderSpec]:
@ -79,7 +79,7 @@ A Protocol class like so:
# llama_stack_api_weather/api.py
from typing import Protocol
from llama_stack.schema_utils import webmethod
from llama_stack_api import webmethod
class WeatherAPI(Protocol):
@ -151,13 +151,12 @@ __all__ = ["WeatherAPI", "available_providers"]
# llama-stack-api-weather/src/llama_stack_api_weather/weather.py
from typing import Protocol
from llama_stack_api import (
Api,
ProviderSpec,
RemoteProviderSpec,
webmethod,
)
from llama_stack.schema_utils import webmethod
def available_providers() -> list[ProviderSpec]:
return [

View file

@ -7,7 +7,7 @@ sidebar_position: 1
# APIs
A Llama Stack API is described as a collection of REST endpoints. We currently support the following APIs:
A Llama Stack API is described as a collection of REST endpoints following OpenAI API standards. We currently support the following APIs:
- **Inference**: run inference with a LLM
- **Safety**: apply safety policies to the output at a Systems (not only model) level
@ -16,13 +16,26 @@ A Llama Stack API is described as a collection of REST endpoints. We currently s
- **Scoring**: evaluate outputs of the system
- **Eval**: generate outputs (via Inference or Agents) and perform scoring
- **VectorIO**: perform operations on vector stores, such as adding documents, searching, and deleting documents
- **Files**: manage file uploads, storage, and retrieval
- **Telemetry**: collect telemetry data from the system
- **Post Training**: fine-tune a model
- **Tool Runtime**: interact with various tools and protocols
- **Responses**: generate responses from an LLM using this OpenAI compatible API.
- **Responses**: generate responses from an LLM
We are working on adding a few more APIs to complete the application lifecycle. These will include:
- **Batch Inference**: run inference on a dataset of inputs
- **Batch Agents**: run agents on a dataset of inputs
- **Synthetic Data Generation**: generate synthetic data for model development
- **Batches**: OpenAI-compatible batch management for inference
## OpenAI API Compatibility
We are working on adding OpenAI API compatibility to Llama Stack. This will allow you to use Llama Stack with OpenAI API clients and tools.
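For example, the standard OpenAI Python client can already be pointed at a running Llama Stack server (a minimal sketch; the base URL and model name depend on your deployment):
```python
from openai import OpenAI

# Point the OpenAI client at a local Llama Stack server
client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")

response = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",  # any model registered with your stack
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
```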
### File Operations and Vector Store Integration
The Files API and Vector Store APIs work together through file operations, enabling automatic document processing and search. This integration implements the [OpenAI Vector Store Files API specification](https://platform.openai.com/docs/api-reference/vector-stores-files) and allows you to:
- Upload documents through the Files API
- Automatically process and chunk documents into searchable vectors
- Store processed content in vector databases based on the availability of [our providers](../../providers/index.mdx)
- Search through documents using natural language queries
For detailed information about this integration, see [File Operations and Vector Store Integration](../file_operations_vector_stores.md).

View file

@ -0,0 +1,420 @@
# File Operations and Vector Store Integration
## Overview
Llama Stack provides seamless integration between the Files API and Vector Store APIs, enabling you to upload documents and automatically process them into searchable vector embeddings. This integration implements file operations following the [OpenAI Vector Store Files API specification](https://platform.openai.com/docs/api-reference/vector-stores-files).
## Enhanced Capabilities Beyond OpenAI
While Llama Stack maintains full compatibility with OpenAI's Vector Store API, it provides several additional capabilities that enhance functionality and flexibility:
### **Embedding Model Specification**
Unlike OpenAI's vector stores which use a fixed embedding model, Llama Stack allows you to specify which embedding model to use when creating a vector store:
```python
# Create vector store with specific embedding model
vector_store = client.vector_stores.create(
name="my_documents",
embedding_model="all-MiniLM-L6-v2", # Specify your preferred model
embedding_dimension=384,
)
```
### **Advanced Search Modes**
Llama Stack supports multiple search modes beyond basic vector similarity:
- **Vector Search**: Pure semantic similarity search using embeddings
- **Keyword Search**: Traditional keyword-based search for exact matches
- **Hybrid Search**: Combines both vector and keyword search for optimal results
```python
# Different search modes
results = await client.vector_stores.search(
vector_store_id=vector_store.id,
query="machine learning algorithms",
search_mode="hybrid", # or "vector", "keyword"
max_num_results=5,
)
```
### **Flexible Ranking Options**
For hybrid search, Llama Stack offers configurable ranking strategies:
- **RRF (Reciprocal Rank Fusion)**: Combines rankings with configurable impact factor
- **Weighted Ranker**: Linear combination of vector and keyword scores with adjustable weights
```python
# Custom ranking configuration
results = await client.vector_stores.search(
vector_store_id=vector_store.id,
query="neural networks",
search_mode="hybrid",
ranking_options={
"ranker": {"type": "weighted", "alpha": 0.7} # 70% vector, 30% keyword
},
)
```
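RRF can be requested the same way; a sketch (the `impact_factor` field name is an assumption based on the weighted example above):
```python
# Reciprocal Rank Fusion ranking (sketch; field name assumed)
results = await client.vector_stores.search(
    vector_store_id=vector_store.id,
    query="neural networks",
    search_mode="hybrid",
    ranking_options={
        "ranker": {"type": "rrf", "impact_factor": 60.0},
    },
)
```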
### **Provider Selection**
Choose from multiple vector store providers based on your specific needs:
- **Inline Providers**: FAISS (fast in-memory), SQLite-vec (disk-based), Milvus (high-performance)
- **Remote Providers**: ChromaDB, Qdrant, Weaviate, Postgres (PGVector), Milvus
```python
# Specify provider when creating vector store
vector_store = client.vector_stores.create(
name="my_documents", provider_id="sqlite-vec" # Choose your preferred provider
)
```
## How It Works
The file operations work through several key components:
1. **File Upload**: Documents are uploaded through the Files API
2. **Automatic Processing**: Files are automatically chunked and converted to embeddings
3. **Vector Storage**: Chunks are stored in vector databases with metadata
4. **Search & Retrieval**: Users can search through processed documents using natural language
## Supported Vector Store Providers
The following vector store providers support file operations:
### Inline Providers (Single Node)
- **FAISS**: Fast in-memory vector similarity search
- **SQLite-vec**: Disk-based storage with hybrid search capabilities
### Remote Providers (Hosted)
- **ChromaDB**: Vector database with metadata filtering
- **Weaviate**: Vector database with GraphQL interface
- **Postgres (PGVector)**: Vector extensions for PostgreSQL
### Both Inline & Remote Providers
- **Milvus**: High-performance vector database with advanced indexing
- **Qdrant**: Vector similarity search with payload filtering
## File Processing Pipeline
### 1. File Upload
```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8000")
# Upload a document
with open("document.pdf", "rb") as f:
    file_info = await client.files.create(file=f, purpose="assistants")
```
### 2. Attach to Vector Store
```python
# Create a vector store
vector_store = client.vector_stores.create(name="my_documents")
# Attach the file to the vector store
file_attach_response = await client.vector_stores.files.create(
vector_store_id=vector_store.id, file_id=file_info.id
)
```
### 3. Automatic Processing
The system automatically:
- Detects the file type and extracts text content
- Splits content into chunks (default: 800 tokens with 400 token overlap)
- Generates embeddings for each chunk
- Stores chunks with metadata in the vector store
- Updates file status to "completed"
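If you want to wait for processing before searching, you can poll the file's status until it leaves `in_progress` (a minimal sketch; assumes a `vector_stores.files.retrieve` method as in the OpenAI-compatible client):
```python
import asyncio


async def wait_until_processed(client, vector_store_id, file_id, poll_seconds=2.0):
    # Poll a vector store file until processing finishes (sketch)
    while True:
        vs_file = await client.vector_stores.files.retrieve(
            vector_store_id=vector_store_id, file_id=file_id
        )
        if vs_file.status in ("completed", "failed", "cancelled"):
            return vs_file
        await asyncio.sleep(poll_seconds)
```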
### 4. Search and Retrieval
```python
# Search through processed documents
search_results = await client.vector_stores.search(
vector_store_id=vector_store.id,
query="What is the main topic discussed?",
max_num_results=5,
)
# Process results
for result in search_results.data:
print(f"Score: {result.score}")
for content in result.content:
print(f"Content: {content.text}")
```
## Supported File Types
The FileResponse system supports various document formats:
- **Text Files**: `.txt`, `.md`, `.rst`
- **Documents**: `.pdf`, `.docx`, `.doc`
- **Code**: `.py`, `.js`, `.java`, `.cpp`, etc.
- **Data**: `.json`, `.csv`, `.xml`
- **Web Content**: HTML files
## Chunking Strategies
### Default Strategy
The default chunking strategy uses:
- **Max Chunk Size**: 800 tokens
- **Overlap**: 400 tokens
- **Method**: Semantic boundary detection
### Custom Chunking
You can customize chunking when attaching files:
```python
# Attach file with a custom static chunking strategy (example values)
file_attach_response = await client.vector_stores.files.create(
    vector_store_id=vector_store.id,
    file_id=file_info.id,
    chunking_strategy={
        "type": "static",
        "static": {"max_chunk_size_tokens": 600, "chunk_overlap_tokens": 200},
    },
)
```
**Note**: While Llama Stack is OpenAI-compatible, it also supports additional options beyond the standard OpenAI API. When creating vector stores, you can specify custom embedding models and embedding dimensions that will be used when processing chunks from attached files.
## File Management
### List Files in Vector Store
```python
# List all files in a vector store
files = await client.vector_stores.files.list(vector_store_id=vector_store.id)
for file in files:
print(f"File: {file.filename}, Status: {file.status}")
```
### File Status Tracking
Files go through several statuses:
- **in_progress**: File is being processed
- **completed**: File successfully processed and searchable
- **failed**: Processing failed (check `last_error` for details)
- **cancelled**: Processing was cancelled
### Retrieve File Content
```python
# Get chunked content from vector store
content_response = await client.vector_stores.files.retrieve_content(
vector_store_id=vector_store.id, file_id=file_info.id
)
for chunk in content_response.content:
print(f"Chunk {chunk.metadata.get('chunk_index', 0)}: {chunk.text}")
```
## Vector Store Management
### List Vector Stores
Retrieve a paginated list of all vector stores:
```python
# List all vector stores with default pagination
vector_stores = await client.vector_stores.list()
# Custom pagination and ordering
vector_stores = await client.vector_stores.list(
limit=10,
order="asc", # or "desc"
after="vs_12345678", # cursor-based pagination
)
for store in vector_stores.data:
print(f"Store: {store.name}, Files: {store.file_counts.total}")
print(f"Created: {store.created_at}, Status: {store.status}")
```
### Retrieve Vector Store Details
Get detailed information about a specific vector store:
```python
# Get vector store details
store_details = await client.vector_stores.retrieve(vector_store_id="vs_12345678")
print(f"Name: {store_details.name}")
print(f"Status: {store_details.status}")
print(f"File Counts: {store_details.file_counts}")
print(f"Usage: {store_details.usage_bytes} bytes")
print(f"Created: {store_details.created_at}")
print(f"Metadata: {store_details.metadata}")
```
### Update Vector Store
Modify vector store properties such as name, metadata, or expiration settings:
```python
# Update vector store name and metadata
updated_store = await client.vector_stores.update(
vector_store_id="vs_12345678",
name="Updated Document Collection",
metadata={
"description": "Updated collection for research",
"category": "research",
"version": "2.0",
},
)
# Set expiration policy
expired_store = await client.vector_stores.update(
vector_store_id="vs_12345678",
expires_after={"anchor": "last_active_at", "days": 30},
)
print(f"Updated store: {updated_store.name}")
print(f"Last active: {updated_store.last_active_at}")
```
### Delete Vector Store
Remove a vector store and all its associated data:
```python
# Delete a vector store
delete_response = await client.vector_stores.delete(vector_store_id="vs_12345678")
if delete_response.deleted:
print(f"Vector store {delete_response.id} successfully deleted")
else:
print("Failed to delete vector store")
```
**Important Notes:**
- Deleting a vector store removes all files, chunks, and embeddings
- This operation cannot be undone
- The underlying vector database is also cleaned up
- Consider backing up important data before deletion
## Search Capabilities
### Vector Search
Pure similarity search using embeddings:
```python
results = await client.vector_stores.search(
vector_store_id=vector_store.id,
query="machine learning algorithms",
max_num_results=10,
)
```
### Filtered Search
Combine vector search with metadata filtering:
```python
results = await client.vector_stores.search(
vector_store_id=vector_store.id,
query="machine learning algorithms",
filters={"file_type": "pdf", "upload_date": "2024-01-01"},
max_num_results=10,
)
```
### Hybrid Search
[SQLite-vec](../providers/vector_io/inline_sqlite-vec.mdx), [pgvector](../providers/vector_io/remote_pgvector.mdx), and [Milvus](../providers/vector_io/inline_milvus.mdx) support combining vector and keyword search.
## Performance Considerations
> **Note**: For detailed performance optimization strategies, see [Performance Considerations](../providers/files/openai_file_operations_support.md#performance-considerations) in the provider documentation.
**Key Points:**
- **Chunk Size**: 400-600 tokens for precision, 800-1200 for context
- **Storage**: Choose provider based on your performance needs
- **Search**: Optimize for your specific use case
## Error Handling
> **Note**: For comprehensive troubleshooting and error handling, see [Troubleshooting](../providers/files/openai_file_operations_support.md#troubleshooting) in the provider documentation.
**Common Issues:**
- File processing failures (format, size limits)
- Search performance optimization
- Storage and memory issues
## Best Practices
> **Note**: For detailed best practices and recommendations, see [Best Practices](../providers/files/openai_file_operations_support.md#best-practices) in the provider documentation.
**Key Recommendations:**
- File organization and naming conventions
- Chunking strategy optimization
- Metadata and monitoring practices
- Regular cleanup and maintenance
## Integration Examples
### RAG Application
```python
# Build a RAG system with file uploads
async def build_rag_system():
# Create vector store
vector_store = client.vector_stores.create(name="knowledge_base")
# Upload and process documents
documents = ["doc1.pdf", "doc2.pdf", "doc3.pdf"]
for doc in documents:
with open(doc, "rb") as f:
file_info = await client.files.create(file=f, purpose="assistants")
await client.vector_stores.files.create(
vector_store_id=vector_store.id, file_id=file_info.id
)
return vector_store
# Query the RAG system
async def query_rag(vector_store_id, question):
results = await client.vector_stores.search(
vector_store_id=vector_store_id, query=question, max_num_results=5
)
return results
```
### Document Analysis
```python
# Analyze document content through vector search
async def analyze_document(vector_store_id, file_id):
# Get document content
content = await client.vector_stores.files.retrieve_content(
vector_store_id=vector_store_id, file_id=file_id
)
# Search for specific topics
topics = ["introduction", "methodology", "conclusion"]
analysis = {}
for topic in topics:
results = await client.vector_stores.search(
vector_store_id=vector_store_id, query=topic, max_num_results=3
)
analysis[topic] = results.data
return analysis
```
## Next Steps
- Explore the [Files API documentation](../../providers/files/files.mdx) for detailed API reference
- Check [Vector Store Providers](../providers/vector_io/index.mdx) for specific implementation details
- Review [Getting Started](../getting_started/quickstart.mdx) for quick setup instructions

View file

@ -1,233 +1,13 @@
# Contributing to Llama Stack
We want to make contributing to this project as easy and transparent as
possible.
---
title: Contributing
description: Contributing to Llama Stack
sidebar_label: Contributing to Llama Stack
sidebar_position: 3
hide_title: true
---
## Set up your development environment
import Contributing from '!!raw-loader!../../../CONTRIBUTING.md';
import ReactMarkdown from 'react-markdown';
We use [uv](https://github.com/astral-sh/uv) to manage python dependencies and virtual environments.
You can install `uv` by following this [guide](https://docs.astral.sh/uv/getting-started/installation/).
You can install the dependencies by running:
```bash
cd llama-stack
uv sync --group dev
uv pip install -e .
source .venv/bin/activate
```
```{note}
You can use a specific version of Python with `uv` by adding the `--python <version>` flag (e.g. `--python 3.12`).
Otherwise, `uv` will automatically select a Python version according to the `requires-python` section of the `pyproject.toml`.
For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/).
```
Note that you can create a dotenv file `.env` that includes necessary environment variables:
```
LLAMA_STACK_BASE_URL=http://localhost:8321
LLAMA_STACK_CLIENT_LOG=debug
LLAMA_STACK_PORT=8321
LLAMA_STACK_CONFIG=<provider-name>
TAVILY_SEARCH_API_KEY=
BRAVE_SEARCH_API_KEY=
```
And then use this dotenv file when running client SDK tests via the following:
```bash
uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py --text-model=meta-llama/Llama-3.1-8B-Instruct
```
### Pre-commit Hooks
We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running:
```bash
uv run pre-commit install
```
After that, pre-commit hooks will run automatically before each commit.
Alternatively, if you don't want to install the pre-commit hooks, you can run the checks manually by running:
```bash
uv run pre-commit run --all-files
```
```{caution}
Before pushing your changes, make sure that the pre-commit hooks have passed successfully.
```
## Discussions -> Issues -> Pull Requests
We actively welcome your pull requests. However, please read the following. This is heavily inspired by [Ghostty](https://github.com/ghostty-org/ghostty/blob/main/CONTRIBUTING.md).
If in doubt, please open a [discussion](https://github.com/meta-llama/llama-stack/discussions); we can always convert that to an issue later.
### Issues
We use GitHub issues to track public bugs. Please ensure your description is
clear and has sufficient instructions to be able to reproduce the issue.
Meta has a [bounty program](http://facebook.com/whitehat/info) for the safe
disclosure of security bugs. In those cases, please go through the process
outlined on that page and do not file a public issue.
### Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need
to do this once to work on any of Meta's open source projects.
Complete your CLA here: [https://code.facebook.com/cla](https://code.facebook.com/cla)
**I'd like to contribute!**
If you are new to the project, start by looking at the issues tagged with "good first issue". If you're interested
leave a comment on the issue and a triager will assign it to you.
Please avoid picking up too many issues at once. This helps you stay focused and ensures that others in the community also have opportunities to contribute.
- Try to work on only 1-2 issues at a time, especially if you're still getting familiar with the codebase.
- Before taking an issue, check if it's already assigned or being actively discussed.
- If you're blocked or can't continue with an issue, feel free to unassign yourself or leave a comment so others can step in.
**I have a bug!**
1. Search the issue tracker and discussions for similar issues.
2. If you don't have steps to reproduce, open a discussion.
3. If you have steps to reproduce, open an issue.
**I have an idea for a feature!**
1. Open a discussion.
**I've implemented a feature!**
1. If there is an issue for the feature, open a pull request.
2. If there is no issue, open a discussion and link to your branch.
**I have a question!**
1. Open a discussion or use [Discord](https://discord.gg/llama-stack).
**Opening a Pull Request**
1. Fork the repo and create your branch from `main`.
2. If you've changed APIs, update the documentation.
3. Ensure the test suite passes.
4. Make sure your code lints using `pre-commit`.
5. If you haven't already, complete the Contributor License Agreement ("CLA").
6. Ensure your pull request follows the [conventional commits format](https://www.conventionalcommits.org/en/v1.0.0/).
7. Ensure your pull request follows the [coding style](#coding-style).
Please keep pull requests (PRs) small and focused. If you have a large set of changes, consider splitting them into logically grouped, smaller PRs to facilitate review and testing.
```{tip}
As a general guideline:
- Experienced contributors should try to keep no more than 5 open PRs at a time.
- New contributors are encouraged to have only one open PR at a time until they're familiar with the codebase and process.
```
## Repository guidelines
### Coding Style
* Comments should provide meaningful insights into the code. Avoid filler comments that simply
  describe the next step, as they create unnecessary clutter; the same goes for docstrings.
* Prefer comments to clarify surprising behavior and/or relationships between parts of the code
rather than explain what the next line of code does.
* When catching exceptions, prefer using a specific exception type rather than a broad catch-all like
  `Exception`.
* Error messages should be prefixed with "Failed to ..."
* 4 spaces for indentation rather than tab
* When using `# noqa` to suppress a style or linter warning, include a comment explaining the
justification for bypassing the check.
* When using `# type: ignore` to suppress a mypy warning, include a comment explaining the
justification for bypassing the check.
* Don't use unicode characters in the codebase. ASCII-only is preferred for compatibility or
readability reasons.
* Provider configuration classes should be Pydantic models, and each field should include a `description`
  that explains the configuration option. These descriptions are used to generate the provider
  documentation.
* When possible, use keyword arguments only when calling functions.
* Llama Stack utilizes custom Exception classes for certain Resources that should be used where applicable.
### License
By contributing to Llama, you agree that your contributions will be licensed
under the LICENSE file in the root directory of this source tree.
## Common Tasks
Some tips about common tasks you work on while contributing to Llama Stack:
### Using `llama stack build`
Building a stack image will use the production version of the `llama-stack` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_STACK_CLIENT_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands.
Example:
```bash
cd work/
git clone https://github.com/meta-llama/llama-stack.git
git clone https://github.com/meta-llama/llama-stack-client-python.git
cd llama-stack
LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama stack build --distro <...>
```
### Updating distribution configurations
If you have made changes to a provider's configuration in any form (introducing a new config key, or
changing models, etc.), you should run `./scripts/distro_codegen.py` to re-generate various YAML
files as well as the documentation. You should not change `docs/source/.../distributions/` files
manually as they are auto-generated.
### Updating the provider documentation
If you have made changes to a provider's configuration, you should run `./scripts/provider_codegen.py`
to re-generate the documentation. You should not change `docs/source/.../providers/` files manually
as they are auto-generated.
Note that the provider "description" field will be used to generate the provider documentation.
### Building the Documentation
If you are making changes to the documentation at [https://llamastack.github.io/](https://llamastack.github.io/), you can use the following command to build the documentation and preview your changes.
```bash
# This rebuilds the documentation pages and the OpenAPI spec.
npm install
npm run gen-api-docs all
npm run build
# This will start a local server (usually at http://127.0.0.1:3000).
npm run serve
```
### Update API Documentation
If you modify or add new API endpoints, update the API documentation accordingly. You can do this by running the following command:
```bash
uv run ./docs/openapi_generator/run_openapi_generator.sh
```
The generated API schema will be available in `docs/static/`. Make sure to review the changes before committing.
## Adding a New Provider
See:
- [Adding a New API Provider Page](./new_api_provider.mdx) which describes how to add new API providers to the Stack.
- [Vector Database Page](./new_vector_database.mdx) which describes how to add a new vector databases with Llama Stack.
- [External Provider Page](/docs/providers/external/) which describes how to add external providers to the Stack.
## Testing
See the [Testing README](https://github.com/meta-llama/llama-stack/blob/main/tests/README.md) for detailed testing information.
## Advanced Topics
For developers who need deeper understanding of the testing system internals:
- [Record-Replay Testing](./testing/record-replay.mdx)
### Benchmarking
See the [Benchmarking README](https://github.com/meta-llama/llama-stack/blob/main/benchmarking/k8s-benchmark/README.md) for benchmarking information.
<ReactMarkdown>{Contributing}</ReactMarkdown>

View file

@ -67,7 +67,7 @@ def get_base_url(self) -> str:
## Testing the Provider
Before running tests, you must have required dependencies installed. This depends on the providers or distributions you are testing. For example, if you are testing the `together` distribution, you should install dependencies via `llama stack build --distro together`.
Before running tests, you must have required dependencies installed. This depends on the providers or distributions you are testing. For example, if you are testing the `together` distribution, install its dependencies with `llama stack list-deps together | xargs -L1 uv pip install`.
### 1. Integration Testing

View file

@ -68,7 +68,9 @@ recordings/
Direct API calls with no recording or replay:
```python
from llama_stack.testing.api_recorder import api_recording, APIRecordingMode

with api_recording(mode=APIRecordingMode.LIVE):
response = await client.chat.completions.create(...)
```
@ -79,7 +81,7 @@ Use for initial development and debugging against real APIs.
Captures API interactions while passing through real responses:
```python
with api_recording(mode=APIRecordingMode.RECORD, storage_dir="./recordings"):
response = await client.chat.completions.create(...)
# Real API call made, response captured AND returned
```
@ -96,7 +98,7 @@ The recording process:
Returns stored responses instead of making API calls:
```python
with api_recording(mode=APIRecordingMode.REPLAY, storage_dir="./recordings"):
response = await client.chat.completions.create(...)
# No API call made, cached response returned instantly
```

View file

@ -10,7 +10,7 @@ import TabItem from '@theme/TabItem';
# Kubernetes Deployment Guide
Deploy Llama Stack and vLLM servers in a Kubernetes cluster instead of running them locally. This guide covers both local development with Kind and production deployment on AWS EKS.
Deploy Llama Stack and vLLM servers in a Kubernetes cluster instead of running them locally. This guide covers deployment using the Kubernetes operator to manage the Llama Stack server with Kind. The vLLM inference server is deployed manually.
## Prerequisites
@ -110,115 +110,176 @@ spec:
EOF
```
### Step 3: Configure Llama Stack
### Step 3: Install Kubernetes Operator
Update your run configuration:
```yaml
providers:
inference:
- provider_id: vllm
provider_type: remote::vllm
config:
url: http://vllm-server.default.svc.cluster.local:8000/v1
max_tokens: 4096
api_token: fake
```
Build container image:
Install the Llama Stack Kubernetes operator to manage Llama Stack deployments:
```bash
tmp_dir=$(mktemp -d) && cat >$tmp_dir/Containerfile.llama-stack-run-k8s <<EOF
FROM distribution-myenv:dev
RUN apt-get update && apt-get install -y git
RUN git clone https://github.com/meta-llama/llama-stack.git /app/llama-stack-source
ADD ./vllm-llama-stack-run-k8s.yaml /app/config.yaml
EOF
podman build -f $tmp_dir/Containerfile.llama-stack-run-k8s -t llama-stack-run-k8s $tmp_dir
# Install from the latest main branch
kubectl apply -f https://raw.githubusercontent.com/llamastack/llama-stack-k8s-operator/main/release/operator.yaml
# Or install a specific version (e.g., v0.4.0)
# kubectl apply -f https://raw.githubusercontent.com/llamastack/llama-stack-k8s-operator/v0.4.0/release/operator.yaml
```
### Step 4: Deploy Llama Stack Server
Verify the operator is running:
```bash
kubectl get pods -n llama-stack-operator-system
```
For more information about the operator, see the [llama-stack-k8s-operator repository](https://github.com/llamastack/llama-stack-k8s-operator).
### Step 4: Deploy Llama Stack Server using Operator
Create a `LlamaStackDistribution` custom resource to deploy the Llama Stack server. The operator will automatically create the necessary Deployment, Service, and other resources.
You can optionally override the default `run.yaml` using `spec.server.userConfig` with a ConfigMap (see [userConfig spec](https://github.com/llamastack/llama-stack-k8s-operator/blob/main/docs/api-overview.md#userconfigspec)).
```yaml
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
apiVersion: llamastack.io/v1alpha1
kind: LlamaStackDistribution
metadata:
name: llama-pvc
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 1Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: llama-stack-server
name: llamastack-vllm
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: llama-stack
template:
metadata:
labels:
app.kubernetes.io/name: llama-stack
spec:
containers:
- name: llama-stack
image: localhost/llama-stack-run-k8s:latest
imagePullPolicy: IfNotPresent
command: ["llama", "stack", "run", "/app/config.yaml"]
ports:
- containerPort: 5000
volumeMounts:
- name: llama-storage
mountPath: /root/.llama
volumes:
- name: llama-storage
persistentVolumeClaim:
claimName: llama-pvc
---
apiVersion: v1
kind: Service
metadata:
name: llama-stack-service
spec:
selector:
app.kubernetes.io/name: llama-stack
ports:
- protocol: TCP
port: 5000
targetPort: 5000
type: ClusterIP
server:
distribution:
name: starter
containerSpec:
port: 8321
env:
- name: VLLM_URL
value: "http://vllm-server.default.svc.cluster.local:8000/v1"
- name: VLLM_MAX_TOKENS
value: "4096"
- name: VLLM_API_TOKEN
value: "fake"
# Optional: override run.yaml from a ConfigMap using userConfig
userConfig:
configMap:
name: llama-stack-config
storage:
size: "20Gi"
mountPath: "/home/lls/.lls"
EOF
```
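If you use the optional `userConfig` override shown above, the referenced ConfigMap must already exist. A minimal sketch of creating it from a local run config (the ConfigMap name matches the example above; the `run.yaml` key name is an assumption, so check the userConfig spec linked earlier for the exact key the operator expects):
```bash
# Create the ConfigMap referenced by spec.server.userConfig
# (key name "run.yaml" is assumed; adjust to what the operator expects)
kubectl create configmap llama-stack-config --from-file=run.yaml=./run.yaml
```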
**Configuration Options:**
- `replicas`: Number of Llama Stack server instances to run
- `server.distribution.name`: The distribution to use (e.g., `starter` for the starter distribution). See the [list of supported distributions](https://github.com/llamastack/llama-stack-k8s-operator/blob/main/distributions.json) in the operator repository.
- `server.distribution.image`: (Optional) Custom container image for non-supported distributions. Use this field when deploying a distribution that is not in the supported list. If specified, this takes precedence over `name`.
- `server.containerSpec.port`: Port on which the Llama Stack server listens (default: 8321)
- `server.containerSpec.env`: Environment variables to configure providers
- `server.userConfig`: (Optional) Override the default `run.yaml` using a ConfigMap. See [userConfig spec](https://github.com/llamastack/llama-stack-k8s-operator/blob/main/docs/api-overview.md#userconfigspec).
- `server.storage.size`: Size of the persistent volume for model and data storage
- `server.storage.mountPath`: Where to mount the storage in the container
**Note:** For a complete list of supported distributions, see [distributions.json](https://github.com/llamastack/llama-stack-k8s-operator/blob/main/distributions.json) in the operator repository. To use a custom or non-supported distribution, set the `server.distribution.image` field with your container image instead of `server.distribution.name`.
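For example, a sketch of a spec that points at a custom image instead of a named distribution (the image reference is illustrative):
```yaml
spec:
  server:
    distribution:
      # When both are set, `image` takes precedence over `name`
      image: quay.io/example/my-llama-stack:latest
    containerSpec:
      port: 8321
```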
The operator automatically creates:
- A Deployment for the Llama Stack server
- A Service to access the server
- A PersistentVolumeClaim for storage
- All necessary RBAC resources
Check the status of your deployment:
```bash
kubectl get llamastackdistribution
kubectl describe llamastackdistribution llamastack-vllm
```
### Step 5: Test Deployment
Wait for the Llama Stack server pod to be ready:
```bash
# Port forward and test
kubectl port-forward service/llama-stack-service 5000:5000
llama-stack-client --endpoint http://localhost:5000 inference chat-completion --message "hello, what model are you?"
# Check the status of the LlamaStackDistribution
kubectl get llamastackdistribution llamastack-vllm
# Check the pods created by the operator
kubectl get pods -l app.kubernetes.io/name=llama-stack
# Wait for the pod to be ready
kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=llama-stack --timeout=300s
```
Get the service name created by the operator (it typically follows the pattern `<llamastackdistribution-name>-service`):
```bash
# List services to find the service name
kubectl get services | grep llamastack
# Port forward and test (replace SERVICE_NAME with the actual service name)
kubectl port-forward service/llamastack-vllm-service 8321:8321
```
In another terminal, test the deployment:
```bash
llama-stack-client --endpoint http://localhost:8321 inference chat-completion --message "hello, what model are you?"
```
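As an alternative to the CLI, you can exercise the port-forwarded server with the Python client; a minimal sketch, assuming `llama-stack-client` is installed locally:
```python
from llama_stack_client import LlamaStackClient

# Talk to the operator-managed server through the port-forward above
client = LlamaStackClient(base_url="http://localhost:8321")
for model in client.models.list():
    print(model.identifier)
```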
## Troubleshooting
**Check pod status:**
### vLLM Server Issues
**Check vLLM pod status:**
```bash
kubectl get pods -l app.kubernetes.io/name=vllm
kubectl logs -l app.kubernetes.io/name=vllm
```
**Test service connectivity:**
**Test vLLM service connectivity:**
```bash
kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- curl http://vllm-server:8000/v1/models
```
### Llama Stack Server Issues
**Check LlamaStackDistribution status:**
```bash
# Get detailed status
kubectl describe llamastackdistribution llamastack-vllm
# Check for events
kubectl get events --sort-by='.lastTimestamp' | grep llamastack-vllm
```
**Check operator-managed pods:**
```bash
# List all pods managed by the operator
kubectl get pods -l app.kubernetes.io/name=llama-stack
# Check pod logs (replace POD_NAME with actual pod name)
kubectl logs -l app.kubernetes.io/name=llama-stack
```
**Check operator status:**
```bash
# Verify the operator is running
kubectl get pods -n llama-stack-operator-system
# Check operator logs if issues persist
kubectl logs -n llama-stack-operator-system -l control-plane=controller-manager
```
**Verify service connectivity:**
```bash
# Get the service endpoint
kubectl get svc llamastack-vllm-service
# Test connectivity from within the cluster
kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- curl http://llamastack-vllm-service:8321/health
```
## Related Resources
- **[Deployment Overview](/docs/deploying/)** - Overview of deployment options
- **[Distributions](/docs/distributions)** - Understanding Llama Stack distributions
- **[Configuration](/docs/distributions/configuration)** - Detailed configuration options
- **[LlamaStack Operator](https://github.com/llamastack/llama-stack-k8s-operator)** - Overview of llama-stack kubernetes operator
- **[LlamaStackDistribution](https://github.com/llamastack/llama-stack-k8s-operator/blob/main/docs/api-overview.md)** - API Spec of the llama-stack operator Custom Resource.

View file

@ -5,225 +5,80 @@ sidebar_label: Build your own Distribution
sidebar_position: 3
---
This guide will walk you through the steps to get started with building a Llama Stack distribution from scratch with your choice of API providers.
This guide walks you through inspecting existing distributions, customizing their configuration, and building runnable artifacts for your own deployment.
### Explore existing distributions
### Setting your log level
All first-party distributions live under `llama_stack/distributions/`. Each directory contains:
To specify the logging level, set the `LLAMA_STACK_LOGGING` environment variable using the following format:
- `build.yaml`: the distribution specification (providers, additional dependencies, optional external provider directories).
- `run.yaml`: sample run configuration (when provided).
- Documentation fragments that power this site.
`LLAMA_STACK_LOGGING=server=debug;core=info`
Each category in the following list:
- all
- core
- server
- router
- inference
- agents
- safety
- eval
- tools
- client
can be set to any of the following log levels:
- debug
- info
- warning
- error
- critical
The default global log level is `info`. `all` sets the log level for all components.
A user can also set `LLAMA_STACK_LOG_FILE` which will pipe the logs to the specified path as well as to the terminal. An example would be: `export LLAMA_STACK_LOG_FILE=server.log`
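For example, a typical invocation combining both variables might look like this (values are illustrative):
```bash
# Verbose server logs, default level elsewhere, with a copy written to server.log
export LLAMA_STACK_LOGGING="server=debug;core=info"
export LLAMA_STACK_LOG_FILE=server.log
llama stack run starter
```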
### Llama Stack Build
In order to build your own distribution, we recommend you clone the `llama-stack` repository.
```
git clone git@github.com:meta-llama/llama-stack.git
cd llama-stack
pip install -e .
```
Use the CLI to build your distribution.
The main points to consider are:
1. **Image Type** - Do you want a venv environment or a Container (e.g. Docker)
2. **Template** - Do you want to use a template to build your distribution, or start from scratch?
3. **Config** - Do you want to use a pre-existing config file to build your distribution?
```
llama stack build -h
usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--distro DISTRIBUTION] [--list-distros] [--image-type {container,venv}] [--image-name IMAGE_NAME] [--print-deps-only]
[--run] [--providers PROVIDERS]
Build a Llama stack container
options:
-h, --help show this help message and exit
--config CONFIG Path to a config file to use for the build. You can find example configs in llama_stack.cores/**/build.yaml. If this argument is not provided, you will be prompted to
enter information interactively (default: None)
--template TEMPLATE (deprecated) Name of the example template config to use for build. You may use `llama stack build --list-distros` to check out the available distributions (default:
None)
--distro DISTRIBUTION, --distribution DISTRIBUTION
Name of the distribution to use for build. You may use `llama stack build --list-distros` to check out the available distributions (default: None)
--list-distros, --list-distributions
Show the available distributions for building a Llama Stack distribution (default: False)
--image-type {container,venv}
Image Type to use for the build. If not specified, will use the image type from the template config. (default: None)
--image-name IMAGE_NAME
[for image-type=container|venv] Name of the virtual environment to use for the build. If not specified, currently active environment will be used if found. (default:
None)
--print-deps-only Print the dependencies for the stack only, without building the stack (default: False)
--run Run the stack after building using the same image type, name, and other applicable arguments (default: False)
--providers PROVIDERS
Build a config for a list of providers and only those providers. This list is formatted like: api1=provider1,api2=provider2. Where there can be multiple providers per
API. (default: None)
```
After this step is complete, a file named `<name>-build.yaml` and template file `<name>-run.yaml` will be generated and saved at the output file path specified at the end of the command.
Browse that folder to understand available providers and copy a distribution to use as a starting point. When creating a new stack, duplicate an existing directory, rename it, and adjust the `build.yaml` file to match your requirements.
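A sketch of that workflow (directory names are illustrative):
```bash
# Duplicate an existing distribution as a starting point, then edit its spec
cp -R llama_stack/distributions/starter llama_stack/distributions/my-distro
$EDITOR llama_stack/distributions/my-distro/build.yaml
```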
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
<Tabs>
<TabItem value="template" label="Building from a template">
To build from alternative API providers, we provide distribution templates for users to get started building a distribution backed by different providers.
<TabItem value="container" label="Building a container">
The following command will allow you to see the available templates and their corresponding providers.
```
llama stack build --list-templates
Use the Containerfile at `containers/Containerfile`, which installs `llama-stack`, resolves distribution dependencies via `llama stack list-deps`, and sets the entrypoint to `llama stack run`.
```bash
docker build . \
-f containers/Containerfile \
--build-arg DISTRO_NAME=starter \
--tag llama-stack:starter
```
```
+------------------------------+-----------------------------------------------------------------------------+
| Template Name | Description |
+------------------------------+-----------------------------------------------------------------------------+
| watsonx | Use watsonx for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| vllm-gpu | Use a built-in vLLM engine for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| together | Use Together.AI for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| tgi | Use (an external) TGI server for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| starter | Quick start template for running Llama Stack with several popular providers |
+------------------------------+-----------------------------------------------------------------------------+
| sambanova | Use SambaNova for running LLM inference and safety |
+------------------------------+-----------------------------------------------------------------------------+
| remote-vllm | Use (an external) vLLM server for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| postgres-demo | Quick start template for running Llama Stack with several popular providers |
+------------------------------+-----------------------------------------------------------------------------+
| passthrough | Use Passthrough hosted llama-stack endpoint for LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| open-benchmark | Distribution for running open benchmarks |
+------------------------------+-----------------------------------------------------------------------------+
| ollama | Use (an external) Ollama server for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| nvidia | Use NVIDIA NIM for running LLM inference, evaluation and safety |
+------------------------------+-----------------------------------------------------------------------------+
| meta-reference-gpu | Use Meta Reference for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| llama_api | Distribution for running e2e tests in CI |
+------------------------------+-----------------------------------------------------------------------------+
| hf-serverless | Use (an external) Hugging Face Inference Endpoint for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| hf-endpoint | Use (an external) Hugging Face Inference Endpoint for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| groq | Use Groq for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| fireworks | Use Fireworks.AI for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| experimental-post-training | Experimental template for post training |
+------------------------------+-----------------------------------------------------------------------------+
| dell | Dell's distribution of Llama Stack. TGI inference via Dell's custom |
| | container |
+------------------------------+-----------------------------------------------------------------------------+
| ci-tests | Distribution for running e2e tests in CI |
+------------------------------+-----------------------------------------------------------------------------+
| cerebras | Use Cerebras for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| bedrock | Use AWS Bedrock for running LLM inference and safety |
+------------------------------+-----------------------------------------------------------------------------+
```
Handy build arguments:
You may then pick a template to build your distribution with providers fitted to your liking.
- `DISTRO_NAME`: distribution directory name (defaults to `starter`).
- `RUN_CONFIG_PATH`: absolute path inside the build context for a run config that should be baked into the image (e.g. `/workspace/run.yaml`).
- `INSTALL_MODE=editable`: install the repository copied into `/workspace` with `uv pip install -e`. Pair it with `--build-arg LLAMA_STACK_DIR=/workspace`.
- `LLAMA_STACK_CLIENT_DIR`: optional editable install of the Python client.
- `PYPI_VERSION` / `TEST_PYPI_VERSION`: pin specific releases when not using editable installs.
- `KEEP_WORKSPACE=1`: retain `/workspace` in the final image if you need to access additional files (such as sample configs or provider bundles).
For example, to build a distribution with TGI as the inference provider, you can run:
```
$ llama stack build --distro starter
...
You can now edit ~/.llama/distributions/llamastack-starter/starter-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-starter/starter-run.yaml`
```
Make sure any custom `build.yaml`, run configs, or provider directories you reference are included in the Docker build context so the Containerfile can read them.
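For instance, a sketch that bakes a run config into the image and installs the checked-out repository in editable mode (paths and the tag are illustrative):
```bash
docker build . \
  -f containers/Containerfile \
  --build-arg DISTRO_NAME=starter \
  --build-arg RUN_CONFIG_PATH=/workspace/run.yaml \
  --build-arg INSTALL_MODE=editable \
  --build-arg LLAMA_STACK_DIR=/workspace \
  --tag llama-stack:starter-custom
```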
```{tip}
The generated `run.yaml` file is a starting point for your configuration. For comprehensive guidance on customizing it for your specific needs, infrastructure, and deployment scenarios, see [Customizing Your run.yaml Configuration](customizing_run_yaml.md).
```
</TabItem>
<TabItem value="scratch" label="Building from Scratch">
<TabItem value="external" label="Building with external providers">
If the provided templates do not fit your use case, you can start by running `llama stack build`, which launches an interactive wizard that prompts you for build configurations.
External providers live outside the main repository but can be bundled by pointing `external_providers_dir` to a directory that contains your provider packages.
It is best to start with a template and understand the structure of the config file and the various concepts (APIs, providers, resources, etc.) before starting from scratch.
```
llama stack build
1. Copy providers into the build context, for example `cp -R path/to/providers providers.d`.
2. Update `build.yaml` with the directory and provider entries.
3. Adjust run configs to use the in-container path (usually `/.llama/providers.d`). Pass `--build-arg RUN_CONFIG_PATH=/workspace/run.yaml` if you want to bake the config.
> Enter a name for your Llama Stack (e.g. my-local-stack): my-stack
> Enter the image type you want your Llama Stack to be built as (container or venv): venv
Llama Stack is composed of several APIs working together. Let's select
the provider types (implementations) you want to use for these APIs.
Tip: use <TAB> to see options for the providers.
> Enter provider for API inference: inline::meta-reference
> Enter provider for API safety: inline::llama-guard
> Enter provider for API agents: inline::meta-reference
> Enter provider for API memory: inline::faiss
> Enter provider for API datasetio: inline::meta-reference
> Enter provider for API scoring: inline::meta-reference
> Enter provider for API eval: inline::meta-reference
> Enter provider for API telemetry: inline::meta-reference
> (Optional) Enter a short description for your Llama Stack:
You can now edit ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml`
```
</TabItem>
<TabItem value="config" label="Building from a pre-existing build config file">
- In addition to templates, you may customize the build to your liking by editing config files and building from them with the following command.
- The config file will be of contents like the ones in `llama_stack/distributions/*build.yaml`.
```
llama stack build --config llama_stack/distributions/starter/build.yaml
```
</TabItem>
<TabItem value="external" label="Building with External Providers">
Llama Stack supports external providers that live outside of the main codebase. This allows you to create and maintain your own providers independently or use community-provided providers.
To build a distribution with external providers, you need to:
1. Configure the `external_providers_dir` in your build configuration file:
Example `build.yaml` excerpt for a custom Ollama provider:
```yaml
# Example my-external-stack.yaml with external providers
version: '2'
distribution_spec:
description: Custom distro for CI tests
providers:
inference:
- remote::custom_ollama
# Add more providers as needed
image_type: container
image_name: ci-test
# Path to external provider implementations
external_providers_dir: ~/.llama/providers.d
external_providers_dir: /workspace/providers.d
```
Inside `providers.d/custom_ollama/provider.py`, define `get_provider_spec()` so the CLI can discover dependencies:
```python
from llama_stack_api.providers.datatypes import ProviderSpec
def get_provider_spec() -> ProviderSpec:
return ProviderSpec(
provider_type="remote::custom_ollama",
module="llama_stack_ollama_provider",
config_class="llama_stack_ollama_provider.config.OllamaImplConfig",
pip_packages=[
"ollama",
"aiohttp",
"llama-stack-provider-ollama",
],
)
```
Here's an example for a custom Ollama provider:
@ -245,53 +100,22 @@ The `pip_packages` section lists the Python packages required by the provider, a
provider package itself. The package must be available on PyPI or can be provided from a local
directory or a git repository (git must be installed on the build environment).
2. Build your distribution using the config file:
For deeper guidance, see the [External Providers documentation](../providers/external/).
```
llama stack build --config my-external-stack.yaml
```
For more information on external providers, including directory structure, provider types, and implementation requirements, see the [External Providers documentation](../providers/external/).
</TabItem>
<TabItem value="container" label="Building Container">
</Tabs>
:::tip Podman Alternative
Podman is supported as an alternative to Docker. Set `CONTAINER_BINARY` to `podman` in your environment to use Podman.
:::
### Run your stack server
To build a container image, you may start off from a template and use the `--image-type container` flag to specify `container` as the build image type.
```
llama stack build --distro starter --image-type container
```
```
$ llama stack build --distro starter --image-type container
...
Containerfile created successfully in /tmp/tmp.viA3a3Rdsg/Containerfile
FROM python:3.10-slim
...
```
You can now edit ~/meta-llama/llama-stack/tmp/configs/ollama-run.yaml and run `llama stack run ~/meta-llama/llama-stack/tmp/configs/ollama-run.yaml`
```
Now set some environment variables for the inference model ID and Llama Stack Port and create a local directory to mount into the container's file system.
After building the image, launch it directly with Docker or Podman—the entrypoint calls `llama stack run` using the baked distribution or the bundled run config:
```bash
export INFERENCE_MODEL="llama3.2:3b"
export LLAMA_STACK_PORT=8321
mkdir -p ~/.llama
```
After this step is successful, you should be able to find the built container image and test it with the below Docker command:
```
docker run -d \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
-e INFERENCE_MODEL=$INFERENCE_MODEL \
-e OLLAMA_URL=http://host.docker.internal:11434 \
localhost/distribution-ollama:dev \
llama-stack:starter \
--port $LLAMA_STACK_PORT
```
@ -311,131 +135,14 @@ Here are the docker flags and their uses:
* `--port $LLAMA_STACK_PORT`: Port number for the server to listen on
</TabItem>
</Tabs>
### Running your Stack server
Now, let's start the Llama Stack Distribution Server. You will need the YAML configuration file which was written out at the end by the `llama stack build` step.
If you prepared a custom run config, mount it into the container and reference it explicitly:
```bash
docker run \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v $(pwd)/run.yaml:/app/run.yaml \
llama-stack:starter \
/app/run.yaml
```
llama stack run -h
usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME]
[--image-type {venv}] [--enable-ui]
[config | distro]
Start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution.
positional arguments:
config | distro Path to config file to use for the run or name of known distro (`llama stack list` for a list). (default: None)
options:
-h, --help show this help message and exit
--port PORT Port to run the server on. It can also be passed via the env var LLAMA_STACK_PORT. (default: 8321)
--image-name IMAGE_NAME
[DEPRECATED] This flag is no longer supported. Please activate your virtual environment before running. (default: None)
--image-type {venv}
[DEPRECATED] This flag is no longer supported. Please activate your virtual environment before running. (default: None)
--enable-ui Start the UI server (default: False)
```
**Note:** Container images built with `llama stack build --image-type container` cannot be run using `llama stack run`. Instead, they must be run directly using Docker or Podman commands as shown in the container building section above.
```
# Start using template name
llama stack run tgi
# Start using config file
llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
```
```
$ llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
Serving API inspect
GET /health
GET /providers/list
GET /routes/list
Serving API inference
POST /inference/chat_completion
POST /inference/completion
POST /inference/embeddings
...
Serving API agents
POST /agents/create
POST /agents/session/create
POST /agents/turn/create
POST /agents/delete
POST /agents/session/delete
POST /agents/session/get
POST /agents/step/get
POST /agents/turn/get
Listening on ['::', '0.0.0.0']:8321
INFO: Started server process [2935911]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit)
INFO: 2401:db00:35c:2d2b:face:0:c9:0:54678 - "GET /models/list HTTP/1.1" 200 OK
```
### Listing Distributions
Using the list command, you can view all existing Llama Stack distributions, including stacks built from templates, from scratch, or using custom configuration files.
```
llama stack list -h
usage: llama stack list [-h]
list the build stacks
options:
-h, --help show this help message and exit
```
Example Usage
```
llama stack list
```
```
+------------------------------+------------------------------------+--------------+------------+
| Stack Name                   | Path                               | Build Config | Run Config |
+------------------------------+------------------------------------+--------------+------------+
| together                     | ~/.llama/distributions/together    | Yes          | No         |
+------------------------------+------------------------------------+--------------+------------+
| bedrock                      | ~/.llama/distributions/bedrock     | Yes          | No         |
+------------------------------+------------------------------------+--------------+------------+
| starter                      | ~/.llama/distributions/starter     | Yes          | Yes        |
+------------------------------+------------------------------------+--------------+------------+
| remote-vllm                  | ~/.llama/distributions/remote-vllm | Yes          | Yes        |
+------------------------------+------------------------------------+--------------+------------+
```
### Removing a Distribution
Use the remove command to delete a distribution you've previously built.
```
llama stack rm -h
usage: llama stack rm [-h] [--all] [name]
Remove the build stack
positional arguments:
name Name of the stack to delete (default: None)
options:
-h, --help show this help message and exit
--all, -a Delete all stacks (use with caution) (default: False)
```
Example
```
llama stack rm llamastack-test
```
To keep your environment organized and avoid clutter, consider using `llama stack list` to review old or unused distributions and `llama stack rm <name>` to delete them when they're no longer needed.
### Troubleshooting
If you encounter any issues, ask questions in our Discord, search through our [GitHub Issues](https://github.com/meta-llama/llama-stack/issues), or file a new issue.

View file

@ -21,7 +21,6 @@ apis:
- inference
- vector_io
- safety
- telemetry
providers:
inference:
- provider_id: ollama
@ -44,18 +43,36 @@ providers:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/registry.db
persistence:
agent_state:
backend: kv_default
namespace: agents
responses:
backend: sql_default
table_name: responses
storage:
backends:
kv_default:
type: kv_sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/kvstore.db
sql_default:
type: sql_sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/sqlstore.db
stores:
metadata:
backend: kv_default
namespace: registry
inference:
backend: sql_default
table_name: inference_store
max_write_queue_size: 10000
num_writers: 4
conversations:
backend: sql_default
table_name: openai_conversations
prompts:
backend: kv_default
namespace: prompts
models:
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
@ -78,7 +95,6 @@ apis:
- inference
- vector_io
- safety
- telemetry
```
## Providers
@ -205,7 +221,15 @@ models:
```
A Model is an instance of a "Resource" (see [Concepts](../concepts/)) and is associated with a specific inference provider (in this case, the provider with identifier `ollama`). This is an instance of a "pre-registered" model. While we always encourage clients to register models before using them, some Stack servers may come up with a list of "already known and available" models.
What's with the `provider_model_id` field? This is an identifier for the model inside the provider's model catalog. Contrast it with `model_id` which is the identifier for the same model for Llama Stack's purposes. For example, you may want to name "llama3.2:vision-11b" as "image_captioning_model" when you use it in your Stack interactions. When omitted, the server will set `provider_model_id` to be the same as `model_id`.
What's with the `provider_model_id` field? This is an identifier for the model inside the provider's model catalog. The `model_id` field is provided for configuration purposes but is not used as part of the model identifier.
**Important:** Models are identified as `provider_id/provider_model_id` in the system and when making API calls. When `provider_model_id` is omitted, the server will set it to be the same as `model_id`.
Examples:
- Config: `model_id: llama3.2`, `provider_id: ollama`, `provider_model_id: null`
→ Access as: `ollama/llama3.2`
- Config: `model_id: my-llama`, `provider_id: vllm-inference`, `provider_model_id: llama-3-2-3b`
→ Access as: `vllm-inference/llama-3-2-3b` (the `model_id` is not used in the identifier)
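In practice this means API calls reference the fully qualified identifier; a minimal sketch with the Python client, reusing the first example above:
```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")
response = client.chat.completions.create(
    model="ollama/llama3.2",  # provider_id/provider_model_id
    messages=[{"role": "user", "content": "Hello!"}],
)
```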
If you need to conditionally register a model in the configuration, such as only when specific environment variable(s) are set, this can be accomplished by utilizing a special `__disabled__` string as the default value of an environment variable substitution, as shown below:
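A minimal sketch of that pattern (model and variable names are illustrative):
```yaml
models:
- metadata: {}
  # Registered only when SAFETY_MODEL is set; the __disabled__ default
  # causes the entry to be skipped otherwise
  model_id: ${env.SAFETY_MODEL:=__disabled__}
  provider_id: vllm-safety
  model_type: llm
```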
@ -575,24 +599,13 @@ created by users sharing a team with them:
In addition to resource-based access control, Llama Stack supports endpoint-level authorization using OAuth 2.0 style scopes. When authentication is enabled, specific API endpoints require users to have particular scopes in their authentication token.
**Scope-Gated APIs:**
The following APIs are currently gated by scopes:
- **Telemetry API** (scope: `telemetry.read`):
- `POST /telemetry/traces` - Query traces
- `GET /telemetry/traces/{trace_id}` - Get trace by ID
- `GET /telemetry/traces/{trace_id}/spans/{span_id}` - Get span by ID
- `POST /telemetry/spans/{span_id}/tree` - Get span tree
- `POST /telemetry/spans` - Query spans
- `POST /telemetry/metrics/{metric_name}` - Query metrics
**Authentication Configuration:**
For **JWT/OAuth2 providers**, scopes should be included in the JWT's claims:
```json
{
"sub": "user123",
"scope": "telemetry.read",
"scope": "<scope>",
"aud": "llama-stack"
}
```
@ -602,7 +615,7 @@ For **custom authentication providers**, the endpoint must return user attribute
{
"principal": "user123",
"attributes": {
"scopes": ["telemetry.read"]
"scopes": ["<scope>"]
}
}
```

View file

@ -11,8 +11,8 @@ If you are planning to use an external service for Inference (even Ollama or TGI
This avoids the overhead of setting up a server.
```bash
# setup
uv pip install llama-stack
llama stack build --distro starter --image-type venv
uv pip install llama-stack llama-stack-client
llama stack list-deps starter | xargs -L1 uv pip install
```
```python

View file

@ -19,3 +19,4 @@ This section provides an overview of the distributions available in Llama Stack.
- **[Starting Llama Stack Server](./starting_llama_stack_server.mdx)** - How to run distributions
- **[Importing as Library](./importing_as_library.mdx)** - Use distributions in your code
- **[Configuration Reference](./configuration.mdx)** - Configuration file format details
- **[Llama Stack UI](./llama_stack_ui.mdx)** - Web-based user interface for interacting with Llama Stack servers

View file

@ -1,56 +1,163 @@
apiVersion: v1
data:
stack_run_config.yaml: "version: '2'\nimage_name: kubernetes-demo\napis:\n- agents\n-
inference\n- files\n- safety\n- telemetry\n- tool_runtime\n- vector_io\nproviders:\n
\ inference:\n - provider_id: vllm-inference\n provider_type: remote::vllm\n
\ config:\n url: ${env.VLLM_URL:=http://localhost:8000/v1}\n max_tokens:
${env.VLLM_MAX_TOKENS:=4096}\n api_token: ${env.VLLM_API_TOKEN:=fake}\n tls_verify:
${env.VLLM_TLS_VERIFY:=true}\n - provider_id: vllm-safety\n provider_type:
remote::vllm\n config:\n url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}\n
\ max_tokens: ${env.VLLM_MAX_TOKENS:=4096}\n api_token: ${env.VLLM_API_TOKEN:=fake}\n
\ tls_verify: ${env.VLLM_TLS_VERIFY:=true}\n - provider_id: sentence-transformers\n
\ provider_type: inline::sentence-transformers\n config: {}\n vector_io:\n
\ - provider_id: ${env.ENABLE_CHROMADB:+chromadb}\n provider_type: remote::chromadb\n
\ config:\n url: ${env.CHROMADB_URL:=}\n kvstore:\n type: postgres\n
\ host: ${env.POSTGRES_HOST:=localhost}\n port: ${env.POSTGRES_PORT:=5432}\n
\ db: ${env.POSTGRES_DB:=llamastack}\n user: ${env.POSTGRES_USER:=llamastack}\n
\ password: ${env.POSTGRES_PASSWORD:=llamastack}\n files:\n - provider_id:
meta-reference-files\n provider_type: inline::localfs\n config:\n storage_dir:
${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}\n metadata_store:\n
\ type: sqlite\n db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
\ \n safety:\n - provider_id: llama-guard\n provider_type: inline::llama-guard\n
\ config:\n excluded_categories: []\n agents:\n - provider_id: meta-reference\n
\ provider_type: inline::meta-reference\n config:\n persistence_store:\n
\ type: postgres\n host: ${env.POSTGRES_HOST:=localhost}\n port:
${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n user:
${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\n
\ responses_store:\n type: postgres\n host: ${env.POSTGRES_HOST:=localhost}\n
\ port: ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n
\ user: ${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\n
\ telemetry:\n - provider_id: meta-reference\n provider_type: inline::meta-reference\n
\ config:\n service_name: \"${env.OTEL_SERVICE_NAME:=\\u200B}\"\n sinks:
${env.TELEMETRY_SINKS:=console}\n tool_runtime:\n - provider_id: brave-search\n
\ provider_type: remote::brave-search\n config:\n api_key: ${env.BRAVE_SEARCH_API_KEY:+}\n
\ max_results: 3\n - provider_id: tavily-search\n provider_type: remote::tavily-search\n
\ config:\n api_key: ${env.TAVILY_SEARCH_API_KEY:+}\n max_results:
3\n - provider_id: rag-runtime\n provider_type: inline::rag-runtime\n config:
{}\n - provider_id: model-context-protocol\n provider_type: remote::model-context-protocol\n
\ config: {}\nmetadata_store:\n type: postgres\n host: ${env.POSTGRES_HOST:=localhost}\n
\ port: ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n user:
${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\n
\ table_name: llamastack_kvstore\ninference_store:\n type: postgres\n host:
${env.POSTGRES_HOST:=localhost}\n port: ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n
\ user: ${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\nmodels:\n-
metadata:\n embedding_dimension: 384\n model_id: all-MiniLM-L6-v2\n provider_id:
sentence-transformers\n model_type: embedding\n- metadata: {}\n model_id: ${env.INFERENCE_MODEL}\n
\ provider_id: vllm-inference\n model_type: llm\n- metadata: {}\n model_id:
${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}\n provider_id: vllm-safety\n
\ model_type: llm\nshields:\n- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}\nvector_dbs:
[]\ndatasets: []\nscoring_fns: []\nbenchmarks: []\ntool_groups:\n- toolgroup_id:
builtin::websearch\n provider_id: tavily-search\n- toolgroup_id: builtin::rag\n
\ provider_id: rag-runtime\nserver:\n port: 8321\n auth:\n provider_config:\n
\ type: github_token\n"
stack_run_config.yaml: |
version: '2'
image_name: kubernetes-demo
apis:
- agents
- inference
- files
- safety
- telemetry
- tool_runtime
- vector_io
providers:
inference:
- provider_id: vllm-inference
provider_type: remote::vllm
config:
url: ${env.VLLM_URL:=http://localhost:8000/v1}
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: vllm-safety
provider_type: remote::vllm
config:
url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: sentence-transformers
provider_type: inline::sentence-transformers
config: {}
vector_io:
- provider_id: ${env.ENABLE_CHROMADB:+chromadb}
provider_type: remote::chromadb
config:
url: ${env.CHROMADB_URL:=}
kvstore:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
files:
- provider_id: meta-reference-files
provider_type: inline::localfs
config:
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
metadata_store:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config:
excluded_categories: []
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
responses_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
sinks: ${env.TELEMETRY_SINKS:=console}
tool_runtime:
- provider_id: brave-search
provider_type: remote::brave-search
config:
api_key: ${env.BRAVE_SEARCH_API_KEY:+}
max_results: 3
- provider_id: tavily-search
provider_type: remote::tavily-search
config:
api_key: ${env.TAVILY_SEARCH_API_KEY:+}
max_results: 3
- provider_id: rag-runtime
provider_type: inline::rag-runtime
config: {}
- provider_id: model-context-protocol
provider_type: remote::model-context-protocol
config: {}
storage:
backends:
kv_default:
type: kv_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
sql_default:
type: sql_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
stores:
metadata:
backend: kv_default
namespace: registry
inference:
backend: sql_default
table_name: inference_store
max_write_queue_size: 10000
num_writers: 4
conversations:
backend: sql_default
table_name: openai_conversations
prompts:
backend: kv_default
namespace: prompts
models:
- metadata:
embedding_dimension: 768
model_id: nomic-embed-text-v1.5
provider_id: sentence-transformers
model_type: embedding
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
model_type: llm
- metadata: {}
model_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
provider_id: vllm-safety
model_type: llm
shields:
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
- toolgroup_id: builtin::rag
provider_id: rag-runtime
server:
port: 8321
auth:
provider_config:
type: github_token
kind: ConfigMap
metadata:
creationTimestamp: null
name: llama-stack-config

View file

@ -5,7 +5,6 @@ apis:
- inference
- files
- safety
- telemetry
- tool_runtime
- vector_io
providers:
@ -32,21 +31,17 @@ providers:
provider_type: remote::chromadb
config:
url: ${env.CHROMADB_URL:=}
kvstore:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
persistence:
namespace: vector_io::chroma_remote
backend: kv_default
files:
- provider_id: meta-reference-files
provider_type: inline::localfs
config:
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
metadata_store:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
table_name: files_metadata
backend: sql_default
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
@ -56,26 +51,15 @@ providers:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
responses_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
sinks: ${env.TELEMETRY_SINKS:=console}
persistence:
agent_state:
namespace: agents
backend: kv_default
responses:
table_name: responses
backend: sql_default
max_write_queue_size: 10000
num_writers: 4
tool_runtime:
- provider_id: brave-search
provider_type: remote::brave-search
@ -93,48 +77,73 @@ providers:
- provider_id: model-context-protocol
provider_type: remote::model-context-protocol
config: {}
metadata_store:
type: postgres
storage:
backends:
kv_default:
type: kv_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
table_name: llamastack_kvstore
inference_store:
type: postgres
table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
sql_default:
type: sql_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
models:
- metadata:
embedding_dimension: 384
model_id: all-MiniLM-L6-v2
stores:
metadata:
namespace: registry
backend: kv_default
inference:
table_name: inference_store
backend: sql_default
max_write_queue_size: 10000
num_writers: 4
conversations:
table_name: openai_conversations
backend: sql_default
prompts:
namespace: prompts
backend: kv_default
registered_resources:
models:
- metadata:
embedding_dimension: 768
model_id: nomic-embed-text-v1.5
provider_id: sentence-transformers
model_type: embedding
- metadata: {}
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
model_type: llm
- metadata: {}
- metadata: {}
model_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
provider_id: vllm-safety
model_type: llm
shields:
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
shields:
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
- toolgroup_id: builtin::rag
- toolgroup_id: builtin::rag
provider_id: rag-runtime
server:
port: 8321
auth:
provider_config:
type: github_token
telemetry:
enabled: true
vector_stores:
default_provider_id: chromadb
default_embedding_model:
provider_id: sentence-transformers
model_id: nomic-ai/nomic-embed-text-v1.5

View file

@ -44,7 +44,7 @@ spec:
# Navigate to the UI directory
echo "Navigating to UI directory..."
cd /app/llama_stack/ui
cd /app/llama_stack_ui
# Check if package.json exists
if [ ! -f "package.json" ]; then

View file

@ -0,0 +1,109 @@
---
title: Llama Stack UI
description: Web-based user interface for interacting with Llama Stack servers
sidebar_label: Llama Stack UI
sidebar_position: 8
---
# Llama Stack UI
The Llama Stack UI is a web-based interface for interacting with Llama Stack servers. Built with Next.js and React, it provides a visual way to work with agents, manage resources, and view logs.
## Features
- **Logs & Monitoring**: View chat completions, agent responses, and vector store activity
- **Vector Stores**: Create and manage vector databases for RAG (Retrieval-Augmented Generation) workflows
- **Prompt Management**: Create and manage reusable prompts
## Prerequisites
You need a running Llama Stack server. The UI is a client that connects to the Llama Stack backend.
If you don't have a Llama Stack server running yet, see the [Starting Llama Stack Server](../getting_started/starting_llama_stack_server.mdx) guide.
## Running the UI
### Option 1: Using npx (Recommended for Quick Start)
The fastest way to get started is using `npx`:
```bash
npx llama-stack-ui
```
This will start the UI server on `http://localhost:8322` (default port).
### Option 2: Using Docker
Run the UI in a container:
```bash
docker run -p 8322:8322 llamastack/ui
```
Access the UI at `http://localhost:8322`.
## Environment Variables
The UI can be configured using the following environment variables:
| Variable | Description | Default |
|----------|-------------|---------|
| `LLAMA_STACK_BACKEND_URL` | URL of your Llama Stack server | `http://localhost:8321` |
| `LLAMA_STACK_UI_PORT` | Port for the UI server | `8322` |
If the Llama Stack server is running with authentication enabled, you can configure the UI to use it by setting the following environment variables:
| Variable | Description | Default |
|----------|-------------|---------|
| `NEXTAUTH_URL` | NextAuth URL for authentication | `http://localhost:8322` |
| `GITHUB_CLIENT_ID` | GitHub OAuth client ID (optional, for authentication) | - |
| `GITHUB_CLIENT_SECRET` | GitHub OAuth client secret (optional, for authentication) | - |
### Setting Environment Variables
#### For npx:
```bash
LLAMA_STACK_BACKEND_URL=http://localhost:8321 \
LLAMA_STACK_UI_PORT=8080 \
npx llama-stack-ui
```
#### For Docker:
```bash
docker run -p 8080:8080 \
-e LLAMA_STACK_BACKEND_URL=http://localhost:8321 \
-e LLAMA_STACK_UI_PORT=8080 \
llamastack/ui
```
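If the backend server has authentication enabled, the auth-related variables from the table above can be supplied the same way; a sketch with npx (placeholder credentials from a GitHub OAuth app you have created):
```bash
NEXTAUTH_URL=http://localhost:8322 \
GITHUB_CLIENT_ID=<your-client-id> \
GITHUB_CLIENT_SECRET=<your-client-secret> \
LLAMA_STACK_BACKEND_URL=http://localhost:8321 \
npx llama-stack-ui
```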
## Using the UI
### Managing Resources
- **Vector Stores**: Create vector databases for RAG workflows, view stored documents and embeddings
- **Prompts**: Create and manage reusable prompt templates
- **Chat Completions**: View history of chat interactions
- **Responses**: Browse detailed agent responses and tool calls
## Development
If you want to run the UI from source for development:
```bash
# From the project root
cd src/llama_stack_ui
# Install dependencies
npm install
# Set environment variables
export LLAMA_STACK_BACKEND_URL=http://localhost:8321
# Start the development server
npm run dev
```
The development server will start on `http://localhost:8322` with hot reloading enabled.

View file

@ -59,7 +59,7 @@ Start a Llama Stack server on localhost. Here is an example of how you can do th
uv venv starter --python 3.12
source starter/bin/activate # On Windows: starter\Scripts\activate
pip install --no-cache llama-stack==0.2.2
llama stack build --distro starter --image-type venv
llama stack list-deps starter | xargs -L1 uv pip install
export FIREWORKS_API_KEY=<SOME_KEY>
llama stack run starter --port 5050
```

View file

@ -2,10 +2,10 @@
Remote-Hosted distributions are available endpoints serving Llama Stack API that you can directly connect to.
| Distribution | Endpoint | Inference | Agents | Memory | Safety | Telemetry |
| Distribution | Endpoint | Inference | Agents | Memory | Safety |
|-------------|----------|-----------|---------|---------|---------|
| Together | [https://llama-stack.together.ai](https://llama-stack.together.ai) | remote::together | meta-reference | remote::weaviate | meta-reference | meta-reference |
| Fireworks | [https://llamastack-preview.fireworks.ai](https://llamastack-preview.fireworks.ai) | remote::fireworks | meta-reference | remote::weaviate | meta-reference | meta-reference |
| Together | [https://llama-stack.together.ai](https://llama-stack.together.ai) | remote::together | meta-reference | remote::weaviate | meta-reference |
| Fireworks | [https://llamastack-preview.fireworks.ai](https://llamastack-preview.fireworks.ai) | remote::fireworks | meta-reference | remote::weaviate | meta-reference |
## Connecting to Remote-Hosted Distributions

View file

@ -0,0 +1,143 @@
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# OCI Distribution
The `llamastack/distribution-oci` distribution consists of the following provider configurations.
| API | Provider(s) |
|-----|-------------|
| agents | `inline::meta-reference` |
| datasetio | `remote::huggingface`, `inline::localfs` |
| eval | `inline::meta-reference` |
| files | `inline::localfs` |
| inference | `remote::oci` |
| safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
### Environment Variables
The following environment variables can be configured:
- `OCI_AUTH_TYPE`: OCI authentication type (instance_principal or config_file) (default: `instance_principal`)
- `OCI_REGION`: OCI region (e.g., us-ashburn-1, us-chicago-1, us-phoenix-1, eu-frankfurt-1) (default: ``)
- `OCI_COMPARTMENT_OCID`: OCI compartment ID for the Generative AI service (default: ``)
- `OCI_CONFIG_FILE_PATH`: OCI config file path (required if OCI_AUTH_TYPE is config_file) (default: `~/.oci/config`)
- `OCI_CLI_PROFILE`: OCI CLI profile name to use from config file (default: `DEFAULT`)
## Prerequisites
### Oracle Cloud Infrastructure Setup
Before using the OCI Generative AI distribution, ensure you have:
1. **Oracle Cloud Infrastructure Account**: Sign up at [Oracle Cloud Infrastructure](https://cloud.oracle.com/)
2. **Generative AI Service Access**: Enable the Generative AI service in your OCI tenancy
3. **Compartment**: Create or identify a compartment where you'll deploy Generative AI models
4. **Authentication**: Configure authentication using either:
- **Instance Principal** (recommended for cloud-hosted deployments)
- **API Key** (for on-premises or development environments)
### Authentication Methods
#### Instance Principal Authentication (Recommended)
Instance Principal authentication allows OCI resources to authenticate using the identity of the compute instance they're running on. This is the most secure method for production deployments.
Requirements:
- Instance must be running in an Oracle Cloud Infrastructure compartment
- Instance must have appropriate IAM policies to access Generative AI services
#### API Key Authentication
For development or on-premises deployments, follow [this doc](https://docs.oracle.com/en-us/iaas/Content/API/Concepts/apisigningkey.htm) to learn how to create your API signing key for your config file.
### Required IAM Policies
Ensure your OCI user or instance has the following policy statements:
```
Allow group <group_name> to use generative-ai-inference-endpoints in compartment <compartment_name>
Allow group <group_name> to manage generative-ai-inference-endpoints in compartment <compartment_name>
```
## Supported Services
### Inference: OCI Generative AI
Oracle Cloud Infrastructure Generative AI provides access to high-performance AI models through OCI's Platform-as-a-Service offering. The service supports:
- **Chat Completions**: Conversational AI with context awareness
- **Text Generation**: Complete prompts and generate text content
#### Available Models
Common OCI Generative AI offerings include models from Meta, Cohere, OpenAI, Grok, and more.
### Safety: Llama Guard
For content safety and moderation, this distribution uses Meta's LlamaGuard model through the OCI Generative AI service to provide:
- Content filtering and moderation
- Policy compliance checking
- Harmful content detection
### Vector Storage: Multiple Options
The distribution supports several vector storage providers:
- **FAISS**: Local in-memory vector search
- **ChromaDB**: Distributed vector database
- **PGVector**: PostgreSQL with vector extensions
### Additional Services
- **Dataset I/O**: Local filesystem and Hugging Face integration
- **Tool Runtime**: Web search (Brave, Tavily) and RAG capabilities
- **Evaluation**: Meta reference evaluation framework
## Running Llama Stack with OCI
You can run the OCI distribution via Docker or local virtual environment.
### Via venv
If you've set up your local development environment, you can also run the distribution from your local virtual environment.
```bash
OCI_AUTH=$OCI_AUTH_TYPE OCI_REGION=$OCI_REGION OCI_COMPARTMENT_OCID=$OCI_COMPARTMENT_OCID llama stack run --port 8321 oci
```
### Configuration Examples
#### Using Instance Principal (Recommended for Production)
```bash
export OCI_AUTH_TYPE=instance_principal
export OCI_REGION=us-chicago-1
export OCI_COMPARTMENT_OCID=ocid1.compartment.oc1..<your-compartment-id>
```
#### Using API Key Authentication (Development)
```bash
export OCI_AUTH_TYPE=config_file
export OCI_CONFIG_FILE_PATH=~/.oci/config
export OCI_CLI_PROFILE=DEFAULT
export OCI_REGION=us-chicago-1
export OCI_COMPARTMENT_OCID=ocid1.compartment.oc1..your-compartment-id
```
## Regional Endpoints
OCI Generative AI is available in multiple regions. The service automatically routes to the appropriate regional endpoint based on your configuration. For a full list of regional model availability, visit:
https://docs.oracle.com/en-us/iaas/Content/generative-ai/overview.htm#regions
## Troubleshooting
### Common Issues
1. **Authentication Errors**: Verify your OCI credentials and IAM policies
2. **Model Not Found**: Ensure the model OCID is correct and the model is available in your region
3. **Permission Denied**: Check compartment permissions and Generative AI service access
4. **Region Unavailable**: Verify the specified region supports Generative AI services
### Getting Help
For additional support:
- [OCI Generative AI Documentation](https://docs.oracle.com/en-us/iaas/Content/generative-ai/home.htm)
- [Llama Stack Issues](https://github.com/meta-llama/llama-stack/issues)

View file

@ -21,7 +21,6 @@ The `llamastack/distribution-watsonx` distribution consists of the following pro
| inference | `remote::watsonx`, `inline::sentence-transformers` |
| safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
| vector_io | `inline::faiss` |

View file

@ -13,9 +13,9 @@ self
The `llamastack/distribution-tgi` distribution consists of the following provider configurations.
| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
|----------------- |--------------- |---------------- |-------------------------------------------------- |---------------- |---------------- |
| **Provider(s)** | remote::tgi | meta-reference | meta-reference, remote::pgvector, remote::chroma | meta-reference | meta-reference |
| **API** | **Inference** | **Agents** | **Memory** | **Safety** |
|----------------- |--------------- |---------------- |-------------------------------------------------- |---------------- |
| **Provider(s)** | remote::tgi | meta-reference | meta-reference, remote::pgvector, remote::chroma | meta-reference |
The only difference vs. the `tgi` distribution is that it runs the Dell-TGI server for inference.

View file

@ -22,7 +22,6 @@ The `llamastack/distribution-dell` distribution consists of the following provid
| inference | `remote::tgi`, `inline::sentence-transformers` |
| safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime` |
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
@ -166,10 +165,10 @@ docker run \
### Via venv
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
Install the distribution dependencies before launching:
```bash
llama stack build --distro dell --image-type venv
llama stack list-deps dell | xargs -L1 uv pip install
INFERENCE_MODEL=$INFERENCE_MODEL \
DEH_URL=$DEH_URL \
CHROMA_URL=$CHROMA_URL \

View file

@ -21,7 +21,6 @@ The `llamastack/distribution-meta-reference-gpu` distribution consists of the fo
| inference | `inline::meta-reference` |
| safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
@ -41,31 +40,7 @@ The following environment variables can be configured:
## Prerequisite: Downloading Models
Please use `llama model list --downloaded` to check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](../../references/llama_cli_reference/download_models.md) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
```
$ llama model list --downloaded
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
┃ Model ┃ Size ┃ Modified Time ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
│ Llama3.2-1B-Instruct:int4-qlora-eo8 │ 1.53 GB │ 2025-02-26 11:22:28 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-1B │ 2.31 GB │ 2025-02-18 21:48:52 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Prompt-Guard-86M │ 0.02 GB │ 2025-02-26 11:29:28 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB │ 2025-02-26 11:37:41 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-3B │ 5.99 GB │ 2025-02-18 21:51:26 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.1-8B │ 14.97 GB │ 2025-02-16 10:36:37 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB │ 2025-02-26 11:35:02 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama-Guard-3-1B │ 2.80 GB │ 2025-02-26 11:20:46 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama-Guard-3-1B:int4 │ 0.43 GB │ 2025-02-26 11:33:33 │
└─────────────────────────────────────────┴──────────┴─────────────────────┘
Please check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](../../references/llama_cli_reference/download_models.md) here to download the models using the Hugging Face CLI.
```
## Running the Distribution
@ -104,12 +79,39 @@ docker run \
--port $LLAMA_STACK_PORT
```
### Via venv
### Via Docker with Custom Run Configuration
Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
You can also run the Docker container with a custom run configuration file by mounting it into the container:
```bash
llama stack build --distro meta-reference-gpu --image-type venv
# Set the path to your custom run.yaml file
CUSTOM_RUN_CONFIG=/path/to/your/custom-run.yaml
LLAMA_STACK_PORT=8321
docker run \
-it \
--pull always \
--gpus all \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
-v $CUSTOM_RUN_CONFIG:/app/custom-run.yaml \
-e RUN_CONFIG_PATH=/app/custom-run.yaml \
llamastack/distribution-meta-reference-gpu \
--port $LLAMA_STACK_PORT
```
**Note**: The run configuration must be mounted into the container before it can be used. The `-v` flag mounts your local file into the container, and the `RUN_CONFIG_PATH` environment variable tells the entrypoint script which configuration to use.
Available run configurations for this distribution:
- `run.yaml`
- `run-with-safety.yaml`
### Via venv
Make sure you have the Llama Stack CLI available.
```bash
llama stack list-deps meta-reference-gpu | xargs -L1 uv pip install
INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
llama stack run distributions/meta-reference-gpu/run.yaml \
--port 8321

View file

@ -16,7 +16,6 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
| post_training | `remote::nvidia` |
| safety | `remote::nvidia` |
| scoring | `inline::basic` |
| telemetry | `inline::meta-reference` |
| tool_runtime | `inline::rag-runtime` |
| vector_io | `inline::faiss` |
@ -128,20 +127,46 @@ docker run \
-it \
--pull always \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
-v ~/.llama:/root/.llama \
-e NVIDIA_API_KEY=$NVIDIA_API_KEY \
llamastack/distribution-nvidia \
--config /root/my-run.yaml \
--port $LLAMA_STACK_PORT
```
### Via Docker with Custom Run Configuration
You can also run the Docker container with a custom run configuration file by mounting it into the container:
```bash
# Set the path to your custom run.yaml file
CUSTOM_RUN_CONFIG=/path/to/your/custom-run.yaml
LLAMA_STACK_PORT=8321
docker run \
-it \
--pull always \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
-v $CUSTOM_RUN_CONFIG:/app/custom-run.yaml \
-e RUN_CONFIG_PATH=/app/custom-run.yaml \
-e NVIDIA_API_KEY=$NVIDIA_API_KEY \
llamastack/distribution-nvidia \
--port $LLAMA_STACK_PORT
```
**Note**: The run configuration must be mounted into the container before it can be used. The `-v` flag mounts your local file into the container, and the `RUN_CONFIG_PATH` environment variable tells the entrypoint script which configuration to use.
Available run configurations for this distribution:
- `run.yaml`
- `run-with-safety.yaml`
### Via venv
If you've set up your local development environment, you can also build the image using your local virtual environment.
If you've set up your local development environment, you can also install the distribution dependencies using your local virtual environment.
```bash
INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
llama stack build --distro nvidia --image-type venv
llama stack list-deps nvidia | xargs -L1 uv pip install
NVIDIA_API_KEY=$NVIDIA_API_KEY \
INFERENCE_MODEL=$INFERENCE_MODEL \
llama stack run ./run.yaml \

View file

@ -21,7 +21,6 @@ The `llamastack/distribution-passthrough` distribution consists of the following
| inference | `remote::passthrough`, `inline::sentence-transformers` |
| safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `remote::wolfram-alpha`, `inline::rag-runtime`, `remote::model-context-protocol` |
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |

View file

@ -26,7 +26,6 @@ The starter distribution consists of the following provider configurations:
| inference | `remote::openai`, `remote::fireworks`, `remote::together`, `remote::ollama`, `remote::anthropic`, `remote::gemini`, `remote::groq`, `remote::sambanova`, `remote::vllm`, `remote::tgi`, `remote::cerebras`, `remote::llama-openai-compat`, `remote::nvidia`, `remote::hf::serverless`, `remote::hf::endpoint`, `inline::sentence-transformers` |
| safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
| vector_io | `inline::faiss`, `inline::sqlite-vec`, `inline::milvus`, `remote::chromadb`, `remote::pgvector` |
@ -119,7 +118,7 @@ The following environment variables can be configured:
### Telemetry Configuration
- `OTEL_SERVICE_NAME`: OpenTelemetry service name
- `TELEMETRY_SINKS`: Telemetry sinks (default: `console,sqlite`)
- `OTEL_EXPORTER_OTLP_ENDPOINT`: OpenTelemetry collector endpoint URL
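For example, to name the service and point traces at a local OpenTelemetry collector (the endpoint below assumes a collector listening on the default OTLP/HTTP port; adjust for your setup):
```bash
export OTEL_SERVICE_NAME=llama-stack
export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318
llama stack run starter
```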
## Enabling Providers
@ -164,12 +163,53 @@ docker run \
--port $LLAMA_STACK_PORT
```
### Via venv
The container will run the distribution with a SQLite store by default. This store is used for the following components:
- Metadata store: stores metadata about models, providers, etc.
- Inference store: stores responses from the inference provider
- Agents store: stores agent configurations (sessions, turns, etc.)
- Agents Responses store: stores responses from the agents
However, you can use PostgreSQL instead by running the `starter::run-with-postgres-store.yaml` configuration:
```bash
docker run \
-it \
--pull always \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-e OPENAI_API_KEY=your_openai_key \
-e FIREWORKS_API_KEY=your_fireworks_key \
-e TOGETHER_API_KEY=your_together_key \
-e POSTGRES_HOST=your_postgres_host \
-e POSTGRES_PORT=your_postgres_port \
-e POSTGRES_DB=your_postgres_db \
-e POSTGRES_USER=your_postgres_user \
-e POSTGRES_PASSWORD=your_postgres_password \
llamastack/distribution-starter \
starter::run-with-postgres-store.yaml
```
Postgres environment variables:
- `POSTGRES_HOST`: Postgres host (default: `localhost`)
- `POSTGRES_PORT`: Postgres port (default: `5432`)
- `POSTGRES_DB`: Postgres database name (default: `llamastack`)
- `POSTGRES_USER`: Postgres username (default: `llamastack`)
- `POSTGRES_PASSWORD`: Postgres password (default: `llamastack`)
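If you don't already have a PostgreSQL instance, a throwaway container that matches the defaults above is enough for local testing (a sketch, not a production setup):
```bash
docker run -d --name llamastack-postgres \
  -e POSTGRES_DB=llamastack \
  -e POSTGRES_USER=llamastack \
  -e POSTGRES_PASSWORD=llamastack \
  -p 5432:5432 \
  postgres:16
```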
### Via Conda or venv
Ensure you have configured the starter distribution using the environment variables explained above.
```bash
uv run --with llama-stack llama stack build --distro starter --image-type venv --run
# Install dependencies for the starter distribution
uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install
# Run the server (with SQLite - default)
uv run --with llama-stack llama stack run starter
# Or run with PostgreSQL
uv run --with llama-stack llama stack run starter::run-with-postgres-store.yaml
```
## Example Usage
@ -216,7 +256,6 @@ The starter distribution uses SQLite for local storage of various components:
- **Files metadata**: `~/.llama/distributions/starter/files_metadata.db`
- **Agents store**: `~/.llama/distributions/starter/agents_store.db`
- **Responses store**: `~/.llama/distributions/starter/responses_store.db`
- **Trace store**: `~/.llama/distributions/starter/trace_store.db`
- **Evaluation store**: `~/.llama/distributions/starter/meta_reference_eval.db`
- **Dataset I/O stores**: Various HuggingFace and local filesystem stores

View file

@ -23,6 +23,17 @@ Another simple way to start interacting with Llama Stack is to just spin up a co
If you have built a container image and want to deploy it in a Kubernetes cluster instead of starting the Llama Stack server locally, see the [Kubernetes Deployment Guide](../deploying/kubernetes_deployment) for more details.
## Configure logging
Control log output via environment variables before starting the server.
- `LLAMA_STACK_LOGGING` sets per-component levels, e.g. `LLAMA_STACK_LOGGING=server=debug;core=info`.
- Supported categories: `all`, `core`, `server`, `router`, `inference`, `agents`, `safety`, `eval`, `tools`, `client`.
- Levels: `debug`, `info`, `warning`, `error`, `critical` (default is `info`). Use `all=<level>` to apply globally.
- `LLAMA_STACK_LOG_FILE=/path/to/log` mirrors logs to a file while still printing to stdout.
Export these variables prior to running `llama stack run`, launching a container, or starting the server through any other pathway.
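For example, a typical debugging session might look like this (the `starter` distribution is just an illustration; use whichever config you run):
```bash
export LLAMA_STACK_LOGGING="server=debug;core=warning"
export LLAMA_STACK_LOG_FILE=~/llama-stack.log
llama stack run starter
```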
```{toctree}
:maxdepth: 1
:hidden:

View file

@ -4,65 +4,24 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient
vector_db_id = "my_demo_vector_db"
client = LlamaStackClient(base_url="http://localhost:8321")
import io, requests
from openai import OpenAI
models = client.models.list()
url="https://www.paulgraham.com/greatwork.html"
client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")
# Select the first LLM and first embedding models
model_id = next(m for m in models if m.model_type == "llm").identifier
embedding_model_id = (
em := next(m for m in models if m.model_type == "embedding")
).identifier
embedding_dimension = em.metadata["embedding_dimension"]
vs = client.vector_stores.create()
response = requests.get(url)
pseudo_file = io.BytesIO(str(response.content).encode('utf-8'))
uploaded_file = client.files.create(file=(url, pseudo_file, "text/html"), purpose="assistants")
client.vector_stores.files.create(vector_store_id=vs.id, file_id=uploaded_file.id)
vector_db = client.vector_dbs.register(
vector_db_id=vector_db_id,
embedding_model=embedding_model_id,
embedding_dimension=embedding_dimension,
provider_id="faiss",
)
vector_db_id = vector_db.identifier
source = "https://www.paulgraham.com/greatwork.html"
print("rag_tool> Ingesting document:", source)
document = RAGDocument(
document_id="document_1",
content=source,
mime_type="text/html",
metadata={},
)
client.tool_runtime.rag_tool.insert(
documents=[document],
vector_db_id=vector_db_id,
chunk_size_in_tokens=100,
)
agent = Agent(
client,
model=model_id,
instructions="You are a helpful assistant",
tools=[
{
"name": "builtin::rag/knowledge_search",
"args": {"vector_db_ids": [vector_db_id]},
}
],
resp = client.responses.create(
model="openai/gpt-4o",
input="How do you do great work? Use the existing knowledge_search tool.",
tools=[{"type": "file_search", "vector_store_ids": [vs.id]}],
include=["file_search_call.results"],
)
prompt = "How do you do great work?"
print("prompt>", prompt)
use_stream = True
response = agent.create_turn(
messages=[{"role": "user", "content": prompt}],
session_id=agent.create_session("rag_session"),
stream=use_stream,
)
# Only call `AgentEventLogger().log(response)` for streaming responses.
if use_stream:
for log in AgentEventLogger().log(response):
log.print()
else:
print(response)
print(resp)

View file

@ -58,15 +58,19 @@ Llama Stack is a server that exposes multiple APIs, you connect with it using th
<Tabs>
<TabItem value="venv" label="Using venv">
You can use Python to build and run the Llama Stack server, which is useful for testing and development.
You can use Python to install dependencies and run the Llama Stack server, which is useful for testing and development.
Llama Stack uses a [YAML configuration file](../distributions/configuration) to specify the stack setup,
which defines the providers and their settings. The generated configuration serves as a starting point that you can [customize for your specific needs](../distributions/customizing_run_yaml).
Now let's build and run the Llama Stack config for Ollama.
Now let's install dependencies and run the Llama Stack config for Ollama.
We use `starter` as the template. By default all providers are disabled, so you need to enable Ollama by passing environment variables.
```bash
llama stack build --distro starter --image-type venv --run
# Install dependencies for the starter distribution
uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install
# Run the server
llama stack run starter
```
</TabItem>
<TabItem value="container" label="Using a Container">
@ -140,7 +144,7 @@ source .venv/bin/activate
```bash
uv venv client --python 3.12
source client/bin/activate
pip install llama-stack-client
uv pip install llama-stack-client
```
</TabItem>
</Tabs>
@ -164,7 +168,7 @@ Available Models
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┓
┃ model_type ┃ identifier ┃ provider_resource_id ┃ metadata ┃ provider_id ┃
┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━┩
│ embedding │ ollama/all-minilm:l6-v2 │ all-minilm:l6-v2 │ {'embedding_dimension': 384.0} │ ollama │
│ embedding │ ollama/nomic-embed-text:v1.5 │ nomic-embed-text:v1.5 │ {'embedding_dimension': 768.0} │ ollama │
├─────────────────┼─────────────────────────────────────┼─────────────────────────────────────┼───────────────────────────────────────────┼───────────────────────┤
│ ... │ ... │ ... │ │ ... │
├─────────────────┼─────────────────────────────────────┼─────────────────────────────────────┼───────────────────────────────────────────┼───────────────────────┤
@ -235,8 +239,13 @@ client = LlamaStackClient(base_url="http://localhost:8321")
models = client.models.list()
# Select the first LLM
llm = next(m for m in models if m.model_type == "llm" and m.provider_id == "ollama")
model_id = llm.identifier
llm = next(
m for m in models
if m.custom_metadata
and m.custom_metadata.get("model_type") == "llm"
and m.custom_metadata.get("provider_id") == "ollama"
)
model_id = llm.id
print("Model:", model_id)
@ -275,8 +284,13 @@ import uuid
client = LlamaStackClient(base_url=f"http://localhost:8321")
models = client.models.list()
llm = next(m for m in models if m.model_type == "llm" and m.provider_id == "ollama")
model_id = llm.identifier
llm = next(
m for m in models
if m.custom_metadata
and m.custom_metadata.get("model_type") == "llm"
and m.custom_metadata.get("provider_id") == "ollama"
)
model_id = llm.id
agent = Agent(client, model=model_id, instructions="You are a helpful assistant.")
@ -304,7 +318,7 @@ stream = agent.create_turn(
for event in AgentEventLogger().log(stream):
event.print()
```
### ii. Run the Script
#### ii. Run the Script
Let's run the script using `uv`
```bash
uv run python agent.py
@ -446,8 +460,11 @@ import uuid
client = LlamaStackClient(base_url="http://localhost:8321")
# Create a vector database instance
embed_lm = next(m for m in client.models.list() if m.model_type == "embedding")
embedding_model = embed_lm.identifier
embed_lm = next(
m for m in client.models.list()
if m.custom_metadata and m.custom_metadata.get("model_type") == "embedding"
)
embedding_model = embed_lm.id
vector_db_id = f"v{uuid.uuid4().hex}"
# The VectorDB API is deprecated; the server now returns its own authoritative ID.
# We capture the correct ID from the response's .identifier attribute.
@ -485,9 +502,11 @@ client.tool_runtime.rag_tool.insert(
llm = next(
m
for m in client.models.list()
if m.model_type == "llm" and m.provider_id == "ollama"
if m.custom_metadata
and m.custom_metadata.get("model_type") == "llm"
and m.custom_metadata.get("provider_id") == "ollama"
)
model = llm.identifier
model = llm.id
# Create the RAG agent
rag_agent = Agent(

View file

@ -24,111 +24,44 @@ ollama run llama3.2:3b --keepalive 60m
#### Step 2: Run the Llama Stack server
We will use `uv` to run the Llama Stack server.
```python file=./demo_script.py title="demo_script.py"
```
We will use `uv` to install dependencies and run the Llama Stack server.
```bash
OLLAMA_URL=http://localhost:11434 \
uv run --with llama-stack llama stack build --distro starter --image-type venv --run
# Install dependencies for the starter distribution
uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install
# Run the server
OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter
```
#### Step 3: Run the demo
Now open up a new terminal and copy the following script into a file named `demo_script.py`.
```python title="demo_script.py"
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient
vector_db_id = "my_demo_vector_db"
client = LlamaStackClient(base_url="http://localhost:8321")
models = client.models.list()
# Select the first LLM and first embedding models
model_id = next(m for m in models if m.model_type == "llm").identifier
embedding_model_id = (
em := next(m for m in models if m.model_type == "embedding")
).identifier
embedding_dimension = em.metadata["embedding_dimension"]
vector_db = client.vector_dbs.register(
vector_db_id=vector_db_id,
embedding_model=embedding_model_id,
embedding_dimension=embedding_dimension,
provider_id="faiss",
)
vector_db_id = vector_db.identifier
source = "https://www.paulgraham.com/greatwork.html"
print("rag_tool> Ingesting document:", source)
document = RAGDocument(
document_id="document_1",
content=source,
mime_type="text/html",
metadata={},
)
client.tool_runtime.rag_tool.insert(
documents=[document],
vector_db_id=vector_db_id,
chunk_size_in_tokens=100,
)
agent = Agent(
client,
model=model_id,
instructions="You are a helpful assistant",
tools=[
{
"name": "builtin::rag/knowledge_search",
"args": {"vector_db_ids": [vector_db_id]},
}
],
)
prompt = "How do you do great work?"
print("prompt>", prompt)
use_stream = True
response = agent.create_turn(
messages=[{"role": "user", "content": prompt}],
session_id=agent.create_session("rag_session"),
stream=use_stream,
)
# Only call `AgentEventLogger().log(response)` for streaming responses.
if use_stream:
for log in AgentEventLogger().log(response):
log.print()
else:
print(response)
```
We will use `uv` to run the script
```
uv run --with llama-stack-client,fire,requests demo_script.py
```
And you should see output like below.
```python
>>> print(resp.output[1].content[0].text)
To do great work, consider the following principles:
1. **Follow Your Interests**: Engage in work that genuinely excites you. If you find an area intriguing, pursue it without being overly concerned about external pressures or norms. You should create things that you would want for yourself, as this often aligns with what others in your circle might want too.
2. **Work Hard on Ambitious Projects**: Ambition is vital, but it should be tempered by genuine interest. Instead of detailed planning for the future, focus on exciting projects that keep your options open. This approach, known as "staying upwind," allows for adaptability and can lead to unforeseen achievements.
3. **Choose Quality Colleagues**: Collaborating with talented colleagues can significantly affect your own work. Seek out individuals who offer surprising insights and whom you admire. The presence of good colleagues can elevate the quality of your work and inspire you.
4. **Maintain High Morale**: Your attitude towards work and life affects your performance. Cultivating optimism and viewing yourself as lucky rather than victimized can boost your productivity. Its essential to care for your physical health as well since it directly impacts your mental faculties and morale.
5. **Be Consistent**: Great work often comes from cumulative effort. Daily progress, even in small amounts, can result in substantial achievements over time. Emphasize consistency and make the work engaging, as this reduces the perceived burden of hard labor.
6. **Embrace Curiosity**: Curiosity is a driving force that can guide you in selecting fields of interest, pushing you to explore uncharted territories. Allow it to shape your work and continually seek knowledge and insights.
By focusing on these aspects, you can create an environment conducive to great work and personal fulfillment.
```
rag_tool> Ingesting document: https://www.paulgraham.com/greatwork.html
prompt> How do you do great work?
inference> [knowledge_search(query="What is the key to doing great work")]
tool_execution> Tool:knowledge_search Args:{'query': 'What is the key to doing great work'}
tool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 2:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 3:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 4:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 5:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text='END of knowledge_search tool results.\n', type='text')]
inference> Based on the search results, it seems that doing great work means doing something important so well that you expand people's ideas of what's possible. However, there is no clear threshold for importance, and it can be difficult to judge at the time.
To further clarify, I would suggest that doing great work involves:
* Completing tasks with high quality and attention to detail
* Expanding on existing knowledge or ideas
* Making a positive impact on others through your work
* Striving for excellence and continuous improvement
Ultimately, great work is about making a meaningful contribution and leaving a lasting impression.
```
Congratulations! You've successfully built your first RAG application using Llama Stack! 🎉🥳
:::tip HuggingFace access

View file

@ -29,7 +29,7 @@ Llama Stack is now available! See the [release notes](https://github.com/llamast
Llama Stack defines and standardizes the core building blocks needed to bring generative AI applications to market. It provides a unified set of APIs with implementations from leading service providers, enabling seamless transitions between development and production environments. More specifically, it provides:
- **Unified API layer** for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.
- **Unified API layer** for Inference, RAG, Agents, Tools, Safety, Evals.
- **Plugin architecture** to support the rich ecosystem of implementations of the different APIs in different environments like local development, on-premises, cloud, and mobile.
- **Prepackaged verified distributions** which offer a one-stop solution for developers to get started quickly and reliably in any environment
- **Multiple developer interfaces** like CLI and SDKs for Python, Node, iOS, and Android

View file

@ -1,7 +1,8 @@
---
description: "Agents
description: |
Agents
APIs for creating and interacting with agentic systems."
APIs for creating and interacting with agentic systems.
sidebar_label: Agents
title: Agents
---

View file

@ -14,16 +14,18 @@ Meta's reference implementation of an agent system that can use tools, access ve
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `persistence_store` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | |
| `responses_store` | `utils.sqlstore.sqlstore.SqliteSqlStoreConfig \| utils.sqlstore.sqlstore.PostgresSqlStoreConfig` | No | sqlite | |
| `persistence` | `AgentPersistenceConfig` | No | | |
## Sample Configuration
```yaml
persistence_store:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/agents_store.db
responses_store:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/responses_store.db
persistence:
agent_state:
namespace: agents
backend: kv_default
responses:
table_name: responses
backend: sql_default
max_write_queue_size: 10000
num_writers: 4
```

View file

@ -1,5 +1,6 @@
---
description: "The Batches API enables efficient processing of multiple requests in a single operation,
description: |
The Batches API enables efficient processing of multiple requests in a single operation,
particularly useful for processing large datasets, batch evaluation workflows, and
cost-effective inference at scale.
@ -8,7 +9,7 @@ description: "The Batches API enables efficient processing of multiple requests
This API provides the following extensions:
- idempotent batch creation
Note: This API is currently under active development and may undergo changes."
Note: This API is currently under active development and may undergo changes.
sidebar_label: Batches
title: Batches
---

View file

@ -14,14 +14,14 @@ Reference implementation of batches API with KVStore persistence.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Configuration for the key-value store backend. |
| `max_concurrent_batches` | `<class 'int'>` | No | 1 | Maximum number of concurrent batches to process simultaneously. |
| `max_concurrent_requests_per_batch` | `<class 'int'>` | No | 10 | Maximum number of concurrent requests to process per batch. |
| `kvstore` | `KVStoreReference` | No | | Configuration for the key-value store backend. |
| `max_concurrent_batches` | `int` | No | 1 | Maximum number of concurrent batches to process simultaneously. |
| `max_concurrent_requests_per_batch` | `int` | No | 10 | Maximum number of concurrent requests to process per batch. |
## Sample Configuration
```yaml
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/batches.db
namespace: batches
backend: kv_default
```

View file

@ -14,12 +14,12 @@ Local filesystem-based dataset I/O provider for reading and writing datasets to
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | |
| `kvstore` | `KVStoreReference` | No | | |
## Sample Configuration
```yaml
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/localfs_datasetio.db
namespace: datasetio::localfs
backend: kv_default
```

View file

@ -14,12 +14,12 @@ HuggingFace datasets provider for accessing and managing datasets from the Huggi
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | |
| `kvstore` | `KVStoreReference` | No | | |
## Sample Configuration
```yaml
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/huggingface_datasetio.db
namespace: datasetio::huggingface
backend: kv_default
```

View file

@ -17,7 +17,7 @@ NVIDIA's dataset I/O provider for accessing datasets from NVIDIA's data platform
| `api_key` | `str \| None` | No | | The NVIDIA API key. |
| `dataset_namespace` | `str \| None` | No | default | The NVIDIA dataset namespace. |
| `project_id` | `str \| None` | No | test-project | The NVIDIA project ID. |
| `datasets_url` | `<class 'str'>` | No | http://nemo.test | Base URL for the NeMo Dataset API |
| `datasets_url` | `str` | No | http://nemo.test | Base URL for the NeMo Dataset API |
## Sample Configuration

View file

@ -1,5 +1,8 @@
---
description: "Llama Stack Evaluation API for running evaluations on model and agent candidates."
description: |
Evaluations
Llama Stack Evaluation API for running evaluations on model and agent candidates.
sidebar_label: Eval
title: Eval
---
@ -8,6 +11,8 @@ title: Eval
## Overview
Llama Stack Evaluation API for running evaluations on model and agent candidates.
Evaluations
Llama Stack Evaluation API for running evaluations on model and agent candidates.
This section contains documentation for all available providers for the **eval** API.

View file

@ -14,12 +14,12 @@ Meta's reference implementation of evaluation tasks with support for multiple la
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | |
| `kvstore` | `KVStoreReference` | No | | |
## Sample Configuration
```yaml
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/meta_reference_eval.db
namespace: eval
backend: kv_default
```

View file

@ -14,7 +14,7 @@ NVIDIA's evaluation provider for running evaluation tasks on NVIDIA's platform.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `evaluator_url` | `<class 'str'>` | No | http://0.0.0.0:7331 | The url for accessing the evaluator service |
| `evaluator_url` | `str` | No | http://0.0.0.0:7331 | The url for accessing the evaluator service |
## Sample Configuration

View file

@ -80,7 +80,7 @@ container_image: custom-vector-store:latest # optional
All providers must contain a `get_provider_spec` function in their `provider` module. This is a standardized structure that Llama Stack expects and is necessary for getting things such as the config class. The `get_provider_spec` method returns a structure identical to the `adapter`. An example function may look like:
```python
from llama_stack.providers.datatypes import (
from llama_stack_api.providers.datatypes import (
ProviderSpec,
Api,
RemoteProviderSpec,
@ -240,6 +240,6 @@ additional_pip_packages:
- sqlalchemy[asyncio]
```
No other steps are required other than `llama stack build` and `llama stack run`. The build process will use `module` to install all of the provider dependencies, retrieve the spec, etc.
No other steps are required beyond installing dependencies with `llama stack list-deps <distro> | xargs -L1 uv pip install` and then running `llama stack run`. The CLI will use `module` to install the provider dependencies, retrieve the spec, etc.
The provider will now be available in Llama Stack with the type `remote::ramalama`.

View file

@ -0,0 +1,290 @@
---
sidebar_label: Files
title: Files
---
## Overview
The Files API provides file management capabilities for Llama Stack. It allows you to upload, store, retrieve, and manage files that can be used across various endpoints in your application.
## Features
- **File Upload**: Upload files with metadata and purpose classification
- **File Management**: List, retrieve, and delete files
- **Content Retrieval**: Access raw file content for processing
- **API Compatibility**: Full compatibility with OpenAI Files API endpoints
- **Flexible Storage**: Support for local filesystem and cloud storage backends
## API Endpoints
### Upload File
**POST** `/v1/openai/v1/files`
Upload a file that can be used across various endpoints.
**Request Body:**
- `file`: The file object to be uploaded (multipart form data)
- `purpose`: The intended purpose of the uploaded file
**Supported Purposes:**
- `batch`: Files for batch operations
**Response:**
```json
{
"id": "file-abc123",
"object": "file",
"bytes": 140,
"created_at": 1613779121,
"filename": "mydata.jsonl",
"purpose": "batch"
}
```
**Example:**
```python
import requests
with open("data.jsonl", "rb") as f:
files = {"file": f}
data = {"purpose": "batch"}
response = requests.post(
"http://localhost:8000/v1/openai/v1/files", files=files, data=data
)
file_info = response.json()
```
### List Files
**GET** `/v1/openai/v1/files`
Returns a list of files that belong to the user's organization.
**Query Parameters:**
- `after` (optional): A cursor for pagination
- `limit` (optional): Limit on number of objects (1-10,000, default: 10,000)
- `order` (optional): Sort order by created_at timestamp (`asc` or `desc`, default: `desc`)
- `purpose` (optional): Filter files by purpose
**Response:**
```json
{
"object": "list",
"data": [
{
"id": "file-abc123",
"object": "file",
"bytes": 140,
"created_at": 1613779121,
"filename": "mydata.jsonl",
"purpose": "fine-tune"
}
],
"has_more": false
}
```
**Example:**
```python
import requests
# List all files
response = requests.get("http://localhost:8000/v1/openai/v1/files")
files = response.json()
# List files with pagination
response = requests.get(
    "http://localhost:8000/v1/openai/v1/files",
    params={"limit": 10, "after": "file-abc123"},
)
files = response.json()
# Filter by purpose
response = requests.get(
    "http://localhost:8000/v1/openai/v1/files", params={"purpose": "fine-tune"}
)
files = response.json()
```
### Retrieve File
**GET** `/v1/openai/v1/files/{file_id}`
Returns information about a specific file.
**Path Parameters:**
- `file_id`: The ID of the file to retrieve
**Response:**
```json
{
"id": "file-abc123",
"object": "file",
"bytes": 140,
"created_at": 1613779121,
"filename": "mydata.jsonl",
"purpose": "fine-tune"
}
```
**Example:**
```python
import requests
file_id = "file-abc123"
response = requests.get(f"http://localhost:8000/v1/openai/v1/files/{file_id}")
file_info = response.json()
```
### Delete File
**DELETE** `/v1/openai/v1/files/{file_id}`
Delete a file.
**Path Parameters:**
- `file_id`: The ID of the file to delete
**Response:**
```json
{
"id": "file-abc123",
"object": "file",
"deleted": true
}
```
**Example:**
```python
import requests
file_id = "file-abc123"
response = requests.delete(f"http://localhost:8000/v1/openai/v1/files/{file_id}")
result = response.json()
```
### Retrieve File Content
**GET** `/v1/openai/v1/files/{file_id}/content`
Returns the raw file content as a binary response.
**Path Parameters:**
- `file_id`: The ID of the file to retrieve content from
**Response:**
Binary file content with appropriate headers:
- `Content-Type`: `application/octet-stream`
- `Content-Disposition`: `attachment; filename="filename"`
**Example:**
```python
import requests
file_id = "file-abc123"
response = requests.get(f"http://localhost:8000/v1/openai/v1/files/{file_id}/content")
# Save content to file
with open("downloaded_file.jsonl", "wb") as f:
f.write(response.content)
# Or process content directly
content = response.content
```
## Vector Store Integration
The Files API integrates with Vector Stores to enable document processing and search. For detailed information about this integration, see [File Operations and Vector Store Integration](../concepts/file_operations_vector_stores.md).
### Vector Store File Operations
**List Vector Store Files:**
- **GET** `/v1/openai/v1/vector_stores/{vector_store_id}/files`
**Retrieve Vector Store File Content:**
- **GET** `/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}/content`
**Attach File to Vector Store:**
- **POST** `/v1/openai/v1/vector_stores/{vector_store_id}/files`
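A minimal end-to-end sketch with the Python client, mirroring the pattern used elsewhere in these docs — upload a file, create a vector store, then attach the file (the filename is a placeholder):
```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8000")
# Upload a document, create a vector store, and attach the uploaded file to it.
with open("notes.txt", "rb") as f:
    uploaded = client.files.create(file=f, purpose="assistants")
store = client.vector_stores.create()
client.vector_stores.files.create(vector_store_id=store.id, file_id=uploaded.id)
```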
## Error Handling
The Files API returns standard HTTP status codes and error responses:
- `400 Bad Request`: Invalid request parameters
- `404 Not Found`: File not found
- `429 Too Many Requests`: Rate limit exceeded
- `500 Internal Server Error`: Server error
**Error Response Format:**
```json
{
"error": {
"message": "Error description",
"type": "invalid_request_error",
"code": "file_not_found"
}
}
```
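Since errors come back in this shape, client code can branch on the status code and surface the message; a small sketch with `requests`:
```python
import requests

resp = requests.get("http://localhost:8000/v1/openai/v1/files/file-does-not-exist")
if resp.status_code != 200:
    err = resp.json().get("error", {})
    print(f"Request failed ({resp.status_code}): {err.get('message')}")
else:
    print(resp.json())
```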
## Rate Limits
The Files API implements rate limiting to ensure fair usage:
- File uploads: 100 files per minute
- File retrievals: 1000 requests per minute
- File deletions: 100 requests per minute
## Best Practices
1. **File Organization**: Use descriptive filenames and appropriate purpose classifications
2. **Batch Operations**: For multiple files, consider using batch endpoints when available
3. **Error Handling**: Always check response status codes and handle errors gracefully
4. **Content Types**: Ensure files are uploaded with appropriate content types
5. **Cleanup**: Regularly delete unused files to manage storage costs
## Integration Examples
### With Python Client
```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8000")
# Upload a file
with open("data.jsonl", "rb") as f:
    file_info = client.files.create(file=f, purpose="fine-tune")
# List files
files = client.files.list(purpose="fine-tune")
# Retrieve file content
content = client.files.content(file_info.id)
```
### With cURL
```bash
# Upload file
curl -X POST http://localhost:8000/v1/openai/v1/files \
-F "file=@data.jsonl" \
-F "purpose=fine-tune"
# List files
curl http://localhost:8000/v1/openai/v1/files
# Download file content
curl http://localhost:8000/v1/openai/v1/files/file-abc123/content \
-o downloaded_file.jsonl
```
## Provider Support
The Files API supports multiple storage backends:
- **Local Filesystem**: Store files on local disk (inline provider)
- **S3**: Store files in AWS S3 or S3-compatible services (remote provider)
- **Custom Backends**: Extensible architecture for custom storage providers
See the [Files Providers](index.md) documentation for detailed configuration options.

View file

@ -1,7 +1,8 @@
---
description: "Files
description: |
Files
This API is used to upload documents that can be used with other Llama Stack APIs."
This API is used to upload documents that can be used with other Llama Stack APIs.
sidebar_label: Files
title: Files
---

Some files were not shown because too many files have changed in this diff.