Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-22 08:17:18 +00:00)

Compare commits: main ... v0.2.24-de (1 commit)

Commit 14a7ea21fe: 2204 changed files with 142655 additions and 943207 deletions
(file name not captured)
@@ -1,19 +0,0 @@
.venv
__pycache__
*.pyc
*.pyo
*.pyd
*.so
.git
.gitignore
htmlcov*
.coverage
coverage*
.cache
.mypy_cache
.pytest_cache
.ruff_cache
uv.lock
node_modules
build
/tmp
.gitattributes (vendored, 1 change)
@@ -1 +0,0 @@
tests/**/recordings/** linguist-generated=true
.github/CODEOWNERS (vendored, 2 changes)
@@ -2,4 +2,4 @@
# These owners will be the default owners for everything in
# the repo. Unless a later match takes precedence,
* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning @reluctantfuturist @mattf @slekkala1 @franciscojavierarceo
* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning @reluctantfuturist @mattf @slekkala1
.github/TRIAGERS.md (vendored, 1 change)
@@ -1 +1,2 @@
# This file documents Triage members in the Llama Stack community
@franciscojavierarceo
.github/actions/run-and-record-tests/action.yml (vendored, 16 changes)
@@ -54,10 +54,6 @@ runs:
SCRIPT_ARGS="$SCRIPT_ARGS --pattern ${{ inputs.pattern }}"
fi

echo "=== Running command ==="
echo "uv run --no-sync ./scripts/integration-tests.sh $SCRIPT_ARGS"
echo ""

uv run --no-sync ./scripts/integration-tests.sh $SCRIPT_ARGS | tee pytest-${{ inputs.inference-mode }}.log

@@ -66,11 +62,11 @@
shell: bash
run: |
echo "Checking for recording changes"
git status --porcelain tests/integration/
git status --porcelain tests/integration/recordings/

if [[ -n $(git status --porcelain tests/integration/) ]]; then
if [[ -n $(git status --porcelain tests/integration/recordings/) ]]; then
echo "New recordings detected, committing and pushing"
git add tests/integration/
git add tests/integration/recordings/

git commit -m "Recordings update from CI (suite: ${{ inputs.suite }})"
git fetch origin ${{ github.ref_name }}

@@ -82,13 +78,11 @@
echo "No recording changes"
fi

- name: Write docker logs to file
- name: Write inference logs to file
if: ${{ always() }}
shell: bash
run: |
# Ollama logs (if ollama container exists)
sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log 2>&1 || true
# Note: distro container logs are now dumped in integration-tests.sh before container is removed
sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log || true

- name: Upload logs
if: ${{ always() }}
(file name not captured)
@@ -57,7 +57,7 @@ runs:
echo "Building Llama Stack"

LLAMA_STACK_DIR=. \
uv run --no-sync llama stack list-deps ci-tests | xargs -L1 uv pip install
uv run --no-sync llama stack build --template ci-tests --image-type venv

- name: Configure git for commits
shell: bash
.github/workflows/README.md (vendored, 2 changes)
@@ -12,9 +12,7 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a tabl
| Integration Tests (Replay) | [integration-tests.yml](integration-tests.yml) | Run the integration test suites from tests/integration in replay mode |
| Vector IO Integration Tests | [integration-vector-io-tests.yml](integration-vector-io-tests.yml) | Run the integration test suite with various VectorIO providers |
| Pre-commit | [pre-commit.yml](pre-commit.yml) | Run pre-commit checks |
| Pre-commit Bot | [precommit-trigger.yml](precommit-trigger.yml) | Pre-commit bot for PR |
| Test Llama Stack Build | [providers-build.yml](providers-build.yml) | Test llama stack build |
| Test llama stack list-deps | [providers-list-deps.yml](providers-list-deps.yml) | Test llama stack list-deps |
| Python Package Build Test | [python-build-test.yml](python-build-test.yml) | Test building the llama-stack PyPI project |
| Integration Tests (Record) | [record-integration-tests.yml](record-integration-tests.yml) | Run the integration test suite from tests/integration |
| Check semantic PR titles | [semantic-pr.yml](semantic-pr.yml) | Ensure that PR titles follow the conventional commit spec |
.github/workflows/conformance.yml (vendored, 4 changes)
@@ -43,9 +43,9 @@ jobs:
# Check if we should skip conformance testing due to breaking changes
- name: Check if conformance test should be skipped
id: skip-check
env:
PR_TITLE: ${{ github.event.pull_request.title }}
run: |
PR_TITLE="${{ github.event.pull_request.title }}"

# Skip if title contains "!:" indicating breaking change (like "feat!:")
if [[ "$PR_TITLE" == *"!:"* ]]; then
echo "skip=true" >> $GITHUB_OUTPUT
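
The skip check above is plain bash substring matching against the conventional-commit breaking-change marker. A minimal standalone illustration (the example titles are made up):

```bash
# Titles containing "!:" (e.g. "feat!: ...") are treated as breaking changes.
for title in "feat!: drop legacy routes" "fix: correct a typo"; do
  if [[ "$title" == *"!:"* ]]; then
    echo "skip conformance for: $title"
  fi
done
```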
.github/workflows/install-script-ci.yml (vendored, 7 changes)
@@ -30,11 +30,8 @@ jobs:

- name: Build a single provider
run: |
docker build . \
-f containers/Containerfile \
--build-arg INSTALL_MODE=editable \
--build-arg DISTRO_NAME=starter \
--tag llama-stack:starter-ci
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync \
llama stack build --template starter --image-type container --image-name test

- name: Run installer end-to-end
run: |
.github/workflows/integration-auth-tests.yml (vendored, 50 changes)
@@ -73,24 +73,6 @@ jobs:
image_name: kube
apis: []
providers: {}
storage:
backends:
kv_default:
type: kv_sqlite
db_path: $run_dir/kvstore.db
sql_default:
type: sql_sqlite
db_path: $run_dir/sql_store.db
stores:
metadata:
namespace: registry
backend: kv_default
inference:
table_name: inference_store
backend: sql_default
conversations:
table_name: openai_conversations
backend: sql_default
server:
port: 8321
EOF

@@ -102,16 +84,13 @@
yq eval '.server.auth.provider_config.jwks.token = "${{ env.TOKEN }}"' -i $run_dir/run.yaml
cat $run_dir/run.yaml

# avoid line breaks in the server log, especially because we grep it below.
export LLAMA_STACK_LOG_WIDTH=200
nohup uv run llama stack run $run_dir/run.yaml > server.log 2>&1 &
nohup uv run llama stack run $run_dir/run.yaml --image-type venv > server.log 2>&1 &

- name: Wait for Llama Stack server to be ready
run: |
echo "Waiting for Llama Stack server..."
for i in {1..30}; do
# Note: /v1/health does not require authentication
if curl -s -L http://localhost:8321/v1/health | grep -q "OK"; then
if curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://localhost:8321/v1/health | grep -q "OK"; then
echo "Llama Stack server is up!"
if grep -q "Enabling authentication with provider: ${{ matrix.auth-provider }}" server.log; then
echo "Llama Stack server is configured to use ${{ matrix.auth-provider }} auth"

@@ -130,27 +109,4 @@

- name: Test auth
run: |
echo "Testing /v1/version without token (should succeed)..."
if curl -s -L -o /dev/null -w "%{http_code}" http://127.0.0.1:8321/v1/version | grep -q "200"; then
echo "/v1/version accessible without token (200)"
else
echo "/v1/version returned non-200 status without token"
exit 1
fi

echo "Testing /v1/providers without token (should fail with 401)..."
if curl -s -L -o /dev/null -w "%{http_code}" http://127.0.0.1:8321/v1/providers | grep -q "401"; then
echo "/v1/providers blocked without token (401)"
else
echo "/v1/providers did not return 401 without token"
exit 1
fi

echo "Testing /v1/providers with valid token (should succeed)..."
curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers | jq
if [ $? -eq 0 ]; then
echo "/v1/providers accessible with valid token"
else
echo "/v1/providers failed with valid token"
exit 1
fi
curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers|jq
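
The auth workflow above injects the JWKS token into the generated run.yaml with yq before starting the server. A minimal sketch of that injection step outside CI, assuming yq v4 and a local run.yaml; the token value is a placeholder:

```bash
export TOKEN="dummy-jwt"   # placeholder; the workflow uses a real service-account token
yq eval '.server.auth.provider_config.jwks.token = strenv(TOKEN)' -i run.yaml
yq eval '.server.auth' run.yaml   # inspect the injected block
```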
.github/workflows/integration-tests.yml (vendored, 31 changes)
@@ -42,27 +42,18 @@ jobs:

run-replay-mode-tests:
runs-on: ubuntu-latest
name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, {4})', matrix.client-type, matrix.config.setup, matrix.python-version, matrix.client-version, matrix.config.suite) }}
name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, {4})', matrix.client-type, matrix.setup, matrix.python-version, matrix.client-version, matrix.suite) }}

strategy:
fail-fast: false
matrix:
client-type: [library, server, docker]
client-type: [library, server]
# Use vllm on weekly schedule, otherwise use test-setup input (defaults to ollama)
setup: ${{ (github.event.schedule == '1 0 * * 0') && fromJSON('["vllm"]') || fromJSON(format('["{0}"]', github.event.inputs.test-setup || 'ollama')) }}
# Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
# Define (setup, suite) pairs - they are always matched and cannot be independent
# Weekly schedule (Sun 1 AM): vllm+base
# Input test-setup=ollama-vision: ollama-vision+vision
# Default (including test-setup=ollama): ollama+base, ollama-vision+vision, gpt+responses
config: >-
${{
github.event.schedule == '1 0 * * 0'
&& fromJSON('[{"setup": "vllm", "suite": "base"}]')
|| github.event.inputs.test-setup == 'ollama-vision'
&& fromJSON('[{"setup": "ollama-vision", "suite": "vision"}]')
|| fromJSON('[{"setup": "ollama", "suite": "base"}, {"setup": "ollama-vision", "suite": "vision"}]')
}}
suite: [base, vision]

steps:
- name: Checkout repository

@@ -73,16 +64,14 @@
with:
python-version: ${{ matrix.python-version }}
client-version: ${{ matrix.client-version }}
setup: ${{ matrix.config.setup }}
suite: ${{ matrix.config.suite }}
setup: ${{ matrix.setup }}
suite: ${{ matrix.suite }}
inference-mode: 'replay'

- name: Run tests
uses: ./.github/actions/run-and-record-tests
env:
OPENAI_API_KEY: dummy
with:
stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || matrix.client-type == 'server' && 'server:ci-tests' || 'docker:ci-tests' }}
setup: ${{ matrix.config.setup }}
stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }}
setup: ${{ matrix.setup }}
inference-mode: 'replay'
suite: ${{ matrix.config.suite }}
suite: ${{ matrix.suite }}
(file name not captured)
@@ -144,7 +144,7 @@

- name: Build Llama Stack
run: |
uv run --no-sync llama stack list-deps ci-tests | xargs -L1 uv pip install
uv run --no-sync llama stack build --template ci-tests --image-type venv

- name: Check Storage and Memory Available Before Tests
if: ${{ always() }}

@@ -169,7 +169,8 @@
run: |
uv run --no-sync \
pytest -sv --stack-config="files=inline::localfs,inference=inline::sentence-transformers,vector_io=${{ matrix.vector-io-provider }}" \
tests/integration/vector_io
tests/integration/vector_io \
--embedding-model inline::sentence-transformers/all-MiniLM-L6-v2

- name: Check Storage and Memory Available After Tests
if: ${{ always() }}
.github/workflows/pre-commit.yml (vendored, 2 changes)
@@ -37,7 +37,7 @@ jobs:
.pre-commit-config.yaml

- name: Set up Node.js
uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0
with:
node-version: '20'
cache: 'npm'
.github/workflows/precommit-trigger.yml (vendored, 227 changes)
@@ -1,227 +0,0 @@
name: Pre-commit Bot

run-name: Pre-commit bot for PR #${{ github.event.issue.number }}

on:
issue_comment:
types: [created]

jobs:
pre-commit:
# Only run on pull request comments
if: github.event.issue.pull_request && contains(github.event.comment.body, '@github-actions run precommit')
runs-on: ubuntu-latest
permissions:
contents: write
pull-requests: write

steps:
- name: Check comment author and get PR details
id: check_author
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
// Get PR details
const pr = await github.rest.pulls.get({
owner: context.repo.owner,
repo: context.repo.repo,
pull_number: context.issue.number
});

// Check if commenter has write access or is the PR author
const commenter = context.payload.comment.user.login;
const prAuthor = pr.data.user.login;

let hasPermission = false;

// Check if commenter is PR author
if (commenter === prAuthor) {
hasPermission = true;
console.log(`Comment author ${commenter} is the PR author`);
} else {
// Check if commenter has write/admin access
try {
const permission = await github.rest.repos.getCollaboratorPermissionLevel({
owner: context.repo.owner,
repo: context.repo.repo,
username: commenter
});

const level = permission.data.permission;
hasPermission = ['write', 'admin', 'maintain'].includes(level);
console.log(`Comment author ${commenter} has permission: ${level}`);
} catch (error) {
console.log(`Could not check permissions for ${commenter}: ${error.message}`);
}
}

if (!hasPermission) {
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: `❌ @${commenter} You don't have permission to trigger pre-commit. Only PR authors or repository collaborators can run this command.`
});
core.setFailed(`User ${commenter} does not have permission`);
return;
}

// Save PR info for later steps
core.setOutput('pr_number', context.issue.number);
core.setOutput('pr_head_ref', pr.data.head.ref);
core.setOutput('pr_head_sha', pr.data.head.sha);
core.setOutput('pr_head_repo', pr.data.head.repo.full_name);
core.setOutput('pr_base_ref', pr.data.base.ref);
core.setOutput('is_fork', pr.data.head.repo.full_name !== context.payload.repository.full_name);
core.setOutput('authorized', 'true');

- name: React to comment
if: steps.check_author.outputs.authorized == 'true'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.reactions.createForIssueComment({
owner: context.repo.owner,
repo: context.repo.repo,
comment_id: context.payload.comment.id,
content: 'rocket'
});

- name: Comment starting
if: steps.check_author.outputs.authorized == 'true'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: ${{ steps.check_author.outputs.pr_number }},
body: `⏳ Running [pre-commit hooks](https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}) on PR #${{ steps.check_author.outputs.pr_number }}...`
});

- name: Checkout PR branch (same-repo)
if: steps.check_author.outputs.authorized == 'true' && steps.check_author.outputs.is_fork == 'false'
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
ref: ${{ steps.check_author.outputs.pr_head_ref }}
fetch-depth: 0
token: ${{ secrets.GITHUB_TOKEN }}

- name: Checkout PR branch (fork)
if: steps.check_author.outputs.authorized == 'true' && steps.check_author.outputs.is_fork == 'true'
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
repository: ${{ steps.check_author.outputs.pr_head_repo }}
ref: ${{ steps.check_author.outputs.pr_head_ref }}
fetch-depth: 0
token: ${{ secrets.GITHUB_TOKEN }}

- name: Verify checkout
if: steps.check_author.outputs.authorized == 'true'
run: |
echo "Current SHA: $(git rev-parse HEAD)"
echo "Expected SHA: ${{ steps.check_author.outputs.pr_head_sha }}"
if [[ "$(git rev-parse HEAD)" != "${{ steps.check_author.outputs.pr_head_sha }}" ]]; then
echo "::error::Checked out SHA does not match expected SHA"
exit 1
fi

- name: Set up Python
if: steps.check_author.outputs.authorized == 'true'
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
python-version: '3.12'
cache: pip
cache-dependency-path: |
**/requirements*.txt
.pre-commit-config.yaml

- name: Set up Node.js
if: steps.check_author.outputs.authorized == 'true'
uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
with:
node-version: '20'
cache: 'npm'
cache-dependency-path: 'llama_stack/ui/'

- name: Install npm dependencies
if: steps.check_author.outputs.authorized == 'true'
run: npm ci
working-directory: llama_stack/ui

- name: Run pre-commit
if: steps.check_author.outputs.authorized == 'true'
id: precommit
uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
continue-on-error: true
env:
SKIP: no-commit-to-branch
RUFF_OUTPUT_FORMAT: github

- name: Check for changes
if: steps.check_author.outputs.authorized == 'true'
id: changes
run: |
if ! git diff --exit-code || [ -n "$(git ls-files --others --exclude-standard)" ]; then
echo "has_changes=true" >> $GITHUB_OUTPUT
echo "Changes detected after pre-commit"
else
echo "has_changes=false" >> $GITHUB_OUTPUT
echo "No changes after pre-commit"
fi

- name: Commit and push changes
if: steps.check_author.outputs.authorized == 'true' && steps.changes.outputs.has_changes == 'true'
run: |
git config --local user.email "github-actions[bot]@users.noreply.github.com"
git config --local user.name "github-actions[bot]"

git add -A
git commit -m "style: apply pre-commit fixes

🤖 Applied by @github-actions bot via pre-commit workflow"

# Push changes
git push origin HEAD:${{ steps.check_author.outputs.pr_head_ref }}

- name: Comment success with changes
if: steps.check_author.outputs.authorized == 'true' && steps.changes.outputs.has_changes == 'true'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: ${{ steps.check_author.outputs.pr_number }},
body: `✅ Pre-commit hooks completed successfully!\n\n🔧 Changes have been committed and pushed to the PR branch.`
});

- name: Comment success without changes
if: steps.check_author.outputs.authorized == 'true' && steps.changes.outputs.has_changes == 'false' && steps.precommit.outcome == 'success'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: ${{ steps.check_author.outputs.pr_number }},
body: `✅ Pre-commit hooks passed!\n\n✨ No changes needed - your code is already formatted correctly.`
});

- name: Comment failure
if: failure()
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: ${{ steps.check_author.outputs.pr_number }},
body: `❌ Pre-commit workflow failed!\n\nPlease check the [workflow logs](https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}) for details.`
});
.github/workflows/providers-build.yml (vendored, 57 changes)
@@ -14,8 +14,6 @@ on:
- '.github/workflows/providers-build.yml'
- 'llama_stack/distributions/**'
- 'pyproject.toml'
- 'containers/Containerfile'
- '.dockerignore'

pull_request:
paths:

@@ -26,8 +24,6 @@ on:
- '.github/workflows/providers-build.yml'
- 'llama_stack/distributions/**'
- 'pyproject.toml'
- 'containers/Containerfile'
- '.dockerignore'

concurrency:
group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}

@@ -64,19 +60,15 @@ jobs:
- name: Install dependencies
uses: ./.github/actions/setup-runner

- name: Install distribution into venv
if: matrix.image-type == 'venv'
- name: Print build dependencies
run: |
uv run llama stack list-deps ${{ matrix.distro }} | xargs -L1 uv pip install
uv run llama stack build --distro ${{ matrix.distro }} --image-type ${{ matrix.image-type }} --image-name test --print-deps-only

- name: Build container image
if: matrix.image-type == 'container'
- name: Run Llama Stack Build
run: |
docker build . \
-f containers/Containerfile \
--build-arg INSTALL_MODE=editable \
--build-arg DISTRO_NAME=${{ matrix.distro }} \
--tag llama-stack:${{ matrix.distro }}-ci
# USE_COPY_NOT_MOUNT is set to true since mounting is not supported by docker buildx, we use COPY instead
# LLAMA_STACK_DIR is set to the current directory so we are building from the source
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --distro ${{ matrix.distro }} --image-type ${{ matrix.image-type }} --image-name test

- name: Print dependencies in the image
if: matrix.image-type == 'venv'

@@ -94,8 +86,8 @@

- name: Build a single provider
run: |
uv pip install -e .
uv run --no-sync llama stack list-deps --providers inference=remote::ollama | xargs -L1 uv pip install
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --image-type venv --image-name test --providers inference=remote::ollama

build-custom-container-distribution:
runs-on: ubuntu-latest
steps:

@@ -105,16 +97,11 @@
- name: Install dependencies
uses: ./.github/actions/setup-runner

- name: Build container image
- name: Build a single provider
run: |
BASE_IMAGE=$(yq -r '.distribution_spec.container_image // "python:3.12-slim"' llama_stack/distributions/ci-tests/build.yaml)
docker build . \
-f containers/Containerfile \
--build-arg INSTALL_MODE=editable \
--build-arg DISTRO_NAME=ci-tests \
--build-arg BASE_IMAGE="$BASE_IMAGE" \
--build-arg RUN_CONFIG_PATH=/workspace/llama_stack/distributions/ci-tests/run.yaml \
-t llama-stack:ci-tests
yq -i '.image_type = "container"' llama_stack/distributions/ci-tests/build.yaml
yq -i '.image_name = "test"' llama_stack/distributions/ci-tests/build.yaml
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config llama_stack/distributions/ci-tests/build.yaml

- name: Inspect the container image entrypoint
run: |

@@ -125,7 +112,7 @@
fi
entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
echo "Entrypoint: $entrypoint"
if [ "$entrypoint" != "[/usr/local/bin/llama-stack-entrypoint.sh]" ]; then
if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then
echo "Entrypoint is not correct"
exit 1
fi

@@ -142,19 +129,17 @@
- name: Pin distribution to UBI9 base
run: |
yq -i '
.image_type = "container" |
.image_name = "ubi9-test" |
.distribution_spec.container_image = "registry.access.redhat.com/ubi9:latest"
' llama_stack/distributions/ci-tests/build.yaml

- name: Build UBI9 container image
- name: Build dev container (UBI9)
env:
USE_COPY_NOT_MOUNT: "true"
LLAMA_STACK_DIR: "."
run: |
BASE_IMAGE=$(yq -r '.distribution_spec.container_image // "registry.access.redhat.com/ubi9:latest"' llama_stack/distributions/ci-tests/build.yaml)
docker build . \
-f containers/Containerfile \
--build-arg INSTALL_MODE=editable \
--build-arg DISTRO_NAME=ci-tests \
--build-arg BASE_IMAGE="$BASE_IMAGE" \
--build-arg RUN_CONFIG_PATH=/workspace/llama_stack/distributions/ci-tests/run.yaml \
-t llama-stack:ci-tests-ubi9
uv run llama stack build --config llama_stack/distributions/ci-tests/build.yaml

- name: Inspect UBI9 image
run: |

@@ -165,7 +150,7 @@
fi
entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
echo "Entrypoint: $entrypoint"
if [ "$entrypoint" != "[/usr/local/bin/llama-stack-entrypoint.sh]" ]; then
if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then
echo "Entrypoint is not correct"
exit 1
fi
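
The entrypoint assertions above imply the built images start the stack server directly. A hedged smoke-test sketch for one of these images; the 8321 port matches the server defaults seen elsewhere in this diff, and the fixed sleep is a simplification of the retry loops the workflows use:

```bash
docker run -d --rm --name llama-stack-smoke -p 8321:8321 llama-stack:ci-tests
sleep 10   # crude wait; the workflows poll /v1/health in a loop instead
curl -s http://localhost:8321/v1/health
docker stop llama-stack-smoke
```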
.github/workflows/providers-list-deps.yml (vendored, 105 changes)
@@ -1,105 +0,0 @@
name: Test llama stack list-deps

run-name: Test llama stack list-deps

on:
push:
branches:
- main
paths:
- 'llama_stack/cli/stack/list_deps.py'
- 'llama_stack/cli/stack/_list_deps.py'
- 'llama_stack/core/build.*'
- 'llama_stack/core/*.sh'
- '.github/workflows/providers-list-deps.yml'
- 'llama_stack/templates/**'
- 'pyproject.toml'

pull_request:
paths:
- 'llama_stack/cli/stack/list_deps.py'
- 'llama_stack/cli/stack/_list_deps.py'
- 'llama_stack/core/build.*'
- 'llama_stack/core/*.sh'
- '.github/workflows/providers-list-deps.yml'
- 'llama_stack/templates/**'
- 'pyproject.toml'

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
generate-matrix:
runs-on: ubuntu-latest
outputs:
distros: ${{ steps.set-matrix.outputs.distros }}
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

- name: Generate Distribution List
id: set-matrix
run: |
distros=$(ls llama_stack/distributions/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
echo "distros=$distros" >> "$GITHUB_OUTPUT"

list-deps:
needs: generate-matrix
runs-on: ubuntu-latest
strategy:
matrix:
distro: ${{ fromJson(needs.generate-matrix.outputs.distros) }}
image-type: [venv, container]
fail-fast: false # We want to run all jobs even if some fail

steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

- name: Install dependencies
uses: ./.github/actions/setup-runner

- name: Print dependencies
run: |
uv run llama stack list-deps ${{ matrix.distro }}

- name: Install Distro using llama stack list-deps
run: |
# USE_COPY_NOT_MOUNT is set to true since mounting is not supported by docker buildx, we use COPY instead
# LLAMA_STACK_DIR is set to the current directory so we are building from the source
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack list-deps ${{ matrix.distro }} | xargs -L1 uv pip install

- name: Print dependencies in the image
if: matrix.image-type == 'venv'
run: |
uv pip list

show-single-provider:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

- name: Install dependencies
uses: ./.github/actions/setup-runner

- name: Show a single provider
run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack list-deps --providers inference=remote::ollama

list-deps-from-config:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

- name: Install dependencies
uses: ./.github/actions/setup-runner

- name: list-des from Config
env:
USE_COPY_NOT_MOUNT: "true"
LLAMA_STACK_DIR: "."
run: |
uv run llama stack list-deps llama_stack/distributions/ci-tests/build.yaml
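
The generate-matrix job in the deleted workflow above builds its distro list with an ls/awk/jq pipeline. A standalone illustration of the jq step (the input names are made up):

```bash
# -R reads raw lines, -s slurps them into one string, and the filter splits on
# newlines and drops the trailing empty element, yielding a compact JSON array.
printf 'starter\nci-tests\n' | jq -R -s -c 'split("\n")[:-1]'
# -> ["starter","ci-tests"]
```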
.github/workflows/python-build-test.yml (vendored, 4 changes)
@@ -24,7 +24,7 @@ jobs:
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

- name: Install uv
uses: astral-sh/setup-uv@3259c6206f993105e3a61b142c2d97bf4b9ef83d # v7.1.0
uses: astral-sh/setup-uv@b75a909f75acd358c2196fb9a5f1299a9a8868a4 # v6.7.0
with:
python-version: ${{ matrix.python-version }}
activate-environment: true

@@ -43,5 +43,7 @@
uv pip list
uv pip show llama-stack
command -v llama
llama model prompt-format -m Llama3.2-90B-Vision-Instruct
llama model list
llama stack list-apis
llama stack list-providers inference
(file name not captured)
@@ -61,9 +61,6 @@

- name: Run and record tests
uses: ./.github/actions/run-and-record-tests
env:
# Set OPENAI_API_KEY if using gpt setup
OPENAI_API_KEY: ${{ inputs.test-setup == 'gpt' && secrets.OPENAI_API_KEY || '' }}
with:
stack-config: 'server:ci-tests' # recording must be done with server since more tests are run
setup: ${{ inputs.test-setup || 'ollama' }}
.github/workflows/stale_bot.yml (vendored, 2 changes)
@@ -24,7 +24,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Stale Action
uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
with:
stale-issue-label: 'stale'
stale-issue-message: >
(file name not captured)
@@ -46,9 +46,9 @@
yq -i '.image_type = "${{ matrix.image-type }}"' tests/external/ramalama-stack/run.yaml
cat tests/external/ramalama-stack/run.yaml

- name: Install distribution dependencies
- name: Build distro from config file
run: |
uv run llama stack list-deps tests/external/ramalama-stack/build.yaml | xargs -L1 uv pip install
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external/ramalama-stack/build.yaml

- name: Start Llama Stack server in background
if: ${{ matrix.image-type }} == 'venv'

@@ -59,7 +59,7 @@
# Use the virtual environment created by the build step (name comes from build config)
source ramalama-stack-test/bin/activate
uv pip list
nohup llama stack run tests/external/ramalama-stack/run.yaml > server.log 2>&1 &
nohup llama stack run tests/external/ramalama-stack/run.yaml --image-type ${{ matrix.image-type }} > server.log 2>&1 &

- name: Wait for Llama Stack server to be ready
run: |
.github/workflows/test-external.yml (vendored, 9 changes)
@@ -44,14 +44,11 @@

- name: Print distro dependencies
run: |
uv run --no-sync llama stack list-deps tests/external/build.yaml
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml --print-deps-only

- name: Build distro from config file
run: |
uv venv ci-test
source ci-test/bin/activate
uv pip install -e .
LLAMA_STACK_LOGGING=all=CRITICAL llama stack list-deps tests/external/build.yaml | xargs -L1 uv pip install
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml

- name: Start Llama Stack server in background
if: ${{ matrix.image-type }} == 'venv'

@@ -62,7 +59,7 @@
# Use the virtual environment created by the build step (name comes from build config)
source ci-test/bin/activate
uv pip list
nohup llama stack run tests/external/run-byoa.yaml > server.log 2>&1 &
nohup llama stack run tests/external/run-byoa.yaml --image-type ${{ matrix.image-type }} > server.log 2>&1 &

- name: Wait for Llama Stack server to be ready
run: |
.github/workflows/ui-unit-tests.yml (vendored, 2 changes)
@@ -29,7 +29,7 @@ jobs:
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

- name: Setup Node.js
uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0
with:
node-version: ${{ matrix.node-version }}
cache: 'npm'
(file name not captured)
@@ -11,17 +11,14 @@ You can install the dependencies by running:

```bash
cd llama-stack
uv venv --python 3.12
uv sync --group dev
uv pip install -e .
source .venv/bin/activate
```

```{note}
If you are making changes to Llama Stack, it is essential that you use Python 3.12 as shown above.
Llama Stack can work with Python 3.13 but the pre-commit hooks used to validate code changes only work with Python 3.12.
If you don't specify a Python version, `uv` will automatically select a Python version according to the `requires-python`
section of the `pyproject.toml`, which is fine for running Llama Stack but not for committing changes.
You can use a specific version of Python with `uv` by adding the `--python <version>` flag (e.g. `--python 3.12`).
Otherwise, `uv` will automatically select a Python version according to the `requires-python` section of the `pyproject.toml`.
For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/).
```
@@ -45,22 +42,17 @@ uv run --env-file .env -- pytest -v tests/integration/inference/test_text_infere

We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running:

```bash
uv pip install pre-commit==4.3.0
uv run pre-commit install
```

Note that the only version of pre-commit that works with the Llama Stack continuous integration is `4.3.0` so it is essential that you pull
that specific version as shown above. Once you have run these commands, pre-commit hooks will run automatically before each commit.
After that, pre-commit hooks will run automatically before each commit.

Alternatively, if you don't want to install the pre-commit hooks (or if you want to check if your changes are ready before committing),
you can run the checks manually by running:
Alternatively, if you don't want to install the pre-commit hooks, you can run the checks manually by running:

```bash
uv run pre-commit run --all-files -v
uv run pre-commit run --all-files
```

The `-v` (verbose) parameter is optional but often helpful for getting more information about any issues that the pre-commit checks identify.

```{caution}
Before pushing your changes, make sure that the pre-commit hooks have passed successfully.
```
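
As a side note to the pre-commit guidance above: pre-commit can also run a single hook by id while you iterate, which is often faster than the full suite. A hedged sketch; the hook id `ruff` is an assumption about this repo's `.pre-commit-config.yaml`:

```bash
uv run pre-commit run ruff --all-files   # run just one hook (hook id assumed)
```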
@@ -91,7 +83,6 @@ If you are new to the project, start by looking at the issues tagged with "good
leave a comment on the issue and a triager will assign it to you.

Please avoid picking up too many issues at once. This helps you stay focused and ensures that others in the community also have opportunities to contribute.

- Try to work on only 1–2 issues at a time, especially if you’re still getting familiar with the codebase.
- Before taking an issue, check if it’s already assigned or being actively discussed.
- If you’re blocked or can’t continue with an issue, feel free to unassign yourself or leave a comment so others can step in.
@@ -167,9 +158,9 @@ under the LICENSE file in the root directory of this source tree.

Some tips about common tasks you work on while contributing to Llama Stack:

### Installing dependencies of distributions
### Using `llama stack build`

When installing dependencies for a distribution, you can use `llama stack list-deps` to view and install the required packages.
Building a stack image will use the production version of the `llama-stack` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_STACK_CLIENT_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands.

Example:
```bash

@@ -177,12 +168,7 @@ cd work/
git clone https://github.com/llamastack/llama-stack.git
git clone https://github.com/llamastack/llama-stack-client-python.git
cd llama-stack

# Show dependencies for a distribution
llama stack list-deps <distro-name>

# Install dependencies
llama stack list-deps <distro-name> | xargs -L1 uv pip install
LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama stack build --distro <...>
```

### Updating distribution configurations
@@ -205,7 +191,6 @@ If you are making changes to the documentation at [https://llamastack.github.io/

```bash
# This rebuilds the documentation pages and the OpenAPI spec.
cd docs/
npm install
npm run gen-api-docs all
npm run build
```
README.md (11 changes)
@@ -7,7 +7,7 @@
[](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml?query=branch%3Amain)
[](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml?query=branch%3Amain)

[**Quick Start**](https://llamastack.github.io/docs/getting_started/quickstart) | [**Documentation**](https://llamastack.github.io/docs) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)
[**Quick Start**](https://llamastack.github.io/latest/getting_started/index.html) | [**Documentation**](https://llamastack.github.io/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)

### ✨🎉 Llama 4 Support 🎉✨

@@ -25,13 +25,10 @@ pip install -U llama_stack

MODEL="Llama-4-Scout-17B-16E-Instruct"
# get meta url from llama.com
huggingface-cli download meta-llama/$MODEL --local-dir ~/.llama/$MODEL

# install dependencies for the distribution
llama stack list-deps meta-reference-gpu | xargs -L1 uv pip install
llama model download --source meta --model-id $MODEL --meta-url <META_URL>

# start a llama stack server
INFERENCE_MODEL=meta-llama/$MODEL llama stack run meta-reference-gpu
INFERENCE_MODEL=meta-llama/$MODEL llama stack build --run --template meta-reference-gpu

# install client to interact with the server
pip install llama-stack-client

@@ -92,7 +89,7 @@ As more providers start supporting Llama 4, you can use them in Llama Stack as w
To try Llama Stack locally, run:

```bash
curl -LsSf https://github.com/llamastack/llama-stack/raw/main/scripts/install.sh | bash
curl -LsSf https://github.com/meta-llama/llama-stack/raw/main/scripts/install.sh | bash
```

### Overview
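
Once the quick-start server above is running, the OpenAI-compatible chat endpoint (POST /v1/chat/completions, per the Stainless configuration later in this diff) can be exercised directly. A hedged sketch; the model id reuses the one downloaded in the quick start and the port assumes the server default:

```bash
curl -s http://localhost:8321/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{
        "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
        "messages": [{"role": "user", "content": "Hello!"}]
      }' | jq -r '.choices[0].message.content'
```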
(file name not captured)
@@ -98,34 +98,25 @@ data:
- provider_id: model-context-protocol
provider_type: remote::model-context-protocol
config: {}
storage:
backends:
kv_default:
type: kv_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
sql_default:
type: sql_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
references:
metadata:
backend: kv_default
namespace: registry
inference:
backend: sql_default
table_name: inference_store
metadata_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
table_name: llamastack_kvstore
inference_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
models:
- metadata:
embedding_dimension: 768
model_id: nomic-embed-text-v1.5
embedding_dimension: 384
model_id: all-MiniLM-L6-v2
provider_id: sentence-transformers
model_type: embedding
- model_id: ${env.INFERENCE_MODEL}

@@ -146,4 +137,5 @@
port: 8323
kind: ConfigMap
metadata:
creationTimestamp: null
name: llama-stack-config
(file name not captured)
@@ -95,34 +95,25 @@ providers:
- provider_id: model-context-protocol
provider_type: remote::model-context-protocol
config: {}
storage:
backends:
kv_default:
type: kv_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
sql_default:
type: sql_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
references:
metadata:
backend: kv_default
namespace: registry
inference:
backend: sql_default
table_name: inference_store
metadata_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
table_name: llamastack_kvstore
inference_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
models:
- metadata:
embedding_dimension: 768
model_id: nomic-embed-text-v1.5
embedding_dimension: 384
model_id: all-MiniLM-L6-v2
provider_id: sentence-transformers
model_type: embedding
- model_id: ${env.INFERENCE_MODEL}
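
The `${env.VAR:=default}` placeholders in the Postgres blocks above are resolved from the server's environment at startup, with the value after `:=` used when the variable is unset. A hedged sketch of overriding them; the run.yaml path and values are placeholders:

```bash
export POSTGRES_HOST=db.internal.example
export POSTGRES_PASSWORD=s3cret   # placeholder secret
llama stack run run.yaml          # unset vars (e.g. POSTGRES_PORT) keep their := defaults
```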
(file name not captured)
@@ -1,8 +0,0 @@
These are the source-of-truth configuration files used to generate the Stainless client SDKs via Stainless.

- `openapi.yml`: this is the OpenAPI specification for the Llama Stack API.
- `openapi.stainless.yml`: this is the Stainless _configuration_ which instructs Stainless how to generate the client SDKs.

A small side note: notice the `.yml` suffixes since Stainless uses that suffix typically for its configuration files.

These files go hand-in-hand. As of now, only the `openapi.yml` file is automatically generated using the `run_openapi_generator.sh` script.
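
The `pagination` section of the configuration below defines a cursor scheme (`openai_cursor_page`: `limit`/`after` request params; `data`, `has_more`, `last_id` response fields) from which Stainless generates auto-pagination helpers. A hedged sketch of that protocol driven by hand, assuming `/v1/responses` matches the cursor scheme and the server runs on the default local port:

```bash
after=""
while :; do
  resp=$(curl -s "http://localhost:8321/v1/responses?limit=20${after:+&after=$after}")
  echo "$resp" | jq -r '.data[].id'                       # consume one page
  [ "$(echo "$resp" | jq -r '.has_more')" = "true" ] || break
  after=$(echo "$resp" | jq -r '.last_id')                # cursor for the next page
done
```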
(the Stainless configuration, `openapi.stainless.yml`, per the README above)
@@ -1,610 +0,0 @@
# yaml-language-server: $schema=https://app.stainlessapi.com/config-internal.schema.json

organization:
# Name of your organization or company, used to determine the name of the client
# and headings.
name: llama-stack-client
docs: https://llama-stack.readthedocs.io/en/latest/
contact: llamastack@meta.com
security:
- {}
- BearerAuth: []
security_schemes:
BearerAuth:
type: http
scheme: bearer
# `targets` define the output targets and their customization options, such as
# whether to emit the Node SDK and what it's package name should be.
targets:
node:
package_name: llama-stack-client
production_repo: llamastack/llama-stack-client-typescript
publish:
npm: false
python:
package_name: llama_stack_client
production_repo: llamastack/llama-stack-client-python
options:
use_uv: true
publish:
pypi: true
project_name: llama_stack_client
kotlin:
reverse_domain: com.llama_stack_client.api
production_repo: null
publish:
maven: false
go:
package_name: llama-stack-client
production_repo: llamastack/llama-stack-client-go
options:
enable_v2: true
back_compat_use_shared_package: false

# `client_settings` define settings for the API client, such as extra constructor
# arguments (used for authentication), retry behavior, idempotency, etc.
client_settings:
default_env_prefix: LLAMA_STACK_CLIENT
opts:
api_key:
type: string
read_env: LLAMA_STACK_CLIENT_API_KEY
auth: { security_scheme: BearerAuth }
nullable: true

# `environments` are a map of the name of the environment (e.g. "sandbox",
# "production") to the corresponding url to use.
environments:
production: http://any-hosted-llama-stack.com

# `pagination` defines [pagination schemes] which provides a template to match
# endpoints and generate next-page and auto-pagination helpers in the SDKs.
pagination:
- name: datasets_iterrows
type: offset
request:
dataset_id:
type: string
start_index:
type: integer
x-stainless-pagination-property:
purpose: offset_count_param
limit:
type: integer
response:
data:
type: array
items:
type: object
next_index:
type: integer
x-stainless-pagination-property:
purpose: offset_count_start_field
- name: openai_cursor_page
type: cursor
request:
limit:
type: integer
after:
type: string
x-stainless-pagination-property:
purpose: next_cursor_param
response:
data:
type: array
items: {}
has_more:
type: boolean
last_id:
type: string
x-stainless-pagination-property:
purpose: next_cursor_field
# `resources` define the structure and organziation for your API, such as how
# methods and models are grouped together and accessed. See the [configuration
# guide] for more information.
#
# [configuration guide]:
# https://app.stainlessapi.com/docs/guides/configure#resources
resources:
$shared:
models:
agent_config: AgentConfig
interleaved_content_item: InterleavedContentItem
interleaved_content: InterleavedContent
param_type: ParamType
safety_violation: SafetyViolation
sampling_params: SamplingParams
scoring_result: ScoringResult
message: Message
user_message: UserMessage
completion_message: CompletionMessage
tool_response_message: ToolResponseMessage
system_message: SystemMessage
tool_call: ToolCall
query_result: RAGQueryResult
document: RAGDocument
query_config: RAGQueryConfig
response_format: ResponseFormat
toolgroups:
models:
tool_group: ToolGroup
list_tool_groups_response: ListToolGroupsResponse
methods:
register: post /v1/toolgroups
get: get /v1/toolgroups/{toolgroup_id}
list: get /v1/toolgroups
unregister: delete /v1/toolgroups/{toolgroup_id}
tools:
methods:
get: get /v1/tools/{tool_name}
list:
endpoint: get /v1/tools
paginated: false

tool_runtime:
models:
tool_def: ToolDef
tool_invocation_result: ToolInvocationResult
methods:
list_tools:
endpoint: get /v1/tool-runtime/list-tools
paginated: false
invoke_tool: post /v1/tool-runtime/invoke
subresources:
rag_tool:
methods:
insert: post /v1/tool-runtime/rag-tool/insert
query: post /v1/tool-runtime/rag-tool/query

responses:
models:
response_object_stream: OpenAIResponseObjectStream
response_object: OpenAIResponseObject
methods:
create:
type: http
endpoint: post /v1/responses
streaming:
stream_event_model: responses.response_object_stream
param_discriminator: stream
retrieve: get /v1/responses/{response_id}
list:
type: http
endpoint: get /v1/responses
delete:
type: http
endpoint: delete /v1/responses/{response_id}
subresources:
input_items:
methods:
list:
type: http
endpoint: get /v1/responses/{response_id}/input_items

conversations:
models:
conversation_object: Conversation
methods:
create:
type: http
endpoint: post /v1/conversations
retrieve: get /v1/conversations/{conversation_id}
update:
type: http
endpoint: post /v1/conversations/{conversation_id}
delete:
type: http
endpoint: delete /v1/conversations/{conversation_id}
subresources:
items:
methods:
get:
type: http
endpoint: get /v1/conversations/{conversation_id}/items/{item_id}
list:
type: http
endpoint: get /v1/conversations/{conversation_id}/items
create:
type: http
endpoint: post /v1/conversations/{conversation_id}/items

inspect:
models:
healthInfo: HealthInfo
providerInfo: ProviderInfo
routeInfo: RouteInfo
versionInfo: VersionInfo
methods:
health: get /v1/health
version: get /v1/version

embeddings:
models:
create_embeddings_response: OpenAIEmbeddingsResponse
methods:
create: post /v1/embeddings

chat:
models:
chat_completion_chunk: OpenAIChatCompletionChunk
subresources:
completions:
methods:
create:
type: http
endpoint: post /v1/chat/completions
streaming:
stream_event_model: chat.chat_completion_chunk
param_discriminator: stream
list:
type: http
endpoint: get /v1/chat/completions
retrieve:
type: http
endpoint: get /v1/chat/completions/{completion_id}
completions:
methods:
create:
type: http
endpoint: post /v1/completions
streaming:
param_discriminator: stream

vector_io:
models:
queryChunksResponse: QueryChunksResponse
methods:
insert: post /v1/vector-io/insert
query: post /v1/vector-io/query

vector_stores:
models:
vector_store: VectorStoreObject
list_vector_stores_response: VectorStoreListResponse
vector_store_delete_response: VectorStoreDeleteResponse
vector_store_search_response: VectorStoreSearchResponsePage
methods:
create: post /v1/vector_stores
list:
endpoint: get /v1/vector_stores
retrieve: get /v1/vector_stores/{vector_store_id}
update: post /v1/vector_stores/{vector_store_id}
delete: delete /v1/vector_stores/{vector_store_id}
search: post /v1/vector_stores/{vector_store_id}/search
subresources:
files:
models:
vector_store_file: VectorStoreFileObject
methods:
list: get /v1/vector_stores/{vector_store_id}/files
retrieve: get /v1/vector_stores/{vector_store_id}/files/{file_id}
update: post /v1/vector_stores/{vector_store_id}/files/{file_id}
delete: delete /v1/vector_stores/{vector_store_id}/files/{file_id}
create: post /v1/vector_stores/{vector_store_id}/files
content: get /v1/vector_stores/{vector_store_id}/files/{file_id}/content
file_batches:
models:
vector_store_file_batches: VectorStoreFileBatchObject
list_vector_store_files_in_batch_response: VectorStoreFilesListInBatchResponse
methods:
create: post /v1/vector_stores/{vector_store_id}/file_batches
retrieve: get /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}
list_files: get /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/files
cancel: post /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/cancel

models:
models:
model: Model
list_models_response: ListModelsResponse
methods:
retrieve: get /v1/models/{model_id}
list:
endpoint: get /v1/models
paginated: false
register: post /v1/models
unregister: delete /v1/models/{model_id}
subresources:
openai:
methods:
list:
endpoint: get /v1/models
paginated: false

providers:
models:
list_providers_response: ListProvidersResponse
methods:
list:
endpoint: get /v1/providers
paginated: false
retrieve: get /v1/providers/{provider_id}

routes:
models:
list_routes_response: ListRoutesResponse
methods:
list:
endpoint: get /v1/inspect/routes
paginated: false

moderations:
models:
create_response: ModerationObject
methods:
create: post /v1/moderations

safety:
models:
run_shield_response: RunShieldResponse
methods:
run_shield: post /v1/safety/run-shield

shields:
models:
shield: Shield
list_shields_response: ListShieldsResponse
methods:
retrieve: get /v1/shields/{identifier}
list:
endpoint: get /v1/shields
paginated: false
register: post /v1/shields
delete: delete /v1/shields/{identifier}

synthetic_data_generation:
models:
syntheticDataGenerationResponse: SyntheticDataGenerationResponse
methods:
generate: post /v1/synthetic-data-generation/generate

telemetry:
models:
span_with_status: SpanWithStatus
trace: Trace
query_spans_response: QuerySpansResponse
event: Event
query_condition: QueryCondition
methods:
query_traces:
endpoint: post /v1alpha/telemetry/traces
skip_test_reason: 'unsupported query params in java / kotlin'
get_span_tree: post /v1alpha/telemetry/spans/{span_id}/tree
query_spans:
endpoint: post /v1alpha/telemetry/spans
skip_test_reason: 'unsupported query params in java / kotlin'
query_metrics:
endpoint: post /v1alpha/telemetry/metrics/{metric_name}
skip_test_reason: 'unsupported query params in java / kotlin'
# log_event: post /v1alpha/telemetry/events
save_spans_to_dataset: post /v1alpha/telemetry/spans/export
get_span: get /v1alpha/telemetry/traces/{trace_id}/spans/{span_id}
get_trace: get /v1alpha/telemetry/traces/{trace_id}

scoring:
methods:
score: post /v1/scoring/score
score_batch: post /v1/scoring/score-batch
scoring_functions:
methods:
retrieve: get /v1/scoring-functions/{scoring_fn_id}
list:
endpoint: get /v1/scoring-functions
paginated: false
register: post /v1/scoring-functions
models:
scoring_fn: ScoringFn
scoring_fn_params: ScoringFnParams
list_scoring_functions_response: ListScoringFunctionsResponse

benchmarks:
methods:
retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}
list:
endpoint: get /v1alpha/eval/benchmarks
paginated: false
register: post /v1alpha/eval/benchmarks
|
||||
models:
|
||||
benchmark: Benchmark
|
||||
list_benchmarks_response: ListBenchmarksResponse
|
||||
|
||||
files:
|
||||
methods:
|
||||
create: post /v1/files
|
||||
list: get /v1/files
|
||||
retrieve: get /v1/files/{file_id}
|
||||
delete: delete /v1/files/{file_id}
|
||||
content: get /v1/files/{file_id}/content
|
||||
models:
|
||||
file: OpenAIFileObject
|
||||
list_files_response: ListOpenAIFileResponse
|
||||
delete_file_response: OpenAIFileDeleteResponse
|
||||
|
||||
alpha:
|
||||
subresources:
|
||||
inference:
|
||||
methods:
|
||||
rerank: post /v1alpha/inference/rerank
|
||||
|
||||
post_training:
|
||||
models:
|
||||
algorithm_config: AlgorithmConfig
|
||||
post_training_job: PostTrainingJob
|
||||
list_post_training_jobs_response: ListPostTrainingJobsResponse
|
||||
methods:
|
||||
preference_optimize: post /v1alpha/post-training/preference-optimize
|
||||
supervised_fine_tune: post /v1alpha/post-training/supervised-fine-tune
|
||||
subresources:
|
||||
job:
|
||||
methods:
|
||||
artifacts: get /v1alpha/post-training/job/artifacts
|
||||
cancel: post /v1alpha/post-training/job/cancel
|
||||
status: get /v1alpha/post-training/job/status
|
||||
list:
|
||||
endpoint: get /v1alpha/post-training/jobs
|
||||
paginated: false
|
||||
|
||||
eval:
|
||||
methods:
|
||||
evaluate_rows: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations
|
||||
run_eval: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs
|
||||
evaluate_rows_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations
|
||||
run_eval_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs
|
||||
|
||||
subresources:
|
||||
jobs:
|
||||
methods:
|
||||
cancel: delete /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}
|
||||
status: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}
|
||||
retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result
|
||||
models:
|
||||
evaluate_response: EvaluateResponse
|
||||
benchmark_config: BenchmarkConfig
|
||||
job: Job
|
||||
|
||||
agents:
|
||||
methods:
|
||||
create: post /v1alpha/agents
|
||||
list: get /v1alpha/agents
|
||||
retrieve: get /v1alpha/agents/{agent_id}
|
||||
delete: delete /v1alpha/agents/{agent_id}
|
||||
models:
|
||||
inference_step: InferenceStep
|
||||
tool_execution_step: ToolExecutionStep
|
||||
tool_response: ToolResponse
|
||||
shield_call_step: ShieldCallStep
|
||||
memory_retrieval_step: MemoryRetrievalStep
|
||||
subresources:
|
||||
session:
|
||||
models:
|
||||
session: Session
|
||||
methods:
|
||||
list: get /v1alpha/agents/{agent_id}/sessions
|
||||
create: post /v1alpha/agents/{agent_id}/session
|
||||
delete: delete /v1alpha/agents/{agent_id}/session/{session_id}
|
||||
retrieve: get /v1alpha/agents/{agent_id}/session/{session_id}
|
||||
steps:
|
||||
methods:
|
||||
retrieve: get /v1alpha/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}
|
||||
turn:
|
||||
models:
|
||||
turn: Turn
|
||||
turn_response_event: AgentTurnResponseEvent
|
||||
agent_turn_response_stream_chunk: AgentTurnResponseStreamChunk
|
||||
methods:
|
||||
create:
|
||||
type: http
|
||||
endpoint: post /v1alpha/agents/{agent_id}/session/{session_id}/turn
|
||||
streaming:
|
||||
stream_event_model: alpha.agents.turn.agent_turn_response_stream_chunk
|
||||
param_discriminator: stream
|
||||
retrieve: get /v1alpha/agents/{agent_id}/session/{session_id}/turn/{turn_id}
|
||||
resume:
|
||||
type: http
|
||||
endpoint: post /v1alpha/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume
|
||||
streaming:
|
||||
stream_event_model: alpha.agents.turn.agent_turn_response_stream_chunk
|
||||
param_discriminator: stream
|
||||
|
||||
beta:
|
||||
subresources:
|
||||
datasets:
|
||||
models:
|
||||
list_datasets_response: ListDatasetsResponse
|
||||
methods:
|
||||
register: post /v1beta/datasets
|
||||
retrieve: get /v1beta/datasets/{dataset_id}
|
||||
list:
|
||||
endpoint: get /v1beta/datasets
|
||||
paginated: false
|
||||
unregister: delete /v1beta/datasets/{dataset_id}
|
||||
iterrows: get /v1beta/datasetio/iterrows/{dataset_id}
|
||||
appendrows: post /v1beta/datasetio/append-rows/{dataset_id}
|
||||
|
||||
|
||||
settings:
|
||||
license: MIT
|
||||
unwrap_response_fields: [ data ]
|
||||
|
||||
openapi:
|
||||
transformations:
|
||||
- command: renameValue
|
||||
reason: pydantic reserved name
|
||||
args:
|
||||
filter:
|
||||
only:
|
||||
- '$.components.schemas.InferenceStep.properties.model_response'
|
||||
rename:
|
||||
python:
|
||||
property_name: 'inference_model_response'
|
||||
|
||||
# - command: renameValue
|
||||
# reason: pydantic reserved name
|
||||
# args:
|
||||
# filter:
|
||||
# only:
|
||||
# - '$.components.schemas.Model.properties.model_type'
|
||||
# rename:
|
||||
# python:
|
||||
# property_name: 'type'
|
||||
- command: mergeObject
|
||||
reason: Better return_type using enum
|
||||
args:
|
||||
target:
|
||||
- '$.components.schemas'
|
||||
object:
|
||||
ReturnType:
|
||||
additionalProperties: false
|
||||
properties:
|
||||
type:
|
||||
enum:
|
||||
- string
|
||||
- number
|
||||
- boolean
|
||||
- array
|
||||
- object
|
||||
- json
|
||||
- union
|
||||
- chat_completion_input
|
||||
- completion_input
|
||||
- agent_turn_input
|
||||
required:
|
||||
- type
|
||||
type: object
|
||||
- command: replaceProperties
|
||||
reason: Replace return type properties with better model (see above)
|
||||
args:
|
||||
filter:
|
||||
only:
|
||||
- '$.components.schemas.ScoringFn.properties.return_type'
|
||||
- '$.components.schemas.RegisterScoringFunctionRequest.properties.return_type'
|
||||
value:
|
||||
$ref: '#/components/schemas/ReturnType'
|
||||
- command: oneOfToAnyOf
|
||||
reason: Prism (mock server) doesn't like one of our requests as it technically matches multiple variants
|
||||
- reason: For better names
|
||||
command: extractToRefs
|
||||
args:
|
||||
ref:
|
||||
target: '$.components.schemas.ToolCallDelta.properties.tool_call'
|
||||
name: '#/components/schemas/ToolCallOrString'
|
||||
|
||||
# `readme` is used to configure the code snippets that will be rendered in the
|
||||
# README.md of various SDKs. In particular, you can change the `headline`
|
||||
# snippet's endpoint and the arguments to call it with.
|
||||
readme:
|
||||
example_requests:
|
||||
default:
|
||||
type: request
|
||||
endpoint: post /v1/chat/completions
|
||||
params: &ref_0 {}
|
||||
headline:
|
||||
type: request
|
||||
endpoint: post /v1/models
|
||||
params: *ref_0
|
||||
pagination:
|
||||
type: request
|
||||
endpoint: post /v1/chat/completions
|
||||
params: {}
|
File diff suppressed because it is too large
Load diff
|
@ -1,137 +0,0 @@
|
|||
# syntax=docker/dockerfile:1.6
|
||||
#
|
||||
# This Dockerfile is used to build the Llama Stack container image.
|
||||
# Example:
|
||||
# docker build \
|
||||
# -f containers/Containerfile \
|
||||
# --build-arg DISTRO_NAME=starter \
|
||||
# --tag llama-stack:starter .
|
||||
|
||||
ARG BASE_IMAGE=python:3.12-slim
|
||||
FROM ${BASE_IMAGE}
|
||||
|
||||
ARG INSTALL_MODE="pypi"
|
||||
ARG LLAMA_STACK_DIR="/workspace"
|
||||
ARG LLAMA_STACK_CLIENT_DIR=""
|
||||
ARG PYPI_VERSION=""
|
||||
ARG TEST_PYPI_VERSION=""
|
||||
ARG KEEP_WORKSPACE=""
|
||||
ARG DISTRO_NAME="starter"
|
||||
ARG RUN_CONFIG_PATH=""
|
||||
ARG UV_HTTP_TIMEOUT=500
|
||||
ENV UV_HTTP_TIMEOUT=${UV_HTTP_TIMEOUT}
|
||||
ENV PYTHONDONTWRITEBYTECODE=1
|
||||
ENV PIP_DISABLE_PIP_VERSION_CHECK=1
|
||||
WORKDIR /app
|
||||
|
||||
RUN set -eux; \
|
||||
if command -v dnf >/dev/null 2>&1; then \
|
||||
dnf -y update && \
|
||||
dnf install -y iputils git net-tools wget \
|
||||
vim-minimal python3.12 python3.12-pip python3.12-wheel \
|
||||
python3.12-setuptools python3.12-devel gcc gcc-c++ make && \
|
||||
ln -sf /usr/bin/pip3.12 /usr/local/bin/pip && \
|
||||
ln -sf /usr/bin/python3.12 /usr/local/bin/python && \
|
||||
dnf clean all; \
|
||||
elif command -v apt-get >/dev/null 2>&1; then \
|
||||
apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
iputils-ping net-tools iproute2 dnsutils telnet \
|
||||
curl wget git procps psmisc lsof traceroute bubblewrap \
|
||||
gcc g++ && \
|
||||
rm -rf /var/lib/apt/lists/*; \
|
||||
else \
|
||||
echo "Unsupported base image: expected dnf or apt-get" >&2; \
|
||||
exit 1; \
|
||||
fi
|
||||
|
||||
RUN pip install --no-cache-dir uv
|
||||
ENV UV_SYSTEM_PYTHON=1
|
||||
|
||||
ENV INSTALL_MODE=${INSTALL_MODE}
|
||||
ENV LLAMA_STACK_DIR=${LLAMA_STACK_DIR}
|
||||
ENV LLAMA_STACK_CLIENT_DIR=${LLAMA_STACK_CLIENT_DIR}
|
||||
ENV PYPI_VERSION=${PYPI_VERSION}
|
||||
ENV TEST_PYPI_VERSION=${TEST_PYPI_VERSION}
|
||||
ENV KEEP_WORKSPACE=${KEEP_WORKSPACE}
|
||||
ENV DISTRO_NAME=${DISTRO_NAME}
|
||||
ENV RUN_CONFIG_PATH=${RUN_CONFIG_PATH}
|
||||
|
||||
# Copy the repository so editable installs and run configurations are available.
|
||||
COPY . /workspace
|
||||
|
||||
# Install the client package if it is provided
|
||||
# NOTE: this is installed before llama-stack since llama-stack depends on llama-stack-client-python
|
||||
RUN set -eux; \
|
||||
if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then \
|
||||
if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ]; then \
|
||||
echo "LLAMA_STACK_CLIENT_DIR is set but $LLAMA_STACK_CLIENT_DIR does not exist" >&2; \
|
||||
exit 1; \
|
||||
fi; \
|
||||
uv pip install --no-cache-dir -e "$LLAMA_STACK_CLIENT_DIR"; \
|
||||
fi;
|
||||
|
||||
# Install llama-stack
|
||||
RUN set -eux; \
|
||||
if [ "$INSTALL_MODE" = "editable" ]; then \
|
||||
if [ ! -d "$LLAMA_STACK_DIR" ]; then \
|
||||
echo "INSTALL_MODE=editable requires LLAMA_STACK_DIR to point to a directory inside the build context" >&2; \
|
||||
exit 1; \
|
||||
fi; \
|
||||
uv pip install --no-cache-dir -e "$LLAMA_STACK_DIR"; \
|
||||
elif [ "$INSTALL_MODE" = "test-pypi" ]; then \
|
||||
uv pip install --no-cache-dir fastapi libcst; \
|
||||
if [ -n "$TEST_PYPI_VERSION" ]; then \
|
||||
uv pip install --no-cache-dir --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match "llama-stack==$TEST_PYPI_VERSION"; \
|
||||
else \
|
||||
uv pip install --no-cache-dir --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match llama-stack; \
|
||||
fi; \
|
||||
else \
|
||||
if [ -n "$PYPI_VERSION" ]; then \
|
||||
uv pip install --no-cache-dir "llama-stack==$PYPI_VERSION"; \
|
||||
else \
|
||||
uv pip install --no-cache-dir llama-stack; \
|
||||
fi; \
|
||||
fi;
|
||||
|
||||
# Install the dependencies for the distribution
|
||||
RUN set -eux; \
|
||||
if [ -z "$DISTRO_NAME" ]; then \
|
||||
echo "DISTRO_NAME must be provided" >&2; \
|
||||
exit 1; \
|
||||
fi; \
|
||||
deps="$(llama stack list-deps "$DISTRO_NAME")"; \
|
||||
if [ -n "$deps" ]; then \
|
||||
printf '%s\n' "$deps" | xargs -L1 uv pip install --no-cache-dir; \
|
||||
fi
|
||||
|
||||
# Cleanup
|
||||
RUN set -eux; \
|
||||
pip uninstall -y uv; \
|
||||
should_remove=1; \
|
||||
if [ -n "$KEEP_WORKSPACE" ]; then should_remove=0; fi; \
|
||||
if [ "$INSTALL_MODE" = "editable" ]; then should_remove=0; fi; \
|
||||
case "$RUN_CONFIG_PATH" in \
|
||||
/workspace*) should_remove=0 ;; \
|
||||
esac; \
|
||||
if [ "$should_remove" -eq 1 ] && [ -d /workspace ]; then rm -rf /workspace; fi
|
||||
|
||||
RUN cat <<'EOF' >/usr/local/bin/llama-stack-entrypoint.sh
|
||||
#!/bin/sh
|
||||
set -e
|
||||
|
||||
if [ -n "$RUN_CONFIG_PATH" ] && [ -f "$RUN_CONFIG_PATH" ]; then
|
||||
exec llama stack run "$RUN_CONFIG_PATH" "$@"
|
||||
fi
|
||||
|
||||
if [ -n "$DISTRO_NAME" ]; then
|
||||
exec llama stack run "$DISTRO_NAME" "$@"
|
||||
fi
|
||||
|
||||
exec llama stack run "$@"
|
||||
EOF
|
||||
RUN chmod +x /usr/local/bin/llama-stack-entrypoint.sh
|
||||
|
||||
RUN mkdir -p /.llama /.cache && chmod -R g+rw /app /.llama /.cache
|
||||
|
||||
ENTRYPOINT ["/usr/local/bin/llama-stack-entrypoint.sh"]
|
|
@ -51,8 +51,8 @@ device: cpu
|
|||
You can access the HuggingFace trainer via the `starter` distribution:
|
||||
|
||||
```bash
|
||||
llama stack list-deps starter | xargs -L1 uv pip install
|
||||
llama stack run starter
|
||||
llama stack build --distro starter --image-type venv
|
||||
llama stack run --image-type venv ~/.llama/distributions/starter/starter-run.yaml
|
||||
```
|
||||
|
||||
### Usage Example
|
||||
|
|
|
@ -175,7 +175,8 @@ llama-stack-client benchmarks register \
|
|||
**1. Start the Llama Stack API Server**
|
||||
|
||||
```bash
|
||||
llama stack list-deps together | xargs -L1 uv pip install
|
||||
# Build and run a distribution (example: together)
|
||||
llama stack build --distro together --image-type venv
|
||||
llama stack run together
|
||||
```
|
||||
|
||||
|
@ -208,7 +209,7 @@ The playground works with any Llama Stack distribution. Popular options include:
|
|||
<TabItem value="together" label="Together AI">
|
||||
|
||||
```bash
|
||||
llama stack list-deps together | xargs -L1 uv pip install
|
||||
llama stack build --distro together --image-type venv
|
||||
llama stack run together
|
||||
```
|
||||
|
||||
|
@ -221,7 +222,7 @@ llama stack run together
|
|||
<TabItem value="ollama" label="Ollama (Local)">
|
||||
|
||||
```bash
|
||||
llama stack list-deps ollama | xargs -L1 uv pip install
|
||||
llama stack build --distro ollama --image-type venv
|
||||
llama stack run ollama
|
||||
```
|
||||
|
||||
|
@ -234,7 +235,7 @@ llama stack run ollama
|
|||
<TabItem value="meta-reference" label="Meta Reference">
|
||||
|
||||
```bash
|
||||
llama stack list-deps meta-reference | xargs -L1 uv pip install
|
||||
llama stack build --distro meta-reference --image-type venv
|
||||
llama stack run meta-reference
|
||||
```
|
||||
|
||||
|
|
|
@ -10,114 +10,358 @@ import TabItem from '@theme/TabItem';
|
|||
|
||||
# Retrieval Augmented Generation (RAG)
|
||||
|
||||
|
||||
RAG enables your applications to reference and recall information from external documents. Llama Stack makes Agentic RAG available through OpenAI's Responses API.
|
||||
|
||||
## Quick Start
|
||||
|
||||
### 1. Start the Server
|
||||
|
||||
In one terminal, start the Llama Stack server:
|
||||
|
||||
```bash
|
||||
llama stack list-deps starter | xargs -L1 uv pip install
|
||||
llama stack run starter
|
||||
```
|
||||
|
||||
### 2. Connect with OpenAI Client
|
||||
|
||||
In another terminal, use the standard OpenAI client with the Responses API:
|
||||
|
||||
```python
|
||||
import io, requests
|
||||
from openai import OpenAI
|
||||
|
||||
url = "https://www.paulgraham.com/greatwork.html"
|
||||
client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")
|
||||
|
||||
# Create vector store - auto-detects default embedding model
|
||||
vs = client.vector_stores.create()
|
||||
|
||||
response = requests.get(url)
|
||||
pseudo_file = io.BytesIO(str(response.content).encode('utf-8'))
|
||||
file_id = client.files.create(file=(url, pseudo_file, "text/html"), purpose="assistants").id
|
||||
client.vector_stores.files.create(vector_store_id=vs.id, file_id=file_id)
|
||||
|
||||
resp = client.responses.create(
|
||||
model="gpt-4o",
|
||||
input="How do you do great work? Use the existing knowledge_search tool.",
|
||||
tools=[{"type": "file_search", "vector_store_ids": [vs.id]}],
|
||||
include=["file_search_call.results"],
|
||||
)
|
||||
|
||||
print(resp.output[-1].content[-1].text)
|
||||
```
|
||||
Which should give output like:
|
||||
```
|
||||
Doing great work is about more than just hard work and ambition; it involves combining several elements:
|
||||
|
||||
1. **Pursue What Excites You**: Engage in projects that are both ambitious and exciting to you. It's important to work on something you have a natural aptitude for and a deep interest in.
|
||||
|
||||
2. **Explore and Discover**: Great work often feels like a blend of discovery and creation. Focus on seeing possibilities and let ideas take their natural shape, rather than just executing a plan.
|
||||
|
||||
3. **Be Bold Yet Flexible**: Take bold steps in your work without over-planning. An adaptable approach that evolves with new ideas can often lead to breakthroughs.
|
||||
|
||||
4. **Work on Your Own Projects**: Develop a habit of working on projects of your own choosing, as these often lead to great achievements. These should be projects you find exciting and that challenge you intellectually.
|
||||
|
||||
5. **Be Earnest and Authentic**: Approach your work with earnestness and authenticity. Trying to impress others with affectation can be counterproductive, as genuine effort and intellectual honesty lead to better work outcomes.
|
||||
|
||||
6. **Build a Supportive Environment**: Work alongside great colleagues who inspire you and enhance your work. Surrounding yourself with motivating individuals creates a fertile environment for great work.
|
||||
|
||||
7. **Maintain High Morale**: High morale significantly impacts your ability to do great work. Stay optimistic and protect your mental well-being to maintain progress and momentum.
|
||||
|
||||
8. **Balance**: While hard work is essential, overworking can lead to diminishing returns. Balance periods of intensive work with rest to sustain productivity over time.
|
||||
|
||||
This approach shows that great work is less about following a strict formula and more about aligning your interests, ambition, and environment to foster creativity and innovation.
|
||||
```
|
||||
RAG enables your applications to reference and recall information from previous interactions or external documents.
|
||||
|
||||
## Architecture Overview
|
||||
|
||||
Llama Stack provides OpenAI-compatible RAG capabilities through:
|
||||
Llama Stack organizes the APIs that enable RAG into three layers:
|
||||
|
||||
- **Vector Stores API**: OpenAI-compatible vector storage with automatic embedding model detection
|
||||
- **Files API**: Document upload and processing using OpenAI's file format
|
||||
- **Responses API**: Enhanced chat completions with agentic tool calling via file search
|
||||
1. **Lower-Level APIs**: Deal with raw storage and retrieval. These include Vector IO, KeyValue IO (coming soon) and Relational IO (also coming soon)
|
||||
2. **RAG Tool**: A first-class tool as part of the [Tools API](./tools) that allows you to ingest documents (from URLs, files, etc) with various chunking strategies and query them smartly
|
||||
3. **Agents API**: The top-level [Agents API](./agent) that allows you to create agents that can use the tools to answer questions, perform tasks, and more
|
||||
|
||||
## Configuring Default Embedding Models
|
||||

|
||||
|
||||
To enable automatic vector store creation without specifying embedding models, configure a default embedding model in your run.yaml like so:
|
||||
The RAG system uses lower-level storage for different types of data:
|
||||
- **Vector IO**: For semantic search and retrieval
|
||||
- **Key-Value and Relational IO**: For structured data storage
|
||||
|
||||
```yaml
|
||||
vector_stores:
|
||||
default_provider_id: faiss
|
||||
default_embedding_model:
|
||||
provider_id: sentence-transformers
|
||||
model_id: nomic-ai/nomic-embed-text-v1.5
|
||||
```
|
||||
:::info[Future Storage Types]
|
||||
We may add more storage types like Graph IO in the future.
|
||||
:::
|
||||
|
||||
With this configuration:
|
||||
- `client.vector_stores.create()` works without requiring embedding model or provider parameters
|
||||
- The system automatically uses the default vector store provider (`faiss`) when multiple providers are available
|
||||
- The system automatically uses the default embedding model (`sentence-transformers/nomic-ai/nomic-embed-text-v1.5`) for any newly created vector store
|
||||
- The `default_provider_id` specifies which vector storage backend to use
|
||||
- The `default_embedding_model` specifies both the inference provider and model for embeddings
|
||||
## Setting up Vector Databases
|
||||
|
||||
## Vector Store Operations
|
||||
For this guide, we will use [Ollama](https://ollama.com/) as the inference provider. Ollama is an LLM runtime that allows you to run Llama models locally.
|
||||
|
||||
### Creating Vector Stores
|
||||
|
||||
You can create vector stores with automatic or explicit embedding model selection:
|
||||
Here's how to set up a vector database for RAG:
|
||||
|
||||
```python
|
||||
# Automatic - uses default configured embedding model and vector store provider
|
||||
vs = client.vector_stores.create()
|
||||
# Create HTTP client
|
||||
import os
|
||||
from llama_stack_client import LlamaStackClient
|
||||
|
||||
# Explicit - specify embedding model and/or provider when you need specific ones
|
||||
vs = client.vector_stores.create(
|
||||
extra_body={
|
||||
"provider_id": "faiss", # Optional: specify vector store provider
|
||||
"embedding_model": "sentence-transformers/nomic-ai/nomic-embed-text-v1.5",
|
||||
"embedding_dimension": 768 # Optional: will be auto-detected if not provided
|
||||
}
|
||||
client = LlamaStackClient(base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}")
|
||||
|
||||
# Register a vector database
|
||||
vector_db_id = "my_documents"
|
||||
response = client.vector_dbs.register(
|
||||
vector_db_id=vector_db_id,
|
||||
embedding_model="all-MiniLM-L6-v2",
|
||||
embedding_dimension=384,
|
||||
provider_id="faiss",
|
||||
)
|
||||
```
|
||||
|
||||
## Document Ingestion
|
||||
|
||||
You can ingest documents into the vector database using two methods: directly inserting pre-chunked documents or using the RAG Tool.
|
||||
|
||||
### Direct Document Insertion
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="basic" label="Basic Insertion">
|
||||
|
||||
```python
|
||||
# You can insert a pre-chunked document directly into the vector db
|
||||
chunks = [
|
||||
{
|
||||
"content": "Your document text here",
|
||||
"mime_type": "text/plain",
|
||||
"metadata": {
|
||||
"document_id": "doc1",
|
||||
"author": "Jane Doe",
|
||||
},
|
||||
},
|
||||
]
|
||||
client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="embeddings" label="With Precomputed Embeddings">
|
||||
|
||||
If you decide to precompute embeddings for your documents, you can insert them directly into the vector database by including the embedding vectors in the chunk data. This is useful if you have a separate embedding service or if you want to customize the ingestion process.
|
||||
|
||||
```python
|
||||
chunks_with_embeddings = [
|
||||
{
|
||||
"content": "First chunk of text",
|
||||
"mime_type": "text/plain",
|
||||
"embedding": [0.1, 0.2, 0.3, ...], # Your precomputed embedding vector
|
||||
"metadata": {"document_id": "doc1", "section": "introduction"},
|
||||
},
|
||||
{
|
||||
"content": "Second chunk of text",
|
||||
"mime_type": "text/plain",
|
||||
"embedding": [0.2, 0.3, 0.4, ...], # Your precomputed embedding vector
|
||||
"metadata": {"document_id": "doc1", "section": "methodology"},
|
||||
},
|
||||
]
|
||||
client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks_with_embeddings)
|
||||
```
|
||||
|
||||
:::warning[Embedding Dimensions]
|
||||
When providing precomputed embeddings, ensure the embedding dimension matches the `embedding_dimension` specified when registering the vector database.
|
||||
:::
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
### Document Retrieval
|
||||
|
||||
You can query the vector database to retrieve documents based on their embeddings.
|
||||
|
||||
```python
|
||||
# You can then query for these chunks
|
||||
chunks_response = client.vector_io.query(
|
||||
vector_db_id=vector_db_id,
|
||||
query="What do you know about..."
|
||||
)
|
||||
```
|
||||
|
||||
## Using the RAG Tool
|
||||
|
||||
:::danger[Deprecation Notice]
|
||||
The RAG Tool is being deprecated in favor of directly using the OpenAI-compatible Search API. We recommend migrating to the OpenAI APIs for better compatibility and future support.
|
||||
:::
|
||||
|
||||
A better way to ingest documents is to use the RAG Tool. This tool allows you to ingest documents from URLs, files, etc. and automatically chunks them into smaller pieces. More examples for how to format a RAGDocument can be found in the [appendix](#more-ragdocument-examples).
|
||||
|
||||
### OpenAI API Integration & Migration
|
||||
|
||||
The RAG tool has been updated to use OpenAI-compatible APIs. This provides several benefits:
|
||||
|
||||
- **Files API Integration**: Documents are now uploaded using OpenAI's file upload endpoints
|
||||
- **Vector Stores API**: Vector storage operations use OpenAI's vector store format with configurable chunking strategies
|
||||
- **Error Resilience**: When processing multiple documents, individual failures are logged but don't crash the operation. Failed documents are skipped while successful ones continue processing.
|
||||
|
||||
### Migration Path
|
||||
|
||||
We recommend migrating to the OpenAI-compatible Search API for:
|
||||
|
||||
1. **Better OpenAI Ecosystem Integration**: Direct compatibility with OpenAI tools and workflows including the Responses API
|
||||
2. **Future-Proof**: Continued support and feature development
|
||||
3. **Full OpenAI Compatibility**: Vector Stores, Files, and Search APIs are fully compatible with OpenAI's Responses API
|
||||
|
||||
The OpenAI APIs are used under the hood, so you can continue to use your existing RAG Tool code with minimal changes. However, we recommend updating your code to use the new OpenAI-compatible APIs for better long-term support. If any documents fail to process, they will be logged in the response but will not cause the entire operation to fail.
|
||||
|
||||
### RAG Tool Example
|
||||
|
||||
```python
|
||||
from llama_stack_client import RAGDocument
|
||||
|
||||
urls = ["memory_optimizations.rst", "chat.rst", "llama3.rst"]
|
||||
documents = [
|
||||
RAGDocument(
|
||||
document_id=f"num-{i}",
|
||||
content=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}",
|
||||
mime_type="text/plain",
|
||||
metadata={},
|
||||
)
|
||||
for i, url in enumerate(urls)
|
||||
]
|
||||
|
||||
client.tool_runtime.rag_tool.insert(
|
||||
documents=documents,
|
||||
vector_db_id=vector_db_id,
|
||||
chunk_size_in_tokens=512,
|
||||
)
|
||||
|
||||
# Query documents
|
||||
results = client.tool_runtime.rag_tool.query(
|
||||
vector_db_ids=[vector_db_id],
|
||||
content="What do you know about...",
|
||||
)
|
||||
```
|
||||
|
||||
### Custom Context Configuration
|
||||
|
||||
You can configure how the RAG tool adds metadata to the context if you find it useful for your application:
|
||||
|
||||
```python
|
||||
# Query documents with custom template
|
||||
results = client.tool_runtime.rag_tool.query(
|
||||
vector_db_ids=[vector_db_id],
|
||||
content="What do you know about...",
|
||||
query_config={
|
||||
"chunk_template": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n",
|
||||
},
|
||||
)
|
||||
```
|
||||
|
||||
## Building RAG-Enhanced Agents
|
||||
|
||||
One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example:
|
||||
|
||||
### Agent with Knowledge Search
|
||||
|
||||
```python
|
||||
from llama_stack_client import Agent
|
||||
|
||||
# Create agent with memory
|
||||
agent = Agent(
|
||||
client,
|
||||
model="meta-llama/Llama-3.3-70B-Instruct",
|
||||
instructions="You are a helpful assistant",
|
||||
tools=[
|
||||
{
|
||||
"name": "builtin::rag/knowledge_search",
|
||||
"args": {
|
||||
"vector_db_ids": [vector_db_id],
|
||||
# Defaults
|
||||
"query_config": {
|
||||
"chunk_size_in_tokens": 512,
|
||||
"chunk_overlap_in_tokens": 0,
|
||||
"chunk_template": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n",
|
||||
},
|
||||
},
|
||||
}
|
||||
],
|
||||
)
|
||||
session_id = agent.create_session("rag_session")
|
||||
|
||||
# Ask questions about documents in the vector db, and the agent will query the db to answer the question.
|
||||
response = agent.create_turn(
|
||||
messages=[{"role": "user", "content": "How to optimize memory in PyTorch?"}],
|
||||
session_id=session_id,
|
||||
)
|
||||
```
|
||||
|
||||
:::tip[Agent Instructions]
|
||||
The `instructions` field in the `AgentConfig` can be used to guide the agent's behavior. It is important to experiment with different instructions to see what works best for your use case.
|
||||
:::
|
||||
|
||||
### Document-Aware Conversations
|
||||
|
||||
You can also pass documents along with the user's message and ask questions about them:
|
||||
|
||||
```python
|
||||
# Initial document ingestion
|
||||
response = agent.create_turn(
|
||||
messages=[
|
||||
{"role": "user", "content": "I am providing some documents for reference."}
|
||||
],
|
||||
documents=[
|
||||
{
|
||||
"content": "https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/memory_optimizations.rst",
|
||||
"mime_type": "text/plain",
|
||||
}
|
||||
],
|
||||
session_id=session_id,
|
||||
)
|
||||
|
||||
# Query with RAG
|
||||
response = agent.create_turn(
|
||||
messages=[{"role": "user", "content": "What are the key topics in the documents?"}],
|
||||
session_id=session_id,
|
||||
)
|
||||
```
|
||||
|
||||
### Viewing Agent Responses
|
||||
|
||||
You can print the response with the following:
|
||||
|
||||
```python
|
||||
from llama_stack_client import AgentEventLogger
|
||||
|
||||
for log in AgentEventLogger().log(response):
|
||||
log.print()
|
||||
```
|
||||
|
||||
## Vector Database Management
|
||||
|
||||
### Unregistering Vector DBs
|
||||
|
||||
If you need to clean up and unregister vector databases, you can do so as follows:
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="single" label="Single Database">
|
||||
|
||||
```python
|
||||
# Unregister a specified vector database
|
||||
vector_db_id = "my_vector_db_id"
|
||||
print(f"Unregistering vector database: {vector_db_id}")
|
||||
client.vector_dbs.unregister(vector_db_id=vector_db_id)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="all" label="All Databases">
|
||||
|
||||
```python
|
||||
# Unregister all vector databases
|
||||
for vector_db_id in client.vector_dbs.list():
|
||||
print(f"Unregistering vector database: {vector_db_id.identifier}")
|
||||
client.vector_dbs.unregister(vector_db_id=vector_db_id.identifier)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Best Practices
|
||||
|
||||
### 🎯 **Document Chunking**
|
||||
- Use appropriate chunk sizes (512 tokens is often a good starting point)
|
||||
- Consider overlap between chunks for better context preservation
|
||||
- Experiment with different chunking strategies for your content type
|
||||
|
||||
### 🔍 **Embedding Strategy**
|
||||
- Choose embedding models that match your domain
|
||||
- Consider the trade-off between embedding dimension and performance
|
||||
- Test different embedding models for your specific use case
|
||||
|
||||
### 📊 **Query Optimization**
|
||||
- Use specific, well-formed queries for better retrieval
|
||||
- Experiment with different search strategies
|
||||
- Consider hybrid approaches (keyword + semantic search)
|
||||
|
||||
### 🛡️ **Error Handling**
|
||||
- Implement proper error handling for failed document processing
|
||||
- Monitor ingestion success rates
|
||||
- Have fallback strategies for retrieval failures
|
||||
|
||||
## Appendix
|
||||
|
||||
### More RAGDocument Examples
|
||||
|
||||
Here are various ways to create RAGDocument objects for different content types:
|
||||
|
||||
```python
|
||||
from llama_stack_client import RAGDocument
|
||||
import base64
|
||||
|
||||
# File URI
|
||||
RAGDocument(document_id="num-0", content={"uri": "file://path/to/file"})
|
||||
|
||||
# Plain text
|
||||
RAGDocument(document_id="num-1", content="plain text")
|
||||
|
||||
# Explicit text input
|
||||
RAGDocument(
|
||||
document_id="num-2",
|
||||
content={
|
||||
"type": "text",
|
||||
"text": "plain text input",
|
||||
}, # for inputs that should be treated as text explicitly
|
||||
)
|
||||
|
||||
# Image from URL
|
||||
RAGDocument(
|
||||
document_id="num-3",
|
||||
content={
|
||||
"type": "image",
|
||||
"image": {"url": {"uri": "https://mywebsite.com/image.jpg"}},
|
||||
},
|
||||
)
|
||||
|
||||
# Base64 encoded image
|
||||
B64_ENCODED_IMAGE = base64.b64encode(
|
||||
requests.get(
|
||||
"https://raw.githubusercontent.com/meta-llama/llama-stack/refs/heads/main/docs/_static/llama-stack.png"
|
||||
).content
|
||||
)
|
||||
RAGDocument(
|
||||
document_id="num-4",
|
||||
content={"type": "image", "image": {"data": B64_ENCODED_IMAGE}},
|
||||
)
|
||||
```
|
||||
For more strongly typed interaction use the typed dicts found [here](https://github.com/meta-llama/llama-stack-client-python/blob/38cd91c9e396f2be0bec1ee96a19771582ba6f17/src/llama_stack_client/types/shared_params/document.py).
|
||||
|
|
|
@ -10,8 +10,58 @@ import TabItem from '@theme/TabItem';
|
|||
|
||||
# Telemetry
|
||||
|
||||
The Llama Stack uses OpenTelemetry to provide comprehensive tracing, metrics, and logging capabilities.
|
||||
The Llama Stack telemetry system provides comprehensive tracing, metrics, and logging capabilities. It supports multiple sink types including OpenTelemetry, SQLite, and Console output for complete observability of your AI applications.
|
||||
|
||||
## Event Types
|
||||
|
||||
The telemetry system supports three main types of events:
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="unstructured" label="Unstructured Logs">
|
||||
|
||||
Free-form log messages with severity levels for general application logging:
|
||||
|
||||
```python
|
||||
unstructured_log_event = UnstructuredLogEvent(
|
||||
message="This is a log message",
|
||||
severity=LogSeverity.INFO
|
||||
)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="metrics" label="Metric Events">
|
||||
|
||||
Numerical measurements with units for tracking performance and usage:
|
||||
|
||||
```python
|
||||
metric_event = MetricEvent(
|
||||
metric="my_metric",
|
||||
value=10,
|
||||
unit="count"
|
||||
)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="structured" label="Structured Logs">
|
||||
|
||||
System events like span start/end that provide structured operation tracking:
|
||||
|
||||
```python
|
||||
structured_log_event = SpanStartPayload(
|
||||
name="my_span",
|
||||
parent_span_id="parent_span_id"
|
||||
)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Spans and Traces
|
||||
|
||||
- **Spans**: Represent individual operations with timing information and hierarchical relationships
|
||||
- **Traces**: Collections of related spans that form a complete request flow across your application
|
||||
|
||||
This hierarchical structure allows you to understand the complete execution path of requests through your Llama Stack application.
|
||||
|
||||
## Automatic Metrics Generation
|
||||
|
||||
|
@ -79,6 +129,21 @@ Send events to an OpenTelemetry Collector for integration with observability pla
|
|||
- Compatible with all OpenTelemetry collectors
|
||||
- Supports both traces and metrics
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="sqlite" label="SQLite">
|
||||
|
||||
Store events in a local SQLite database for direct querying:
|
||||
|
||||
**Use Cases:**
|
||||
- Local development and debugging
|
||||
- Custom analytics and reporting
|
||||
- Offline analysis of application behavior
|
||||
|
||||
**Features:**
|
||||
- Direct SQL querying capabilities
|
||||
- Persistent local storage
|
||||
- No external dependencies
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="console" label="Console">
|
||||
|
||||
|
@ -109,8 +174,9 @@ telemetry:
|
|||
provider_type: inline::meta-reference
|
||||
config:
|
||||
service_name: "llama-stack-service"
|
||||
sinks: ['console', 'otel_trace', 'otel_metric']
|
||||
sinks: ['console', 'sqlite', 'otel_trace', 'otel_metric']
|
||||
otel_exporter_otlp_endpoint: "http://localhost:4318"
|
||||
sqlite_db_path: "/path/to/telemetry.db"
|
||||
```
|
||||
|
||||
### Environment Variables
|
||||
|
@ -119,23 +185,23 @@ Configure telemetry behavior using environment variables:
|
|||
|
||||
- **`OTEL_EXPORTER_OTLP_ENDPOINT`**: OpenTelemetry Collector endpoint (default: `http://localhost:4318`)
|
||||
- **`OTEL_SERVICE_NAME`**: Service name for telemetry (default: empty string)
|
||||
- **`TELEMETRY_SINKS`**: Comma-separated list of sinks (default: `[]`)
|
||||
- **`TELEMETRY_SINKS`**: Comma-separated list of sinks (default: `console,sqlite`)
|
||||
|
||||
### Quick Setup: Complete Telemetry Stack
|
||||
## Visualization with Jaeger
|
||||
|
||||
Use the automated setup script to launch the complete telemetry stack (Jaeger, OpenTelemetry Collector, Prometheus, and Grafana):
|
||||
The `otel_trace` sink works with any service compatible with the OpenTelemetry collector. Traces and metrics use separate endpoints but can share the same collector.
|
||||
|
||||
### Starting Jaeger
|
||||
|
||||
Start a Jaeger instance with OTLP HTTP endpoint at 4318 and the Jaeger UI at 16686:
|
||||
|
||||
```bash
|
||||
./scripts/telemetry/setup_telemetry.sh
|
||||
docker run --pull always --rm --name jaeger \
|
||||
-p 16686:16686 -p 4318:4318 \
|
||||
jaegertracing/jaeger:2.1.0
|
||||
```
|
||||
|
||||
This sets up:
|
||||
- **Jaeger UI**: http://localhost:16686 (traces visualization)
|
||||
- **Prometheus**: http://localhost:9090 (metrics)
|
||||
- **Grafana**: http://localhost:3000 (dashboards with auto-configured data sources)
|
||||
- **OTEL Collector**: http://localhost:4318 (OTLP endpoint)
|
||||
|
||||
Once running, you can visualize traces by navigating to [Grafana](http://localhost:3000/) and login with login `admin` and password `admin`.
|
||||
Once running, you can visualize traces by navigating to [http://localhost:16686/](http://localhost:16686/).
|
||||
|
||||
## Querying Metrics
|
||||
|
||||
|
@ -182,10 +248,37 @@ Forward metrics to other observability systems:
|
|||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## SQLite Querying
|
||||
|
||||
The `sqlite` sink allows you to query traces without an external system. This is particularly useful for development and custom analytics.
|
||||
|
||||
### Example Queries
|
||||
|
||||
```sql
|
||||
-- Query recent traces
|
||||
SELECT * FROM traces WHERE timestamp > datetime('now', '-1 hour');
|
||||
|
||||
-- Analyze span durations
|
||||
SELECT name, AVG(duration_ms) as avg_duration
|
||||
FROM spans
|
||||
GROUP BY name
|
||||
ORDER BY avg_duration DESC;
|
||||
|
||||
-- Find slow operations
|
||||
SELECT * FROM spans
|
||||
WHERE duration_ms > 1000
|
||||
ORDER BY duration_ms DESC;
|
||||
```
|
||||
|
||||
:::tip[Advanced Analytics]
|
||||
Refer to the [Getting Started notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb) for more examples on querying traces and spans programmatically.
|
||||
:::
|
||||
|
||||
## Best Practices
|
||||
|
||||
### 🔍 **Monitoring Strategy**
|
||||
- Use OpenTelemetry for production environments
|
||||
- Combine multiple sinks for development (console + SQLite)
|
||||
- Set up alerts on key metrics like token usage and error rates
|
||||
|
||||
### 📊 **Metrics Analysis**
|
||||
|
@ -200,8 +293,45 @@ Forward metrics to other observability systems:
|
|||
|
||||
### 🔧 **Configuration Management**
|
||||
- Use environment variables for flexible deployment
|
||||
- Configure appropriate retention policies for SQLite
|
||||
- Ensure proper network access to OpenTelemetry collectors
|
||||
|
||||
## Integration Examples
|
||||
|
||||
### Basic Telemetry Setup
|
||||
|
||||
```python
|
||||
from llama_stack_client import LlamaStackClient
|
||||
|
||||
# Client with telemetry headers
|
||||
client = LlamaStackClient(
|
||||
base_url="http://localhost:8000",
|
||||
extra_headers={
|
||||
"X-Telemetry-Service": "my-ai-app",
|
||||
"X-Telemetry-Version": "1.0.0"
|
||||
}
|
||||
)
|
||||
|
||||
# All API calls will be automatically traced
|
||||
response = client.chat.completions.create(
|
||||
model="meta-llama/Llama-3.2-3B-Instruct",
|
||||
messages=[{"role": "user", "content": "Hello!"}]
|
||||
)
|
||||
```
|
||||
|
||||
### Custom Telemetry Context
|
||||
|
||||
```python
|
||||
# Add custom span attributes for better tracking
|
||||
with tracer.start_as_current_span("custom_operation") as span:
|
||||
span.set_attribute("user_id", "user123")
|
||||
span.set_attribute("operation_type", "chat_completion")
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model="meta-llama/Llama-3.2-3B-Instruct",
|
||||
messages=[{"role": "user", "content": "Hello!"}]
|
||||
)
|
||||
```
|
||||
|
||||
## Related Resources
|
||||
|
||||
|
|
|
@ -219,10 +219,13 @@ group_tools = client.tools.list_tools(toolgroup_id="search_tools")
|
|||
<TabItem value="setup" label="Setup & Configuration">
|
||||
|
||||
1. Start by registering a Tavily API key at [Tavily](https://tavily.com/).
|
||||
2. [Optional] Set the API key in your environment before starting the Llama Stack server
|
||||
2. [Optional] Provide the API key directly to the Llama Stack server
|
||||
```bash
|
||||
export TAVILY_SEARCH_API_KEY="your key"
|
||||
```
|
||||
```bash
|
||||
--env TAVILY_SEARCH_API_KEY=${TAVILY_SEARCH_API_KEY}
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="implementation" label="Implementation">
|
||||
|
@ -270,9 +273,9 @@ for log in EventLogger().log(response):
|
|||
<TabItem value="setup" label="Setup & Configuration">
|
||||
|
||||
1. Start by registering for a WolframAlpha API key at [WolframAlpha Developer Portal](https://developer.wolframalpha.com/access).
|
||||
2. Provide the API key either by setting it in your environment before starting the Llama Stack server:
|
||||
2. Provide the API key either when starting the Llama Stack server:
|
||||
```bash
|
||||
export WOLFRAM_ALPHA_API_KEY="your key"
|
||||
--env WOLFRAM_ALPHA_API_KEY=${WOLFRAM_ALPHA_API_KEY}
|
||||
```
|
||||
or from the client side:
|
||||
```python
|
||||
|
|
|
@ -62,10 +62,6 @@ The new `/v2` API must be introduced alongside the existing `/v1` API and run in
|
|||
|
||||
When a `/v2` API is introduced, a clear and generous deprecation policy for the `/v1` API must be published simultaneously. This policy must outline the timeline for the eventual removal of the `/v1` API, giving users ample time to migrate.
|
||||
|
||||
### Deprecated APIs
|
||||
|
||||
Deprecated APIs are those that are no longer actively maintained or supported. Depreated APIs are marked with the flag `deprecated = True` in the OpenAPI spec. These APIs will be removed in a future release.
|
||||
|
||||
### API Stability vs. Provider Stability
|
||||
|
||||
The leveling introduced in this document relates to the stability of the API and not specifically the providers within the API.
|
||||
|
|
|
@ -152,6 +152,7 @@ __all__ = ["WeatherAPI", "available_providers"]
|
|||
from typing import Protocol
|
||||
|
||||
from llama_stack.providers.datatypes import (
|
||||
AdapterSpec,
|
||||
Api,
|
||||
ProviderSpec,
|
||||
RemoteProviderSpec,
|
||||
|
@ -165,10 +166,12 @@ def available_providers() -> list[ProviderSpec]:
|
|||
api=Api.weather,
|
||||
provider_type="remote::kaze",
|
||||
config_class="llama_stack_provider_kaze.KazeProviderConfig",
|
||||
adapter_type="kaze",
|
||||
module="llama_stack_provider_kaze",
|
||||
pip_packages=["llama_stack_provider_kaze"],
|
||||
config_class="llama_stack_provider_kaze.KazeProviderConfig",
|
||||
adapter=AdapterSpec(
|
||||
adapter_type="kaze",
|
||||
module="llama_stack_provider_kaze",
|
||||
pip_packages=["llama_stack_provider_kaze"],
|
||||
config_class="llama_stack_provider_kaze.KazeProviderConfig",
|
||||
),
|
||||
),
|
||||
]
|
||||
|
||||
|
@ -322,10 +325,11 @@ class WeatherKazeAdapter(WeatherProvider):
|
|||
|
||||
```yaml
|
||||
# ~/.llama/providers.d/remote/weather/kaze.yaml
|
||||
adapter_type: kaze
|
||||
pip_packages: ["llama_stack_provider_kaze"]
|
||||
config_class: llama_stack_provider_kaze.config.KazeProviderConfig
|
||||
module: llama_stack_provider_kaze
|
||||
adapter:
|
||||
adapter_type: kaze
|
||||
pip_packages: ["llama_stack_provider_kaze"]
|
||||
config_class: llama_stack_provider_kaze.config.KazeProviderConfig
|
||||
module: llama_stack_provider_kaze
|
||||
optional_api_dependencies: []
|
||||
```
|
||||
|
||||
|
@ -357,7 +361,7 @@ server:
|
|||
8. Run the server:
|
||||
|
||||
```bash
|
||||
llama stack run ~/.llama/run-byoa.yaml
|
||||
python -m llama_stack.core.server.server --yaml-config ~/.llama/run-byoa.yaml
|
||||
```
|
||||
|
||||
9. Test the API:
|
||||
|
|
|
@ -158,16 +158,17 @@ under the LICENSE file in the root directory of this source tree.
|
|||
|
||||
Some tips about common tasks you work on while contributing to Llama Stack:
|
||||
|
||||
### Setup for development
|
||||
### Using `llama stack build`
|
||||
|
||||
Building a stack image will use the production version of the `llama-stack` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_STACK_CLIENT_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands.
|
||||
|
||||
Example:
|
||||
```bash
|
||||
cd work/
|
||||
git clone https://github.com/meta-llama/llama-stack.git
|
||||
cd llama-stack
|
||||
uv run llama stack list-deps <distro-name> | xargs -L1 uv pip install
|
||||
|
||||
# (Optional) If you are developing the llama-stack-client-python package, you can add it as an editable package.
|
||||
git clone https://github.com/meta-llama/llama-stack-client-python.git
|
||||
uv add --editable ../llama-stack-client-python
|
||||
cd llama-stack
|
||||
LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama stack build --distro <...>
|
||||
```
|
||||
|
||||
### Updating distribution configurations
|
||||
|
|
|
@ -67,7 +67,7 @@ def get_base_url(self) -> str:
|
|||
|
||||
## Testing the Provider
|
||||
|
||||
Before running tests, you must have required dependencies installed. This depends on the providers or distributions you are testing. For example, if you are testing the `together` distribution, install its dependencies with `llama stack list-deps together | xargs -L1 uv pip install`.
|
||||
Before running tests, you must have required dependencies installed. This depends on the providers or distributions you are testing. For example, if you are testing the `together` distribution, you should install dependencies via `llama stack build --distro together`.
|
||||
|
||||
### 1. Integration Testing
|
||||
|
||||
|
@ -76,7 +76,7 @@ Integration tests are located in [tests/integration](https://github.com/meta-lla
|
|||
Consult [tests/integration/README.md](https://github.com/meta-llama/llama-stack/blob/main/tests/integration/README.md) for more details on how to run the tests.
|
||||
|
||||
Note that each provider's `sample_run_config()` method (in the configuration class for that provider)
|
||||
typically references some environment variables for specifying API keys and the like. You can set these in the environment before running the test command.
|
||||
typically references some environment variables for specifying API keys and the like. You can set these in the environment or pass these via the `--env` flag to the test command.
|
||||
|
||||
|
||||
### 2. Unit Testing
|
||||
|
|
|
@ -68,9 +68,7 @@ recordings/
|
|||
Direct API calls with no recording or replay:
|
||||
|
||||
```python
|
||||
from llama_stack.testing.api_recorder import api_recording, APIRecordingMode
|
||||
|
||||
with api_recording(mode=APIRecordingMode.LIVE):
|
||||
with inference_recording(mode=InferenceMode.LIVE):
|
||||
response = await client.chat.completions.create(...)
|
||||
```
|
||||
|
||||
|
@ -81,7 +79,7 @@ Use for initial development and debugging against real APIs.
|
|||
Captures API interactions while passing through real responses:
|
||||
|
||||
```python
|
||||
with api_recording(mode=APIRecordingMode.RECORD, storage_dir="./recordings"):
|
||||
with inference_recording(mode=InferenceMode.RECORD, storage_dir="./recordings"):
|
||||
response = await client.chat.completions.create(...)
|
||||
# Real API call made, response captured AND returned
|
||||
```
|
||||
|
@ -98,7 +96,7 @@ The recording process:
|
|||
Returns stored responses instead of making API calls:
|
||||
|
||||
```python
|
||||
with api_recording(mode=APIRecordingMode.REPLAY, storage_dir="./recordings"):
|
||||
with inference_recording(mode=InferenceMode.REPLAY, storage_dir="./recordings"):
|
||||
response = await client.chat.completions.create(...)
|
||||
# No API call made, cached response returned instantly
|
||||
```
|
||||
|
|
|
@ -170,7 +170,7 @@ spec:
|
|||
- name: llama-stack
|
||||
image: localhost/llama-stack-run-k8s:latest
|
||||
imagePullPolicy: IfNotPresent
|
||||
command: ["llama", "stack", "run", "/app/config.yaml"]
|
||||
command: ["python", "-m", "llama_stack.core.server.server", "--config", "/app/config.yaml"]
|
||||
ports:
|
||||
- containerPort: 5000
|
||||
volumeMounts:
|
||||
|
|
|
@ -5,80 +5,225 @@ sidebar_label: Build your own Distribution
|
|||
sidebar_position: 3
|
||||
---
|
||||
|
||||
This guide walks you through inspecting existing distributions, customising their configuration, and building runnable artefacts for your own deployment.
|
||||
This guide will walk you through the steps to get started with building a Llama Stack distribution from scratch with your choice of API providers.
|
||||
|
||||
### Explore existing distributions
|
||||
|
||||
All first-party distributions live under `llama_stack/distributions/`. Each directory contains:
|
||||
### Setting your log level
|
||||
|
||||
- `build.yaml` – the distribution specification (providers, additional dependencies, optional external provider directories).
|
||||
- `run.yaml` – sample run configuration (when provided).
|
||||
- Documentation fragments that power this site.
|
||||
In order to specify the proper logging level users can apply the following environment variable `LLAMA_STACK_LOGGING` with the following format:
|
||||
|
||||
Browse that folder to understand available providers and copy a distribution to use as a starting point. When creating a new stack, duplicate an existing directory, rename it, and adjust the `build.yaml` file to match your requirements.
|
||||
`LLAMA_STACK_LOGGING=server=debug;core=info`
|
||||
|
||||
Where each category in the following list:
|
||||
|
||||
- all
|
||||
- core
|
||||
- server
|
||||
- router
|
||||
- inference
|
||||
- agents
|
||||
- safety
|
||||
- eval
|
||||
- tools
|
||||
- client
|
||||
|
||||
Can be set to any of the following log levels:
|
||||
|
||||
- debug
|
||||
- info
|
||||
- warning
|
||||
- error
|
||||
- critical
|
||||
|
||||
The default global log level is `info`. `all` sets the log level for all components.
|
||||
|
||||
A user can also set `LLAMA_STACK_LOG_FILE` which will pipe the logs to the specified path as well as to the terminal. An example would be: `export LLAMA_STACK_LOG_FILE=server.log`
|
||||
|
||||
### Llama Stack Build
|
||||
|
||||
In order to build your own distribution, we recommend you clone the `llama-stack` repository.
|
||||
|
||||
|
||||
```
|
||||
git clone git@github.com:meta-llama/llama-stack.git
|
||||
cd llama-stack
|
||||
pip install -e .
|
||||
```
|
||||
Use the CLI to build your distribution.
|
||||
The main points to consider are:
|
||||
1. **Image Type** - Do you want a venv environment or a Container (eg. Docker)
|
||||
2. **Template** - Do you want to use a template to build your distribution? or start from scratch ?
|
||||
3. **Config** - Do you want to use a pre-existing config file to build your distribution?
|
||||
|
||||
```
llama stack build -h
usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--distro DISTRIBUTION] [--list-distros] [--image-type {container,venv}] [--image-name IMAGE_NAME] [--print-deps-only]
                         [--run] [--providers PROVIDERS]

Build a Llama stack container

options:
  -h, --help            show this help message and exit
  --config CONFIG       Path to a config file to use for the build. You can find example configs in llama_stack.cores/**/build.yaml. If this argument is not provided, you will be prompted to
                        enter information interactively (default: None)
  --template TEMPLATE   (deprecated) Name of the example template config to use for build. You may use `llama stack build --list-distros` to check out the available distributions (default:
                        None)
  --distro DISTRIBUTION, --distribution DISTRIBUTION
                        Name of the distribution to use for build. You may use `llama stack build --list-distros` to check out the available distributions (default: None)
  --list-distros, --list-distributions
                        Show the available distributions for building a Llama Stack distribution (default: False)
  --image-type {container,venv}
                        Image Type to use for the build. If not specified, will use the image type from the template config. (default: None)
  --image-name IMAGE_NAME
                        [for image-type=container|venv] Name of the virtual environment to use for the build. If not specified, currently active environment will be used if found. (default:
                        None)
  --print-deps-only     Print the dependencies for the stack only, without building the stack (default: False)
  --run                 Run the stack after building using the same image type, name, and other applicable arguments (default: False)
  --providers PROVIDERS
                        Build a config for a list of providers and only those providers. This list is formatted like: api1=provider1,api2=provider2. Where there can be multiple providers per
                        API. (default: None)
```

After this step is complete, a file named `<name>-build.yaml` and template file `<name>-run.yaml` will be generated and saved at the output file path specified at the end of the command.

import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

<Tabs>
<TabItem value="template" label="Building from a template">
To build from alternative API providers, we provide distribution templates for users to get started building a distribution backed by different providers.

The following command will allow you to see the available templates and their corresponding providers.

```
llama stack build --list-templates
```

```
+------------------------------+-----------------------------------------------------------------------------+
| Template Name                | Description                                                                 |
+------------------------------+-----------------------------------------------------------------------------+
| watsonx                      | Use watsonx for running LLM inference                                       |
+------------------------------+-----------------------------------------------------------------------------+
| vllm-gpu                     | Use a built-in vLLM engine for running LLM inference                        |
+------------------------------+-----------------------------------------------------------------------------+
| together                     | Use Together.AI for running LLM inference                                   |
+------------------------------+-----------------------------------------------------------------------------+
| tgi                          | Use (an external) TGI server for running LLM inference                      |
+------------------------------+-----------------------------------------------------------------------------+
| starter                      | Quick start template for running Llama Stack with several popular providers |
+------------------------------+-----------------------------------------------------------------------------+
| sambanova                    | Use SambaNova for running LLM inference and safety                          |
+------------------------------+-----------------------------------------------------------------------------+
| remote-vllm                  | Use (an external) vLLM server for running LLM inference                     |
+------------------------------+-----------------------------------------------------------------------------+
| postgres-demo                | Quick start template for running Llama Stack with several popular providers |
+------------------------------+-----------------------------------------------------------------------------+
| passthrough                  | Use Passthrough hosted llama-stack endpoint for LLM inference               |
+------------------------------+-----------------------------------------------------------------------------+
| open-benchmark               | Distribution for running open benchmarks                                    |
+------------------------------+-----------------------------------------------------------------------------+
| ollama                       | Use (an external) Ollama server for running LLM inference                   |
+------------------------------+-----------------------------------------------------------------------------+
| nvidia                       | Use NVIDIA NIM for running LLM inference, evaluation and safety             |
+------------------------------+-----------------------------------------------------------------------------+
| meta-reference-gpu           | Use Meta Reference for running LLM inference                                |
+------------------------------+-----------------------------------------------------------------------------+
| llama_api                    | Distribution for running e2e tests in CI                                    |
+------------------------------+-----------------------------------------------------------------------------+
| hf-serverless                | Use (an external) Hugging Face Inference Endpoint for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| hf-endpoint                  | Use (an external) Hugging Face Inference Endpoint for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| groq                         | Use Groq for running LLM inference                                          |
+------------------------------+-----------------------------------------------------------------------------+
| fireworks                    | Use Fireworks.AI for running LLM inference                                  |
+------------------------------+-----------------------------------------------------------------------------+
| experimental-post-training   | Experimental template for post training                                     |
+------------------------------+-----------------------------------------------------------------------------+
| dell                         | Dell's distribution of Llama Stack. TGI inference via Dell's custom         |
|                              | container                                                                   |
+------------------------------+-----------------------------------------------------------------------------+
| ci-tests                     | Distribution for running e2e tests in CI                                    |
+------------------------------+-----------------------------------------------------------------------------+
| cerebras                     | Use Cerebras for running LLM inference                                      |
+------------------------------+-----------------------------------------------------------------------------+
| bedrock                      | Use AWS Bedrock for running LLM inference and safety                        |
+------------------------------+-----------------------------------------------------------------------------+
```

You may then pick a template to build your distribution with providers fitted to your liking.

For example, to build the `starter` distribution, you can run:

```
$ llama stack build --distro starter
...
You can now edit ~/.llama/distributions/llamastack-starter/starter-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-starter/starter-run.yaml`
```

```{tip}
The generated `run.yaml` file is a starting point for your configuration. For comprehensive guidance on customizing it for your specific needs, infrastructure, and deployment scenarios, see [Customizing Your run.yaml Configuration](customizing_run_yaml.md).
```
</TabItem>
<TabItem value="scratch" label="Building from Scratch">

If the provided templates do not fit your use case, you can run `llama stack build`, which launches an interactive wizard that prompts you for the build configuration.

It would be best to start with a template and understand the structure of the config file and the various concepts (APIs, providers, resources, etc.) before starting from scratch.

```
llama stack build

> Enter a name for your Llama Stack (e.g. my-local-stack): my-stack
> Enter the image type you want your Llama Stack to be built as (container or venv): venv

Llama Stack is composed of several APIs working together. Let's select
the provider types (implementations) you want to use for these APIs.

Tip: use <TAB> to see options for the providers.

> Enter provider for API inference: inline::meta-reference
> Enter provider for API safety: inline::llama-guard
> Enter provider for API agents: inline::meta-reference
> Enter provider for API memory: inline::faiss
> Enter provider for API datasetio: inline::meta-reference
> Enter provider for API scoring: inline::meta-reference
> Enter provider for API eval: inline::meta-reference
> Enter provider for API telemetry: inline::meta-reference

> (Optional) Enter a short description for your Llama Stack:

You can now edit ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml`
```
</TabItem>
<TabItem value="config" label="Building from a pre-existing build config file">
- In addition to templates, you may customize the build to your liking by editing config files and building from them with the following command.

- The config file will have contents like the ones in `llama_stack/distributions/*build.yaml`.

```
llama stack build --config llama_stack/distributions/starter/build.yaml
```
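
For orientation, a minimal build config has roughly this shape (a sketch only - provider lists vary by distribution; see the actual files under `llama_stack/distributions/` for authoritative contents):

```yaml
version: '2'
distribution_spec:
  description: Minimal example build config
  providers:
    inference:
    - remote::ollama
image_type: venv
image_name: my-minimal-stack
```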
</TabItem>
<TabItem value="external" label="Building with External Providers">

Llama Stack supports external providers that live outside of the main codebase. This allows you to create and maintain your own providers independently or use community-provided providers. External providers are bundled by pointing `external_providers_dir` at a directory that contains your provider packages.

To build a distribution with external providers, you need to:

1. Configure the `external_providers_dir` in your build configuration file:

```yaml
# Example my-external-stack.yaml with external providers
version: '2'
distribution_spec:
  description: Custom distro for CI tests
  providers:
    inference:
    - remote::custom_ollama
    # Add more providers as needed
image_type: container
image_name: ci-test
# Path to external provider implementations
external_providers_dir: ~/.llama/providers.d
```

Inside `providers.d/custom_ollama/provider.py`, define `get_provider_spec()` so the CLI can discover dependencies:

```python
from llama_stack.providers.datatypes import ProviderSpec


def get_provider_spec() -> ProviderSpec:
    return ProviderSpec(
        provider_type="remote::custom_ollama",
        module="llama_stack_ollama_provider",
        config_class="llama_stack_ollama_provider.config.OllamaImplConfig",
        pip_packages=[
            "ollama",
            "aiohttp",
            "llama-stack-provider-ollama",
        ],
    )
```
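
The expected layout under `external_providers_dir` follows the provider taxonomy; a typical tree looks like this (a sketch - the exact subdirectories depend on the API and whether the provider is remote or inline):

```
providers.d/
└── remote/
    └── inference/
        └── custom_ollama.yaml
```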

Here's an example of the provider spec YAML for a custom Ollama provider:

```yaml
adapter:
  adapter_type: custom_ollama
  pip_packages:
  - ollama
  - aiohttp
  - llama-stack-provider-ollama # This is the provider package
  config_class: llama_stack_ollama_provider.config.OllamaImplConfig
  module: llama_stack_ollama_provider
api_dependencies: []
```

The `pip_packages` section lists the Python packages required by the provider, as well as the
provider package itself. The package must be available on PyPI or can be provided from a local
directory or a git repository (git must be installed on the build environment).
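
For instance, assuming entries are passed straight through to pip, a PEP 508 direct reference is one way to pull the provider package from git rather than PyPI (URL is illustrative):

```yaml
  pip_packages:
  - ollama
  - aiohttp
  - llama-stack-provider-ollama @ git+https://github.com/example-org/llama-stack-provider-ollama.git
```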

2. Build your distribution using the config file:

```
llama stack build --config my-external-stack.yaml
```

When building a container, make sure the provider directory is part of the build context (for example `cp -R path/to/providers providers.d`) and adjust run configs to use the in-container path (usually `/.llama/providers.d`).

For more information on external providers, including directory structure, provider types, and implementation requirements, see the [External Providers documentation](../providers/external/).
</TabItem>
<TabItem value="container" label="Building Container">

:::tip Podman Alternative
Podman is supported as an alternative to Docker. Set `CONTAINER_BINARY` to `podman` in your environment to use Podman.
:::

To build a container image, you may start off from a template and use the `--image-type container` flag to specify `container` as the build image type.

```
llama stack build --distro starter --image-type container
```

```
$ llama stack build --distro starter --image-type container
...
Containerfile created successfully in /tmp/tmp.viA3a3Rdsg/Containerfile

FROM python:3.10-slim
...

You can now edit ~/meta-llama/llama-stack/tmp/configs/ollama-run.yaml and run `llama stack run ~/meta-llama/llama-stack/tmp/configs/ollama-run.yaml`
```
Now set some environment variables for the inference model ID and Llama Stack Port and create a local directory to mount into the container's file system.

```bash
export INFERENCE_MODEL="llama3.2:3b"
export LLAMA_STACK_PORT=8321
mkdir -p ~/.llama
```

After this step is successful, you should be able to find the built container image and test it with the below Docker command:

```
docker run -d \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ~/.llama:/root/.llama \
  localhost/distribution-ollama:dev \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env OLLAMA_URL=http://host.docker.internal:11434
```

Here are the docker flags and their uses:

* `localhost/distribution-ollama:dev`: The name and tag of the container image to run

* `--port $LLAMA_STACK_PORT`: Port number for the server to listen on

* `--env INFERENCE_MODEL=$INFERENCE_MODEL`: Sets the model to use for inference

* `--env OLLAMA_URL=http://host.docker.internal:11434`: Configures the URL for the Ollama service

Alternatively, you can skip `llama stack build` and use the Containerfile at `containers/Containerfile` directly, which installs `llama-stack`, resolves distribution dependencies via `llama stack list-deps`, and sets the entrypoint to `llama stack run`.

```bash
docker build . \
  -f containers/Containerfile \
  --build-arg DISTRO_NAME=starter \
  --tag llama-stack:starter
```

Handy build arguments:

- `DISTRO_NAME` – distribution directory name (defaults to `starter`).
- `RUN_CONFIG_PATH` – absolute path inside the build context for a run config that should be baked into the image (e.g. `/workspace/run.yaml`).
- `INSTALL_MODE=editable` – install the repository copied into `/workspace` with `uv pip install -e`. Pair it with `--build-arg LLAMA_STACK_DIR=/workspace`.
- `LLAMA_STACK_CLIENT_DIR` – optional editable install of the Python client.
- `PYPI_VERSION` / `TEST_PYPI_VERSION` – pin specific releases when not using editable installs.
- `KEEP_WORKSPACE=1` – retain `/workspace` in the final image if you need to access additional files (such as sample configs or provider bundles).

Make sure any custom `build.yaml`, run configs, or provider directories you reference are included in the Docker build context so the Containerfile can read them.
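
Combining these, a build that bakes in a run config and installs the repository in editable mode might look like this (a sketch - paths assume your run config and source live in the build context at `/workspace`):

```bash
docker build . \
  -f containers/Containerfile \
  --build-arg DISTRO_NAME=starter \
  --build-arg INSTALL_MODE=editable \
  --build-arg LLAMA_STACK_DIR=/workspace \
  --build-arg RUN_CONFIG_PATH=/workspace/run.yaml \
  --tag llama-stack:starter-dev
```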

If you prepared a custom run config, mount it into the container and reference it explicitly when launching:

```bash
docker run \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v $(pwd)/run.yaml:/app/run.yaml \
  llama-stack:starter \
  /app/run.yaml
```
</TabItem>
</Tabs>

### Running your Stack server
Now, let's start the Llama Stack Distribution Server. You will need the YAML configuration file which was written out at the end by the `llama stack build` step.

```
llama stack run -h
usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--env KEY=VALUE]
                       [--image-type {venv}] [--enable-ui]
                       [config | template]

Start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution.

positional arguments:
  config | template     Path to config file to use for the run or name of known template (`llama stack list` for a list). (default: None)

options:
  -h, --help            show this help message and exit
  --port PORT           Port to run the server on. It can also be passed via the env var LLAMA_STACK_PORT. (default: 8321)
  --image-name IMAGE_NAME
                        Name of the image to run. Defaults to the current environment (default: None)
  --env KEY=VALUE       Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times. (default: None)
  --image-type {venv}   Image Type used during the build. This should be venv. (default: None)
  --enable-ui           Start the UI server (default: False)
```

**Note:** Container images built with `llama stack build --image-type container` cannot be run using `llama stack run`. Instead, they must be run directly using Docker or Podman commands as shown in the container building section above.

```
# Start using template name
llama stack run tgi

# Start using config file
llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml

# Start using a venv
llama stack run --image-type venv ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
```

```
$ llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml

Serving API inspect
 GET /health
 GET /providers/list
 GET /routes/list
Serving API inference
 POST /inference/chat_completion
 POST /inference/completion
 POST /inference/embeddings
...
Serving API agents
 POST /agents/create
 POST /agents/session/create
 POST /agents/turn/create
 POST /agents/delete
 POST /agents/session/delete
 POST /agents/session/get
 POST /agents/step/get
 POST /agents/turn/get

Listening on ['::', '0.0.0.0']:8321
INFO:     Started server process [2935911]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit)
INFO:     2401:db00:35c:2d2b:face:0:c9:0:54678 - "GET /models/list HTTP/1.1" 200 OK
```
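
Once the server reports startup complete, you can sanity-check it from another terminal (the health route is served by the inspect API shown above; depending on your version it may be mounted under the `/v1` prefix):

```bash
curl http://localhost:8321/v1/health
```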

### Listing Distributions
Using the list command, you can view all existing Llama Stack distributions, including stacks built from templates, from scratch, or using custom configuration files.

```
llama stack list -h
usage: llama stack list [-h]

list the build stacks

options:
  -h, --help  show this help message and exit
```

Example Usage

```
llama stack list
```

```
+------------------------------+--------------------------------------------+--------------+------------+
| Stack Name                   | Path                                       | Build Config | Run Config |
+------------------------------+--------------------------------------------+--------------+------------+
| together                     | ~/.llama/distributions/together            | Yes          | No         |
+------------------------------+--------------------------------------------+--------------+------------+
| bedrock                      | ~/.llama/distributions/bedrock             | Yes          | No         |
+------------------------------+--------------------------------------------+--------------+------------+
| starter                      | ~/.llama/distributions/starter             | Yes          | Yes        |
+------------------------------+--------------------------------------------+--------------+------------+
| remote-vllm                  | ~/.llama/distributions/remote-vllm         | Yes          | Yes        |
+------------------------------+--------------------------------------------+--------------+------------+
```

### Removing a Distribution
Use the remove command to delete a distribution you've previously built.

```
llama stack rm -h
usage: llama stack rm [-h] [--all] [name]

Remove the build stack

positional arguments:
  name        Name of the stack to delete (default: None)

options:
  -h, --help  show this help message and exit
  --all, -a   Delete all stacks (use with caution) (default: False)
```

Example
```
llama stack rm llamastack-test
```

To keep your environment organized and avoid clutter, consider using `llama stack list` to review old or unused distributions and `llama stack rm <name>` to delete them when they're no longer needed.

### Troubleshooting

If you encounter any issues, ask questions in our Discord, search through our [GitHub Issues](https://github.com/meta-llama/llama-stack/issues), or file a new issue.

@@ -44,32 +44,18 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/agents_store.db
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
metadata_store:
  namespace: null
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/registry.db
models:
- metadata: {}
  model_id: ${env.INFERENCE_MODEL}

@@ -115,7 +101,7 @@ A few things to note:
- The id is a string you can choose freely.
- You can instantiate any number of provider instances of the same type.
- The configuration dictionary is provider-specific.
- Notice that configuration can reference environment variables (with default values), which are expanded at runtime. When you run a stack server (via docker or via `llama stack run`), you can specify `--env OLLAMA_URL=http://my-server:11434` to override the default value.
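
For instance, a provider entry can point at Ollama with an overridable URL (a minimal sketch following the variable name used above):

```yaml
inference:
- provider_id: ollama
  provider_type: remote::ollama
  config:
    url: ${env.OLLAMA_URL:=http://localhost:11434}
```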

### Environment Variable Substitution

@@ -187,10 +173,13 @@ optional_token: ${env.OPTIONAL_TOKEN:+}

#### Runtime Override

You can override environment variables at runtime when starting the server:

```bash
# Override specific environment variables
llama stack run --config run.yaml --env API_KEY=sk-123 --env BASE_URL=https://custom-api.com

# Or set them in your shell
export API_KEY=sk-123
export BASE_URL=https://custom-api.com
llama stack run --config run.yaml
```
@@ -520,16 +509,16 @@ server:
    provider_config:
      type: "github_token"
      github_api_base_url: "https://api.github.com"
  access_policy:
  - permit:
      principal: user-1
      actions: [create, read, delete]
    description: user-1 has full access to all resources
  - permit:
      principal: user-2
      actions: [read]
      resource: model::model-1
    description: user-2 has read access to model-1 only
```

Similarly, the following restricts access to particular kubernetes

@@ -12,7 +12,7 @@ This avoids the overhead of setting up a server.
```bash
# setup
uv pip install llama-stack
llama stack build --distro starter --image-type venv
```

```python
@@ -1,155 +1,56 @@
apiVersion: v1
data:
  stack_run_config.yaml: |
    version: '2'
    image_name: kubernetes-demo
    apis:
    - agents
    - inference
    - files
    - safety
    - telemetry
    - tool_runtime
    - vector_io
    providers:
      inference:
      - provider_id: vllm-inference
        provider_type: remote::vllm
        config:
          url: ${env.VLLM_URL:=http://localhost:8000/v1}
          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
          api_token: ${env.VLLM_API_TOKEN:=fake}
          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
      - provider_id: vllm-safety
        provider_type: remote::vllm
        config:
          url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
          api_token: ${env.VLLM_API_TOKEN:=fake}
          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
      - provider_id: sentence-transformers
        provider_type: inline::sentence-transformers
        config: {}
      vector_io:
      - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
        provider_type: remote::chromadb
        config:
          url: ${env.CHROMADB_URL:=}
          kvstore:
            type: postgres
            host: ${env.POSTGRES_HOST:=localhost}
            port: ${env.POSTGRES_PORT:=5432}
            db: ${env.POSTGRES_DB:=llamastack}
            user: ${env.POSTGRES_USER:=llamastack}
            password: ${env.POSTGRES_PASSWORD:=llamastack}
      files:
      - provider_id: meta-reference-files
        provider_type: inline::localfs
        config:
          storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
          metadata_store:
            type: sqlite
            db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
      safety:
      - provider_id: llama-guard
        provider_type: inline::llama-guard
        config:
          excluded_categories: []
      agents:
      - provider_id: meta-reference
        provider_type: inline::meta-reference
        config:
          persistence_store:
            type: postgres
            host: ${env.POSTGRES_HOST:=localhost}
            port: ${env.POSTGRES_PORT:=5432}
            db: ${env.POSTGRES_DB:=llamastack}
            user: ${env.POSTGRES_USER:=llamastack}
            password: ${env.POSTGRES_PASSWORD:=llamastack}
          responses_store:
            type: postgres
            host: ${env.POSTGRES_HOST:=localhost}
            port: ${env.POSTGRES_PORT:=5432}
            db: ${env.POSTGRES_DB:=llamastack}
            user: ${env.POSTGRES_USER:=llamastack}
            password: ${env.POSTGRES_PASSWORD:=llamastack}
      telemetry:
      - provider_id: meta-reference
        provider_type: inline::meta-reference
        config:
          service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
          sinks: ${env.TELEMETRY_SINKS:=console}
      tool_runtime:
      - provider_id: brave-search
        provider_type: remote::brave-search
        config:
          api_key: ${env.BRAVE_SEARCH_API_KEY:+}
          max_results: 3
      - provider_id: tavily-search
        provider_type: remote::tavily-search
        config:
          api_key: ${env.TAVILY_SEARCH_API_KEY:+}
          max_results: 3
      - provider_id: rag-runtime
        provider_type: inline::rag-runtime
        config: {}
      - provider_id: model-context-protocol
        provider_type: remote::model-context-protocol
        config: {}
    storage:
      backends:
        kv_default:
          type: kv_postgres
          host: ${env.POSTGRES_HOST:=localhost}
          port: ${env.POSTGRES_PORT:=5432}
          db: ${env.POSTGRES_DB:=llamastack}
          user: ${env.POSTGRES_USER:=llamastack}
          password: ${env.POSTGRES_PASSWORD:=llamastack}
          table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
        sql_default:
          type: sql_postgres
          host: ${env.POSTGRES_HOST:=localhost}
          port: ${env.POSTGRES_PORT:=5432}
          db: ${env.POSTGRES_DB:=llamastack}
          user: ${env.POSTGRES_USER:=llamastack}
          password: ${env.POSTGRES_PASSWORD:=llamastack}
      references:
        metadata:
          backend: kv_default
          namespace: registry
        inference:
          backend: sql_default
          table_name: inference_store
    models:
    - metadata:
        embedding_dimension: 768
      model_id: nomic-embed-text-v1.5
      provider_id: sentence-transformers
      model_type: embedding
    - metadata: {}
      model_id: ${env.INFERENCE_MODEL}
      provider_id: vllm-inference
      model_type: llm
    - metadata: {}
      model_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
      provider_id: vllm-safety
      model_type: llm
    shields:
    - shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
    vector_dbs: []
    datasets: []
    scoring_fns: []
    benchmarks: []
    tool_groups:
    - toolgroup_id: builtin::websearch
      provider_id: tavily-search
    - toolgroup_id: builtin::rag
      provider_id: rag-runtime
    server:
      port: 8321
      auth:
        provider_config:
          type: github_token
kind: ConfigMap
metadata:
  creationTimestamp: null
  name: llama-stack-config
@@ -52,7 +52,7 @@ spec:
          value: "${SAFETY_MODEL}"
        - name: TAVILY_SEARCH_API_KEY
          value: "${TAVILY_SEARCH_API_KEY}"
        command: ["llama", "stack", "run", "/etc/config/stack_run_config.yaml", "--port", "8321"]
        ports:
          - containerPort: 8321
        volumeMounts:
@@ -93,34 +93,25 @@ providers:
  - provider_id: model-context-protocol
    provider_type: remote::model-context-protocol
    config: {}
storage:
  backends:
    kv_default:
      type: kv_postgres
      host: ${env.POSTGRES_HOST:=localhost}
      port: ${env.POSTGRES_PORT:=5432}
      db: ${env.POSTGRES_DB:=llamastack}
      user: ${env.POSTGRES_USER:=llamastack}
      password: ${env.POSTGRES_PASSWORD:=llamastack}
      table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
    sql_default:
      type: sql_postgres
      host: ${env.POSTGRES_HOST:=localhost}
      port: ${env.POSTGRES_PORT:=5432}
      db: ${env.POSTGRES_DB:=llamastack}
      user: ${env.POSTGRES_USER:=llamastack}
      password: ${env.POSTGRES_PASSWORD:=llamastack}
  references:
    metadata:
      backend: kv_default
      namespace: registry
    inference:
      backend: sql_default
      table_name: inference_store
models:
- metadata:
    embedding_dimension: 768
  model_id: nomic-embed-text-v1.5
  provider_id: sentence-transformers
  model_type: embedding
- metadata: {}
@@ -59,7 +59,7 @@ Start a Llama Stack server on localhost. Here is an example of how you can do th
uv venv starter --python 3.12
source starter/bin/activate  # On Windows: starter\Scripts\activate
pip install --no-cache llama-stack==0.2.2
llama stack build --distro starter --image-type venv
export FIREWORKS_API_KEY=<SOME_KEY>
llama stack run starter --port 5050
```
@@ -69,10 +69,10 @@ docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ./run.yaml:/root/my-run.yaml \
  llamastack/distribution-watsonx \
  --config /root/my-run.yaml \
  --port $LLAMA_STACK_PORT \
  --env WATSONX_API_KEY=$WATSONX_API_KEY \
  --env WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID \
  --env WATSONX_BASE_URL=$WATSONX_BASE_URL
```
@@ -129,11 +129,11 @@ docker run -it \
  # NOTE: mount the llama-stack / llama-model directories if testing local changes else not needed
  -v $HOME/git/llama-stack:/app/llama-stack-source -v $HOME/git/llama-models:/app/llama-models-source \
  # localhost/distribution-dell:dev if building / testing locally
  llamastack/distribution-dell \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env DEH_URL=$DEH_URL \
  --env CHROMA_URL=$CHROMA_URL

```
@@ -154,37 +154,37 @@ docker run \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v $HOME/.llama:/root/.llama \
  -v ./llama_stack/distributions/tgi/run-with-safety.yaml:/root/my-run.yaml \
  llamastack/distribution-dell \
  --config /root/my-run.yaml \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env DEH_URL=$DEH_URL \
  --env SAFETY_MODEL=$SAFETY_MODEL \
  --env DEH_SAFETY_URL=$DEH_SAFETY_URL \
  --env CHROMA_URL=$CHROMA_URL
```

### Via venv

Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.

```bash
llama stack build --distro dell --image-type venv
llama stack run dell \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env DEH_URL=$DEH_URL \
  --env CHROMA_URL=$CHROMA_URL
```

If you are using Llama Stack Safety / Shield APIs, use:

```bash
llama stack run ./run-with-safety.yaml \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env DEH_URL=$DEH_URL \
  --env SAFETY_MODEL=$SAFETY_MODEL \
  --env DEH_SAFETY_URL=$DEH_SAFETY_URL \
  --env CHROMA_URL=$CHROMA_URL
```
@@ -21,6 +21,7 @@ The `llamastack/distribution-meta-reference-gpu` distribution consists of the fo
| inference | `inline::meta-reference` |
| safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |

@@ -40,7 +41,31 @@ The following environment variables can be configured:

## Prerequisite: Downloading Models

Please use `llama model list --downloaded` to check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See the [installation guide](../../references/llama_cli_reference/download_models.md) to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.

```
$ llama model list --downloaded
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
┃ Model                                   ┃ Size     ┃ Modified Time       ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
│ Llama3.2-1B-Instruct:int4-qlora-eo8     │ 1.53 GB  │ 2025-02-26 11:22:28 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-1B                             │ 2.31 GB  │ 2025-02-18 21:48:52 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Prompt-Guard-86M                        │ 0.02 GB  │ 2025-02-26 11:29:28 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB  │ 2025-02-26 11:37:41 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-3B                             │ 5.99 GB  │ 2025-02-18 21:51:26 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.1-8B                             │ 14.97 GB │ 2025-02-16 10:36:37 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB  │ 2025-02-26 11:35:02 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama-Guard-3-1B                        │ 2.80 GB  │ 2025-02-26 11:20:46 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama-Guard-3-1B:int4                   │ 0.43 GB  │ 2025-02-26 11:33:33 │
└─────────────────────────────────────────┴──────────┴─────────────────────┘
```
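
To fetch a missing checkpoint, you can use `llama model download`; for example (the model ID is illustrative, and you should check `llama model download --help` for the exact flags on your version):

```bash
llama model download --source meta --model-id Llama3.2-3B-Instruct
```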

## Running the Distribution

@@ -59,9 +84,9 @@ docker run \
  --gpu all \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ~/.llama:/root/.llama \
  llamastack/distribution-meta-reference-gpu \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
```

If you are using Llama Stack Safety / Shield APIs, use:

@@ -73,28 +98,28 @@ docker run \
  --gpu all \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ~/.llama:/root/.llama \
  llamastack/distribution-meta-reference-gpu \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
```

### Via venv

Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.

```bash
llama stack build --distro meta-reference-gpu --image-type venv
llama stack run distributions/meta-reference-gpu/run.yaml \
  --port 8321 \
  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
```

If you are using Llama Stack Safety / Shield APIs, use:

```bash
llama stack run distributions/meta-reference-gpu/run-with-safety.yaml \
  --port 8321 \
  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
```
@@ -16,6 +16,7 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
| post_training | `remote::nvidia` |
| safety | `remote::nvidia` |
| scoring | `inline::basic` |
| telemetry | `inline::meta-reference` |
| tool_runtime | `inline::rag-runtime` |
| vector_io | `inline::faiss` |

@@ -128,23 +129,23 @@ docker run \
  --pull always \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ./run.yaml:/root/my-run.yaml \
  llamastack/distribution-nvidia \
  --config /root/my-run.yaml \
  --port $LLAMA_STACK_PORT \
  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
```

### Via venv

If you've set up your local development environment, you can also build the image using your local virtual environment.

```bash
INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
llama stack build --distro nvidia --image-type venv
llama stack run ./run.yaml \
  --port 8321 \
  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
  --env INFERENCE_MODEL=$INFERENCE_MODEL
```

## Example Notebooks

@@ -119,7 +119,7 @@ The following environment variables can be configured:

### Telemetry Configuration
- `OTEL_SERVICE_NAME`: OpenTelemetry service name
- `TELEMETRY_SINKS`: Telemetry sinks (default: `console,sqlite`)
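
For example, to tag traces with a service name and set the sinks explicitly before starting the server (values are illustrative):

```bash
export OTEL_SERVICE_NAME=my-llama-stack
export TELEMETRY_SINKS=console,sqlite
```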

## Enabling Providers

@@ -169,11 +169,7 @@ docker run \
Ensure you have configured the starter distribution using the environment variables explained above.

```bash
uv run --with llama-stack llama stack build --distro starter --image-type venv --run
```

## Example Usage

@@ -220,6 +216,7 @@ The starter distribution uses SQLite for local storage of various components:
- **Files metadata**: `~/.llama/distributions/starter/files_metadata.db`
- **Agents store**: `~/.llama/distributions/starter/agents_store.db`
- **Responses store**: `~/.llama/distributions/starter/responses_store.db`
- **Trace store**: `~/.llama/distributions/starter/trace_store.db`
- **Evaluation store**: `~/.llama/distributions/starter/meta_reference_eval.db`
- **Dataset I/O stores**: Various HuggingFace and local filesystem stores

You can inspect any of these stores directly, as shown below.

@@ -23,17 +23,6 @@ Another simple way to start interacting with Llama Stack is to just spin up a co
If you have built a container image and want to deploy it in a Kubernetes cluster instead of starting the Llama Stack server locally, see the [Kubernetes Deployment Guide](../deploying/kubernetes_deployment) for more details.

```{toctree}
:maxdepth: 1
:hidden:

@@ -4,24 +4,65 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient

vector_db_id = "my_demo_vector_db"
client = LlamaStackClient(base_url="http://localhost:8321")

models = client.models.list()

# Select the first LLM and first embedding models
model_id = next(m for m in models if m.model_type == "llm").identifier
embedding_model_id = (
    em := next(m for m in models if m.model_type == "embedding")
).identifier
embedding_dimension = em.metadata["embedding_dimension"]

vector_db = client.vector_dbs.register(
    vector_db_id=vector_db_id,
    embedding_model=embedding_model_id,
    embedding_dimension=embedding_dimension,
    provider_id="faiss",
)
vector_db_id = vector_db.identifier
source = "https://www.paulgraham.com/greatwork.html"
print("rag_tool> Ingesting document:", source)
document = RAGDocument(
    document_id="document_1",
    content=source,
    mime_type="text/html",
    metadata={},
)
client.tool_runtime.rag_tool.insert(
    documents=[document],
    vector_db_id=vector_db_id,
    chunk_size_in_tokens=100,
)
agent = Agent(
    client,
    model=model_id,
    instructions="You are a helpful assistant",
    tools=[
        {
            "name": "builtin::rag/knowledge_search",
            "args": {"vector_db_ids": [vector_db_id]},
        }
    ],
)

prompt = "How do you do great work?"
print("prompt>", prompt)

use_stream = True
response = agent.create_turn(
    messages=[{"role": "user", "content": prompt}],
    session_id=agent.create_session("rag_session"),
    stream=use_stream,
)

# Only call `AgentEventLogger().log(response)` for streaming responses.
if use_stream:
    for log in AgentEventLogger().log(response):
        log.print()
else:
    print(response)
@@ -58,19 +58,15 @@ Llama Stack is a server that exposes multiple APIs, you connect with it using th

<Tabs>
<TabItem value="venv" label="Using venv">
You can use Python to build and run the Llama Stack server, which is useful for testing and development.

Llama Stack uses a [YAML configuration file](../distributions/configuration) to specify the stack setup,
which defines the providers and their settings. The generated configuration serves as a starting point that you can [customize for your specific needs](../distributions/customizing_run_yaml).
Now let's build and run the Llama Stack config for Ollama.
We use `starter` as the template. By default all providers are disabled; this requires enabling Ollama by passing environment variables.

```bash
llama stack build --distro starter --image-type venv --run
```
</TabItem>
<TabItem value="container" label="Using a Container">
@@ -90,9 +86,9 @@ docker run -it \
  --pull always \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ~/.llama:/root/.llama \
  llamastack/distribution-starter \
  --port $LLAMA_STACK_PORT \
  --env OLLAMA_URL=http://host.docker.internal:11434
```
Note that to start the container with Podman, you can do the same but replace `docker` at the start of the command with
`podman`. If you are using `podman` older than `4.7.0`, please also replace `host.docker.internal` in the `OLLAMA_URL`

@@ -110,9 +106,9 @@ docker run -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ~/.llama:/root/.llama \
  --network=host \
  llamastack/distribution-starter \
  --port $LLAMA_STACK_PORT \
  --env OLLAMA_URL=http://localhost:11434
```
:::
You will see output like below:
@@ -168,7 +164,7 @@ Available Models
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┓
┃ model_type      ┃ identifier                          ┃ provider_resource_id                ┃ metadata                                  ┃ provider_id           ┃
┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━┩
│ embedding       │ ollama/all-minilm:l6-v2             │ all-minilm:l6-v2                    │ {'embedding_dimension': 384.0}            │ ollama                │
├─────────────────┼─────────────────────────────────────┼─────────────────────────────────────┼───────────────────────────────────────────┼───────────────────────┤
│ ...             │ ...                                 │ ...                                 │                                           │ ...                   │
├─────────────────┼─────────────────────────────────────┼─────────────────────────────────────┼───────────────────────────────────────────┼───────────────────────┤
@@ -308,7 +304,7 @@ stream = agent.create_turn(
for event in AgentEventLogger().log(stream):
    event.print()
```
#### ii. Run the Script
### ii. Run the Script
Let's run the script using `uv`:
```bash
uv run python agent.py
```
@@ -24,62 +24,111 @@ ollama run llama3.2:3b --keepalive 60m

#### Step 2: Run the Llama Stack server

We will use `uv` to install dependencies and run the Llama Stack server.
We will use `uv` to run the Llama Stack server.
```bash
# Install dependencies for the starter distribution
uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install

# Run the server
OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter
OLLAMA_URL=http://localhost:11434 \
uv run --with llama-stack llama stack build --distro starter --image-type venv --run
```
#### Step 3: Run the demo
Now open up a new terminal and copy the following script into a file named `demo_script.py`.

```python
import io, requests
from openai import OpenAI
```python title="demo_script.py"
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

url = "https://www.paulgraham.com/greatwork.html"
client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")
from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient

vs = client.vector_stores.create()
response = requests.get(url)
pseudo_file = io.BytesIO(str(response.content).encode('utf-8'))
uploaded_file = client.files.create(file=(url, pseudo_file, "text/html"), purpose="assistants")
client.vector_stores.files.create(vector_store_id=vs.id, file_id=uploaded_file.id)
vector_db_id = "my_demo_vector_db"
client = LlamaStackClient(base_url="http://localhost:8321")

resp = client.responses.create(
    model="openai/gpt-4o",
    input="How do you do great work? Use the existing knowledge_search tool.",
    tools=[{"type": "file_search", "vector_store_ids": [vs.id]}],
    include=["file_search_call.results"],
)
models = client.models.list()

# Select the first LLM and first embedding models
model_id = next(m for m in models if m.model_type == "llm").identifier
embedding_model_id = (
    em := next(m for m in models if m.model_type == "embedding")
).identifier
embedding_dimension = em.metadata["embedding_dimension"]

vector_db = client.vector_dbs.register(
    vector_db_id=vector_db_id,
    embedding_model=embedding_model_id,
    embedding_dimension=embedding_dimension,
    provider_id="faiss",
)
vector_db_id = vector_db.identifier
source = "https://www.paulgraham.com/greatwork.html"
print("rag_tool> Ingesting document:", source)
document = RAGDocument(
    document_id="document_1",
    content=source,
    mime_type="text/html",
    metadata={},
)
client.tool_runtime.rag_tool.insert(
    documents=[document],
    vector_db_id=vector_db_id,
    chunk_size_in_tokens=100,
)
agent = Agent(
    client,
    model=model_id,
    instructions="You are a helpful assistant",
    tools=[
        {
            "name": "builtin::rag/knowledge_search",
            "args": {"vector_db_ids": [vector_db_id]},
        }
    ],
)

prompt = "How do you do great work?"
print("prompt>", prompt)

use_stream = True
response = agent.create_turn(
    messages=[{"role": "user", "content": prompt}],
    session_id=agent.create_session("rag_session"),
    stream=use_stream,
)

# Only call `AgentEventLogger().log(response)` for streaming responses.
if use_stream:
    for log in AgentEventLogger().log(response):
        log.print()
else:
    print(response)
```
We will use `uv` to run the script:
```
uv run --with llama-stack-client,fire,requests demo_script.py
```
And you should see output like below.
```python
>print(resp.output[1].content[0].text)
To do great work, consider the following principles:

1. **Follow Your Interests**: Engage in work that genuinely excites you. If you find an area intriguing, pursue it without being overly concerned about external pressures or norms. You should create things that you would want for yourself, as this often aligns with what others in your circle might want too.

2. **Work Hard on Ambitious Projects**: Ambition is vital, but it should be tempered by genuine interest. Instead of detailed planning for the future, focus on exciting projects that keep your options open. This approach, known as "staying upwind," allows for adaptability and can lead to unforeseen achievements.

3. **Choose Quality Colleagues**: Collaborating with talented colleagues can significantly affect your own work. Seek out individuals who offer surprising insights and whom you admire. The presence of good colleagues can elevate the quality of your work and inspire you.

4. **Maintain High Morale**: Your attitude towards work and life affects your performance. Cultivating optimism and viewing yourself as lucky rather than victimized can boost your productivity. It’s essential to care for your physical health as well since it directly impacts your mental faculties and morale.

5. **Be Consistent**: Great work often comes from cumulative effort. Daily progress, even in small amounts, can result in substantial achievements over time. Emphasize consistency and make the work engaging, as this reduces the perceived burden of hard labor.

6. **Embrace Curiosity**: Curiosity is a driving force that can guide you in selecting fields of interest, pushing you to explore uncharted territories. Allow it to shape your work and continually seek knowledge and insights.

By focusing on these aspects, you can create an environment conducive to great work and personal fulfillment.
```
rag_tool> Ingesting document: https://www.paulgraham.com/greatwork.html

prompt> How do you do great work?

inference> [knowledge_search(query="What is the key to doing great work")]

tool_execution> Tool:knowledge_search Args:{'query': 'What is the key to doing great work'}

tool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 2:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 3:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 4:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 5:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text='END of knowledge_search tool results.\n', type='text')]

inference> Based on the search results, it seems that doing great work means doing something important so well that you expand people's ideas of what's possible. However, there is no clear threshold for importance, and it can be difficult to judge at the time.

To further clarify, I would suggest that doing great work involves:

* Completing tasks with high quality and attention to detail
* Expanding on existing knowledge or ideas
* Making a positive impact on others through your work
* Striving for excellence and continuous improvement

Ultimately, great work is about making a meaningful contribution and leaving a lasting impression.
```
Congratulations! You've successfully built your first RAG application using Llama Stack! 🎉🥳

:::tip HuggingFace access
@@ -14,18 +14,16 @@ Meta's reference implementation of an agent system that can use tools, access ve

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `persistence` | `<class 'inline.agents.meta_reference.config.AgentPersistenceConfig'>` | No | | |
| `persistence_store` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | |
| `responses_store` | `utils.sqlstore.sqlstore.SqliteSqlStoreConfig \| utils.sqlstore.sqlstore.PostgresSqlStoreConfig` | No | sqlite | |

## Sample Configuration

```yaml
persistence:
  agent_state:
    namespace: agents
    backend: kv_default
  responses:
    table_name: responses
    backend: sql_default
    max_write_queue_size: 10000
    num_writers: 4
persistence_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/agents_store.db
responses_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/responses_store.db
```
@@ -14,7 +14,7 @@ Reference implementation of batches API with KVStore persistence.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `kvstore` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Configuration for the key-value store backend. |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Configuration for the key-value store backend. |
| `max_concurrent_batches` | `<class 'int'>` | No | 1 | Maximum number of concurrent batches to process simultaneously. |
| `max_concurrent_requests_per_batch` | `<class 'int'>` | No | 10 | Maximum number of concurrent requests to process per batch. |

@@ -22,6 +22,6 @@ Reference implementation of batches API with KVStore persistence.

```yaml
kvstore:
  namespace: batches
  backend: kv_default
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/batches.db
```
@@ -14,12 +14,12 @@ Local filesystem-based dataset I/O provider for reading and writing datasets to

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `kvstore` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | |

## Sample Configuration

```yaml
kvstore:
  namespace: datasetio::localfs
  backend: kv_default
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/localfs_datasetio.db
```
@@ -14,12 +14,12 @@ HuggingFace datasets provider for accessing and managing datasets from the Huggi

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `kvstore` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | |

## Sample Configuration

```yaml
kvstore:
  namespace: datasetio::huggingface
  backend: kv_default
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/huggingface_datasetio.db
```
@@ -1,7 +1,5 @@
---
description: "Evaluations

Llama Stack Evaluation API for running evaluations on model and agent candidates."
description: "Llama Stack Evaluation API for running evaluations on model and agent candidates."
sidebar_label: Eval
title: Eval
---

@@ -10,8 +8,6 @@ title: Eval

## Overview

Evaluations

Llama Stack Evaluation API for running evaluations on model and agent candidates.
Llama Stack Evaluation API for running evaluations on model and agent candidates.

This section contains documentation for all available providers for the **eval** API.
@@ -14,12 +14,12 @@ Meta's reference implementation of evaluation tasks with support for multiple la

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `kvstore` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | |

## Sample Configuration

```yaml
kvstore:
  namespace: eval
  backend: kv_default
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/meta_reference_eval.db
```
@@ -11,6 +11,38 @@ an example entry in your build.yaml should look like:
module: ramalama_stack
```

Additionally, you can configure the `external_providers_dir` in your Llama Stack configuration. This method is in the process of being deprecated in favor of the `module` method. If using this method, the external provider directory should contain your external provider specifications:

```yaml
external_providers_dir: ~/.llama/providers.d/
```

## Directory Structure

The external providers directory should follow this structure:

```
providers.d/
  remote/
    inference/
      custom_ollama.yaml
      vllm.yaml
    vector_io/
      qdrant.yaml
    safety/
      llama-guard.yaml
  inline/
    inference/
      custom_ollama.yaml
      vllm.yaml
    vector_io/
      qdrant.yaml
    safety/
      llama-guard.yaml
```

Each YAML file in these directories defines a provider specification for that particular API.

## Provider Types

Llama Stack supports two types of external providers:
@@ -18,37 +50,30 @@ Llama Stack supports two types of external providers:
1. **Remote Providers**: Providers that communicate with external services (e.g., cloud APIs)
2. **Inline Providers**: Providers that run locally within the Llama Stack process


### Provider Specification (Common between inline and remote providers)

- `provider_type`: The type of the provider to be installed (remote or inline), e.g. `remote::ollama`
- `api`: The API for this provider, e.g. `inference`
- `config_class`: The full path to the configuration class
- `module`: The Python module containing the provider implementation
- `optional_api_dependencies`: List of optional Llama Stack APIs that this provider can use
- `api_dependencies`: List of Llama Stack APIs that this provider depends on
- `provider_data_validator`: Optional validator for provider data
- `pip_packages`: List of Python packages required by the provider

### Remote Provider Specification

Remote providers are used when you need to communicate with external services. Here's an example for a custom Ollama provider:

```yaml
adapter_type: custom_ollama
provider_type: "remote::ollama"
pip_packages:
  - ollama
  - aiohttp
config_class: llama_stack_ollama_provider.config.OllamaImplConfig
module: llama_stack_ollama_provider
adapter:
  adapter_type: custom_ollama
  pip_packages:
    - ollama
    - aiohttp
  config_class: llama_stack_ollama_provider.config.OllamaImplConfig
  module: llama_stack_ollama_provider
api_dependencies: []
optional_api_dependencies: []
```

#### Remote Provider Configuration
#### Adapter Configuration

- `adapter_type`: A unique identifier for this adapter, e.g. `ollama`
The `adapter` section defines how to load and configure the provider:

- `adapter_type`: A unique identifier for this adapter
- `pip_packages`: List of Python packages required by the provider
- `config_class`: The full path to the configuration class
- `module`: The Python module containing the provider implementation

### Inline Provider Specification

@@ -56,7 +81,6 @@ Inline providers run locally within the Llama Stack process. Here's an example f

```yaml
module: llama_stack_vector_provider
provider_type: inline::llama_stack_vector_provider
config_class: llama_stack_vector_provider.config.VectorStoreConfig
pip_packages:
  - faiss-cpu

@@ -71,6 +95,12 @@ container_image: custom-vector-store:latest # optional

#### Inline Provider Fields

- `module`: The Python module containing the provider implementation
- `config_class`: The full path to the configuration class
- `pip_packages`: List of Python packages required by the provider
- `api_dependencies`: List of Llama Stack APIs that this provider depends on
- `optional_api_dependencies`: List of optional Llama Stack APIs that this provider can use
- `provider_data_validator`: Optional validator for provider data
- `container_image`: Optional container image to use instead of pip packages

## Required Fields
@@ -83,17 +113,20 @@ All providers must contain a `get_provider_spec` function in their `provider` mo
from llama_stack.providers.datatypes import (
    ProviderSpec,
    Api,
    RemoteProviderSpec,
    AdapterSpec,
    remote_provider_spec,
)


def get_provider_spec() -> ProviderSpec:
    return RemoteProviderSpec(
    return remote_provider_spec(
        api=Api.inference,
        adapter_type="ramalama",
        pip_packages=["ramalama>=0.8.5", "pymilvus"],
        config_class="ramalama_stack.config.RamalamaImplConfig",
        module="ramalama_stack",
        adapter=AdapterSpec(
            adapter_type="ramalama",
            pip_packages=["ramalama>=0.8.5", "pymilvus"],
            config_class="ramalama_stack.config.RamalamaImplConfig",
            module="ramalama_stack",
        ),
    )
```
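
As a quick sanity check, you can import the spec function and print the result. This is a minimal, hypothetical sketch: it assumes the `ramalama_stack` package above is installed and exposes `get_provider_spec` in its `provider` module, as described earlier.

```python
# Hypothetical sanity check: confirm the provider spec can be loaded.
# Assumes the `ramalama_stack` package from the example above is installed.
from ramalama_stack.provider import get_provider_spec

spec = get_provider_spec()
print(spec)  # should describe a remote inference provider for the "ramalama" adapter
```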
@@ -164,16 +197,18 @@ information. Execute the test for the Provider type you are developing.

If your external provider isn't being loaded:

1. Check that `module` points to a published pip package with a top level `provider` module including `get_provider_spec`.
1. Check that the `external_providers_dir` path is correct and accessible.
2. Verify that the YAML files are properly formatted.
3. Ensure all required Python packages are installed.
4. Check the Llama Stack server logs for any error messages; turn on debug logging to get more
information using `LLAMA_STACK_LOGGING=all=debug`.
5. Verify that the provider package is installed in your Python environment if using `external_providers_dir`.

## Examples

### How to create an external provider module
### Example using `external_providers_dir`: Custom Ollama Provider

If you are creating a new external provider called `llama-stack-provider-ollama`, here is how you would set up the package properly:
Here's a complete example of creating and using a custom Ollama provider:

1. First, create the provider package:

@@ -195,28 +230,33 @@ requires-python = ">=3.12"
dependencies = ["llama-stack", "pydantic", "ollama", "aiohttp"]
```

3. Install the provider:
3. Create the provider specification:

```yaml
# ~/.llama/providers.d/remote/inference/custom_ollama.yaml
adapter:
  adapter_type: custom_ollama
  pip_packages: ["ollama", "aiohttp"]
  config_class: llama_stack_provider_ollama.config.OllamaImplConfig
  module: llama_stack_provider_ollama
api_dependencies: []
optional_api_dependencies: []
```

4. Install the provider:

```bash
uv pip install -e .
```

4. Edit `provider.py`
5. Configure Llama Stack to use external providers:

`provider.py` must be updated to contain `get_provider_spec`. This is used by Llama Stack to install the provider.

```python
def get_provider_spec() -> ProviderSpec:
    return RemoteProviderSpec(
        api=Api.inference,
        adapter_type="llama-stack-provider-ollama",
        pip_packages=["ollama", "aiohttp"],
        config_class="llama_stack_provider_ollama.config.OllamaImplConfig",
        module="llama_stack_provider_ollama",
    )
```
```yaml
external_providers_dir: ~/.llama/providers.d/
```

5. Implement the provider as outlined above with `get_provider_impl` or `get_adapter_impl`, etc.
The provider will now be available in Llama Stack with the type `remote::custom_ollama`.


### Example using `module`: ramalama-stack


@@ -235,11 +275,12 @@ distribution_spec:
    module: ramalama_stack==0.3.0a0
image_type: venv
image_name: null
external_providers_dir: null
additional_pip_packages:
- aiosqlite
- sqlalchemy[asyncio]
```

No other steps are required beyond installing dependencies with `llama stack list-deps <distro> | xargs -L1 uv pip install` and then running `llama stack run`. The CLI will use `module` to install the provider dependencies, retrieve the spec, etc.
No other steps are required other than `llama stack build` and `llama stack run`. The build process will use `module` to install all of the provider dependencies, retrieve the spec, etc.

The provider will now be available in Llama Stack with the type `remote::ramalama`.
@@ -1,7 +1,4 @@
---
description: "Files

This API is used to upload documents that can be used with other Llama Stack APIs."
sidebar_label: Files
title: Files
---

@@ -10,8 +7,4 @@ title: Files

## Overview

Files

This API is used to upload documents that can be used with other Llama Stack APIs.

This section contains documentation for all available providers for the **files** API.
@@ -15,7 +15,7 @@ Local filesystem-based file storage provider for managing files and documents lo
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `storage_dir` | `<class 'str'>` | No | | Directory to store uploaded files |
| `metadata_store` | `<class 'llama_stack.core.storage.datatypes.SqlStoreReference'>` | No | | SQL store configuration for file metadata |
| `metadata_store` | `utils.sqlstore.sqlstore.SqliteSqlStoreConfig \| utils.sqlstore.sqlstore.PostgresSqlStoreConfig` | No | sqlite | SQL store configuration for file metadata |
| `ttl_secs` | `<class 'int'>` | No | 31536000 | |

## Sample Configuration

@@ -23,6 +23,6 @@ Local filesystem-based file storage provider for managing files and documents lo
```yaml
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/dummy/files}
metadata_store:
  table_name: files_metadata
  backend: sql_default
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/files_metadata.db
```
@@ -20,7 +20,7 @@ AWS S3-based file storage provider for scalable cloud file management with metad
| `aws_secret_access_key` | `str \| None` | No | | AWS secret access key (optional if using IAM roles) |
| `endpoint_url` | `str \| None` | No | | Custom S3 endpoint URL (for MinIO, LocalStack, etc.) |
| `auto_create_bucket` | `<class 'bool'>` | No | False | Automatically create the S3 bucket if it doesn't exist |
| `metadata_store` | `<class 'llama_stack.core.storage.datatypes.SqlStoreReference'>` | No | | SQL store configuration for file metadata |
| `metadata_store` | `utils.sqlstore.sqlstore.SqliteSqlStoreConfig \| utils.sqlstore.sqlstore.PostgresSqlStoreConfig` | No | sqlite | SQL store configuration for file metadata |

## Sample Configuration

@@ -32,6 +32,6 @@ aws_secret_access_key: ${env.AWS_SECRET_ACCESS_KEY:=}
endpoint_url: ${env.S3_ENDPOINT_URL:=}
auto_create_bucket: ${env.S3_AUTO_CREATE_BUCKET:=false}
metadata_store:
  table_name: s3_files_metadata
  backend: sql_default
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/s3_files_metadata.db
```
@@ -22,6 +22,7 @@ Importantly, Llama Stack always strives to provide at least one fully inline pro
## Provider Categories

- **[External Providers](external/index.mdx)** - Guide for building and using external providers
- **[OpenAI Compatibility](./openai.mdx)** - OpenAI API compatibility layer
- **[Inference](inference/index.mdx)** - LLM and embedding model providers
- **[Agents](agents/index.mdx)** - Agentic system providers
- **[DatasetIO](datasetio/index.mdx)** - Dataset and data loader providers

@@ -30,7 +31,3 @@ Importantly, Llama Stack always strives to provide at least one fully inline pro
- **[Vector IO](vector_io/index.mdx)** - Vector database providers
- **[Tool Runtime](tool_runtime/index.mdx)** - Tool and protocol providers
- **[Files](files/index.mdx)** - File system and storage providers

## Other information about Providers
- **[OpenAI Compatibility](./openai.mdx)** - OpenAI API compatibility layer
- **[OpenAI-Compatible Responses Limitations](./openai_responses_limitations.mdx)** - Known limitations of the Responses API in Llama Stack
@@ -1,7 +1,5 @@
---
description: "Inference

Llama Stack Inference API for generating completions, chat completions, and embeddings.
description: "Llama Stack Inference API for generating completions, chat completions, and embeddings.

This API provides the raw interface to the underlying models. Two kinds of models are supported:
- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.

@@ -14,9 +12,7 @@ title: Inference

## Overview

Inference

Llama Stack Inference API for generating completions, chat completions, and embeddings.
Llama Stack Inference API for generating completions, chat completions, and embeddings.

This API provides the raw interface to the underlying models. Two kinds of models are supported:
- LLM models: these models generate "raw" and "chat" (conversational) completions.
@@ -14,9 +14,7 @@ Anthropic inference provider for accessing Claude models and Anthropic's AI serv

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `api_key` | `str \| None` | No | | API key for Anthropic models |

## Sample Configuration

@@ -21,9 +21,7 @@ https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `api_key` | `<class 'pydantic.types.SecretStr'>` | No | | Azure API key for Azure |
| `api_base` | `<class 'pydantic.networks.HttpUrl'>` | No | | Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com) |
| `api_version` | `str \| None` | No | | Azure API version for Azure (e.g., 2024-12-01-preview) |
| `api_type` | `str \| None` | No | azure | Azure API type for Azure (e.g., azure) |
@@ -14,8 +14,6 @@ AWS Bedrock inference provider for accessing various AI models through AWS's man

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `aws_access_key_id` | `str \| None` | No | | The AWS access key to use. Defaults to the AWS_ACCESS_KEY_ID environment variable |
| `aws_secret_access_key` | `str \| None` | No | | The AWS secret access key to use. Defaults to the AWS_SECRET_ACCESS_KEY environment variable |
| `aws_session_token` | `str \| None` | No | | The AWS session token to use. Defaults to the AWS_SESSION_TOKEN environment variable |
@@ -14,10 +14,8 @@ Cerebras inference provider for running models on Cerebras Cloud platform.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `base_url` | `<class 'str'>` | No | https://api.cerebras.ai | Base URL for the Cerebras API |
| `api_key` | `<class 'pydantic.types.SecretStr'>` | No | | Cerebras API Key |

## Sample Configuration

@@ -14,10 +14,8 @@ Databricks inference provider for running models on Databricks' unified analytic

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_token` | `pydantic.types.SecretStr \| None` | No | | The Databricks API token |
| `url` | `str \| None` | No | | The URL for the Databricks model serving endpoint |
| `url` | `<class 'str'>` | No | | The URL for the Databricks model serving endpoint |
| `api_token` | `<class 'pydantic.types.SecretStr'>` | No | | The Databricks API token |

## Sample Configuration

@@ -15,9 +15,8 @@ Fireworks AI inference provider for Llama models and other AI models on the Fire
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `url` | `<class 'str'>` | No | https://api.fireworks.ai/inference/v1 | The URL for the Fireworks server |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | The Fireworks.ai API Key |

## Sample Configuration

@@ -14,9 +14,7 @@ Google Gemini inference provider for accessing Gemini models and Google's AI ser

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `api_key` | `str \| None` | No | | API key for Gemini models |

## Sample Configuration

@@ -14,9 +14,7 @@ Groq inference provider for ultra-fast inference using Groq's LPU technology.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `api_key` | `str \| None` | No | | The Groq API key |
| `url` | `<class 'str'>` | No | https://api.groq.com | The URL for the Groq AI server |

## Sample Configuration
@@ -14,9 +14,7 @@ Llama OpenAI-compatible provider for using Llama models with OpenAI API format.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `api_key` | `str \| None` | No | | The Llama API key |
| `openai_compat_api_base` | `<class 'str'>` | No | https://api.llama.com/compat/v1/ | The URL for the Llama API server |

## Sample Configuration
@@ -14,10 +14,8 @@ NVIDIA inference provider for accessing NVIDIA NIM models and AI services.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `url` | `<class 'str'>` | No | https://integrate.api.nvidia.com | A base url for accessing the NVIDIA NIM |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | The NVIDIA API key, only needed if using the hosted service |
| `timeout` | `<class 'int'>` | No | 60 | Timeout for the HTTP requests |
| `append_api_version` | `<class 'bool'>` | No | True | When set to false, the API version will not be appended to the base_url. By default, it is true. |

@@ -14,9 +14,8 @@ Ollama inference provider for running local models through the Ollama runtime.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `url` | `<class 'str'>` | No | http://localhost:11434 | |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically |

## Sample Configuration

@@ -14,9 +14,7 @@ OpenAI inference provider for accessing GPT models and other OpenAI services.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `api_key` | `str \| None` | No | | API key for OpenAI models |
| `base_url` | `<class 'str'>` | No | https://api.openai.com/v1 | Base URL for OpenAI API |

## Sample Configuration
@@ -14,10 +14,8 @@ Passthrough inference provider for connecting to any external inference service

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | API Key for the passthrough endpoint |
| `url` | `<class 'str'>` | No | | The URL for the passthrough endpoint |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | API Key for the passthrough endpoint |

## Sample Configuration

@@ -14,10 +14,8 @@ RunPod inference provider for running models on RunPod's cloud GPU platform.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_token` | `pydantic.types.SecretStr \| None` | No | | The API token |
| `url` | `str \| None` | No | | The URL for the Runpod model serving endpoint |
| `api_token` | `str \| None` | No | | The API token |

## Sample Configuration

@@ -14,10 +14,8 @@ SambaNova inference provider for running models on SambaNova's dataflow architec

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `url` | `<class 'str'>` | No | https://api.sambanova.ai/v1 | The URL for the SambaNova AI server |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | The SambaNova cloud API Key |

## Sample Configuration

@@ -14,8 +14,6 @@ Text Generation Inference (TGI) provider for HuggingFace model serving.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `url` | `<class 'str'>` | No | | The URL for the TGI serving endpoint |

## Sample Configuration
@@ -15,9 +15,8 @@ Together AI inference provider for open-source models and collaborative AI devel
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `url` | `<class 'str'>` | No | https://api.together.xyz/v1 | The URL for the Together AI server |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | The Together AI API Key |

## Sample Configuration

@@ -53,8 +53,6 @@ Available Models:

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `project` | `<class 'str'>` | No | | Google Cloud project ID for Vertex AI |
| `location` | `<class 'str'>` | No | us-central1 | Google Cloud location for Vertex AI |
@@ -14,12 +14,11 @@ Remote vLLM inference provider for connecting to vLLM servers.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_token` | `pydantic.types.SecretStr \| None` | No | | The API token |
| `url` | `str \| None` | No | | The URL for the vLLM model serving endpoint |
| `max_tokens` | `<class 'int'>` | No | 4096 | Maximum number of tokens to generate. |
| `api_token` | `str \| None` | No | fake | The API token |
| `tls_verify` | `bool \| str` | No | True | Whether to verify TLS certificates. Can be a boolean or a path to a CA certificate file. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically |

## Sample Configuration

@@ -14,11 +14,9 @@ IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `url` | `<class 'str'>` | No | https://us-south.ml.cloud.ibm.com | A base url for accessing the watsonx.ai |
| `project_id` | `str \| None` | No | | The watsonx.ai project ID |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | The watsonx API key |
| `project_id` | `str \| None` | No | | The Project ID key |
| `timeout` | `<class 'int'>` | No | 60 | Timeout for the HTTP requests |

## Sample Configuration
@@ -1,4 +1,3 @@
---
title: OpenAI Compatibility
description: OpenAI API Compatibility
sidebar_label: OpenAI Compatibility

@@ -48,7 +47,7 @@ models = client.models.list()

#### Responses

> **Note:** The Responses API implementation is still in active development. While it is quite usable, there are still unimplemented parts of the API. See [Known Limitations of the OpenAI-compatible Responses API in Llama Stack](./openai_responses_limitations.mdx) for more details.
> **Note:** The Responses API implementation is still in active development. While it is quite usable, there are still unimplemented parts of the API. We'd love feedback on any use cases you try that do not work, to help prioritize the pieces left to implement. Please open issues in the [meta-llama/llama-stack](https://github.com/meta-llama/llama-stack) GitHub repository with details of anything that does not work.

##### Simple inference

@@ -1,301 +0,0 @@
---
title: Known Limitations of the OpenAI-compatible Responses API in Llama Stack
description: Limitations of Responses API
sidebar_label: Limitations of Responses API
sidebar_position: 1
---

## Unresolved Issues

This document outlines known limitations and inconsistencies between Llama Stack's Responses API and OpenAI's Responses API. The comparison reflects OpenAI's APIs as of October 6, 2025 (OpenAI client version `openai==1.107`).
See the OpenAI [changelog](https://platform.openai.com/docs/changelog) for details of any new functionality added since that date. Links to issues are included so readers can check status, post comments, and/or subscribe for updates on any limitations of specific interest to them. We would also welcome feedback on any use cases that do not work, to help prioritize the pieces left to implement.
Please open new issues in the [meta-llama/llama-stack](https://github.com/meta-llama/llama-stack) GitHub repository with details of anything that does not work and does not already have an open issue.

### Instructions
**Status:** Partial Implementation + Work in Progress

**Issue:** [#3566](https://github.com/llamastack/llama-stack/issues/3566)

In Llama Stack, the `instructions` parameter is already implemented for creating a response, but it is not yet included in the output response object.
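
For reference, a request that sets `instructions` looks like the sketch below. The endpoint, API key, and model name are placeholders; per the issue above, Llama Stack honors `instructions` when creating the response but does not yet echo it back on the returned object.

```python
from openai import OpenAI

# Placeholder endpoint/key/model; adjust for your deployment.
client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")

resp = client.responses.create(
    model="openai/gpt-4o",
    instructions="Answer in one short sentence.",
    input="What does the instructions parameter do?",
)
print(resp.instructions)  # populated by OpenAI; not yet populated by Llama Stack
```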

---

### Streaming

**Status:** Partial Implementation

**Issue:** [#2364](https://github.com/llamastack/llama-stack/issues/2364)

Streaming functionality for the Responses API is partially implemented and does work to some extent, but some streaming response objects needed for full compatibility are still missing.

---

### Prompt Templates

**Status:** Partial Implementation

**Issue:** [#3321](https://github.com/llamastack/llama-stack/issues/3321)

OpenAI's platform supports [templated prompts using a structured language](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts). These templates can be stored server-side for organizational sharing. This feature is under development for Llama Stack.

---

### Web-search tool compatibility

**Status:** Partial Implementation

Both OpenAI and Llama Stack support a web-search built-in tool. The [OpenAI documentation](https://platform.openai.com/docs/api-reference/responses/create) for the web search tool in a Responses tool list says:

> The type of the web search tool. One of `web_search` or `web_search_2025_08_26`.

In contrast, the [Llama Stack documentation](https://llamastack.github.io/docs/api/create-a-new-open-ai-response) says that the allowed values for `type` for web search are `MOD1`, `MOD2` and `MOD3`.
Is that correct? If so, what are the meanings of each of them? It might make sense for the allowed values for OpenAI to map to some values for Llama Stack so that code written to the OpenAI specification
also works with Llama Stack.

The OpenAI web search tool also has fields for `filters` and `user_location` which are not documented as options for Llama Stack. If feasible, it would be good to support these too.
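
For comparison, a request using OpenAI's documented tool type looks roughly like the sketch below (endpoint, key, and model are placeholders); whether Llama Stack accepts the same `type` value is exactly the open question above.

```python
from openai import OpenAI

# Placeholder endpoint/key/model; adjust for your deployment.
client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")

resp = client.responses.create(
    model="openai/gpt-4o",
    input="What changed in the latest Llama Stack release?",
    tools=[{"type": "web_search"}],  # OpenAI's documented value; Llama Stack may expect a different one
)
```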

---

### Other built-in Tools

**Status:** Partial Implementation

OpenAI's Responses API includes an ecosystem of built-in tools (e.g., code interpreter) that lower the barrier to entry for agentic workflows. These tools are typically aligned with specific model training.

**Current Status in Llama Stack:**
- Some built-in tools exist (file search, web search)
- Missing tools include code interpreter, computer use, and image generation
- Some built-in tools may require additional APIs (e.g., [containers API](https://platform.openai.com/docs/api-reference/containers) for code interpreter)

It's unclear whether there is demand for additional built-in tools in Llama Stack. No upstream issues have been filed for adding more built-in tools.

---

### Response Branching

**Status:** Not Working

Response branching, as discussed in the [Agents vs OpenAI Responses API documentation](https://llamastack.github.io/docs/building_applications/responses_vs_agents), is not currently functional.

---

### Include

**Status:** Not Implemented

The `include` parameter allows you to provide a list of values that indicate additional information for the system to include in the model response. The [OpenAI API](https://platform.openai.com/docs/api-reference/responses/create) specifies the following allowed values for this parameter.

- `web_search_call.action.sources`
- `code_interpreter_call.outputs`
- `computer_call_output.output.image_url`
- `file_search_call.results`
- `message.input_image.image_url`
- `message.output_text.logprobs`
- `reasoning.encrypted_content`

Some of these are not relevant to Llama Stack in its current form. For example, code interpreter is not implemented (see "Other built-in Tools" above), so `code_interpreter_call.outputs` would not be a useful directive for Llama Stack.

However, others might be useful. For example, `message.output_text.logprobs` can be useful for assessing how confident a model is in each token of its output, as the sketch below illustrates.
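
A minimal sketch of such a request (placeholder endpoint, key, and model); against OpenAI this returns per-token logprobs, while Llama Stack currently ignores the directive.

```python
from openai import OpenAI

# Placeholder endpoint/key/model; adjust for your deployment.
client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")

resp = client.responses.create(
    model="openai/gpt-4o",
    input="Name three prime numbers.",
    include=["message.output_text.logprobs"],  # request per-token logprobs
)
```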

---

### Tool Choice

**Status:** Not Implemented

**Issue:** [#3548](https://github.com/llamastack/llama-stack/issues/3548)

In OpenAI's API, the `tool_choice` parameter allows you to set restrictions or requirements for which tools should be used when generating a response. This feature is not implemented in Llama Stack.
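
For reference, the sketch below forces a tool call using OpenAI's semantics (placeholder endpoint, key, and model); Llama Stack does not yet act on `tool_choice`.

```python
from openai import OpenAI

# Placeholder endpoint/key/model; adjust for your deployment.
client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")

resp = client.responses.create(
    model="openai/gpt-4o",
    input="What is the capital of France?",
    tools=[{"type": "web_search"}],
    tool_choice="required",  # per OpenAI: the model must call at least one tool
)
```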

---

### Safety Identification and Tracking

**Status:** Not Implemented

OpenAI's platform allows users to track agentic users via a safety identifier passed with each response. When requests violate moderation or safety rules, account holders are alerted and automated actions can be taken. This capability is not currently available in Llama Stack.

---

### Connectors

**Status:** Not Implemented

Connectors are MCP servers maintained and managed by the Responses API provider. OpenAI has documented their connectors at [https://platform.openai.com/docs/guides/tools-connectors-mcp](https://platform.openai.com/docs/guides/tools-connectors-mcp).

**Open Questions:**
- Should Llama Stack include built-in support for some, all, or none of OpenAI's connectors?
- Should there be a mechanism for administrators to add custom connectors via `run.yaml` or an API?

---

### Reasoning

**Status:** Partially Implemented

The `reasoning` object in the output of Responses works for inference providers such as vLLM that output reasoning traces in chat completion requests. It does not work for other providers such as OpenAI's hosted service. See [#3551](https://github.com/llamastack/llama-stack/issues/3551) for more details.

---

### Service Tier

**Status:** Not Implemented

**Issue:** [#3550](https://github.com/llamastack/llama-stack/issues/3550)

Responses has a field `service_tier` that can be used to prioritize access to inference resources. Not all inference providers have such a concept, but Llama Stack should pass this value through for those providers that do. Currently it does not.

---

### Top Logprobs

**Status:** Not Implemented

**Issue:** [#3552](https://github.com/llamastack/llama-stack/issues/3552)

The `top_logprobs` parameter from OpenAI's Responses API extends the functionality obtained by including `message.output_text.logprobs` in the `include` parameter list (as discussed in the Include section above).
It enables users to also get logprobs for alternative tokens.
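
A sketch of the OpenAI-style usage (placeholder endpoint, key, and model); Llama Stack does not yet honor `top_logprobs`.

```python
from openai import OpenAI

# Placeholder endpoint/key/model; adjust for your deployment.
client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")

resp = client.responses.create(
    model="openai/gpt-4o",
    input="Pick a color.",
    include=["message.output_text.logprobs"],
    top_logprobs=3,  # also return the 3 most likely alternatives per token
)
```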
|
||||
|
||||
---

### Max Tool Calls

**Status:** Not Implemented

**Issue:** [#3563](https://github.com/llamastack/llama-stack/issues/3563)

The Responses API can accept a `max_tool_calls` parameter that limits the number of tool calls allowed to be executed for a given response. This feature needs full implementation and documentation.
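
OpenAI-side usage might look like this; the built-in `web_search` tool type is an assumption based on OpenAI's docs:

```python
from openai import OpenAI

client = OpenAI()
response = client.responses.create(
    model="gpt-4.1",
    input="Research recent developments in battery chemistry.",
    tools=[{"type": "web_search"}],  # assumed built-in tool type
    max_tool_calls=2,  # stop issuing tool calls after two executions
)
```
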
---

### Max Output Tokens

**Status:** Not Implemented

**Issue:** [#3562](https://github.com/llamastack/llama-stack/issues/3562)

The `max_output_tokens` field limits how many tokens the model is allowed to generate (for both reasoning and output combined). It is not implemented in Llama Stack.
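
OpenAI-side usage (placeholder model name):

```python
from openai import OpenAI

client = OpenAI()
response = client.responses.create(
    model="gpt-4.1",
    input="Write a short story about a lighthouse keeper.",
    max_output_tokens=200,  # caps reasoning and visible output tokens combined
)
```
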
---

### Incomplete Details

**Status:** Not Implemented

**Issue:** [#3567](https://github.com/llamastack/llama-stack/issues/3567)

The return object from a call to Responses includes a field indicating why a response is incomplete, if it is. For example, if the model stops generating because it has reached the specified max output tokens (see above), this field should be set to `IncompleteDetails(reason='max_output_tokens')`. This is not implemented in Llama Stack.
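
Checking for truncation against OpenAI's API looks roughly like this:

```python
from openai import OpenAI

client = OpenAI()
response = client.responses.create(
    model="gpt-4.1",
    input="Write a long story.",
    max_output_tokens=50,
)
if response.status == "incomplete":
    # e.g. prints "max_output_tokens"
    print(response.incomplete_details.reason)
```
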
---

### Metadata

**Status:** Not Implemented

**Issue:** [#3564](https://github.com/llamastack/llama-stack/issues/3564)

Metadata allows you to attach additional information to a response for your own reference and tracking. It is not implemented in Llama Stack.
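
A sketch of OpenAI-side usage; the keys and values here are arbitrary examples:

```python
from openai import OpenAI

client = OpenAI()
response = client.responses.create(
    model="gpt-4.1",
    input="Hello!",
    metadata={"session_id": "abc-123", "experiment": "prompt-v2"},
)
print(response.metadata)  # echoed back for your own reference and tracking
```
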
---

### Background

**Status:** Not Implemented

**Issue:** [#3568](https://github.com/llamastack/llama-stack/issues/3568)

[Background mode](https://platform.openai.com/docs/guides/background) in OpenAI Responses lets you start a response generation job and then check back on it later. This is useful if you might lose the connection during a generation and want to reconnect later to retrieve the response (for example, when the client runs in a mobile app). It is not implemented in Llama Stack.
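
Against OpenAI's API, a background job is started and then polled by id; a minimal sketch:

```python
import time

from openai import OpenAI

client = OpenAI()
response = client.responses.create(
    model="gpt-4.1",
    input="Write a detailed report on solar panel efficiency.",
    background=True,
)
# The response id survives client disconnects; poll until the job finishes.
while response.status in ("queued", "in_progress"):
    time.sleep(2)
    response = client.responses.retrieve(response.id)
print(response.output_text)
```
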
---

### Global Guardrails

**Status:** Feature Request

When calling the OpenAI Responses API, model outputs go through safety models configured by OpenAI administrators. Perhaps Llama Stack should provide a mechanism to configure safety models (or non-model logic) for all Responses requests, either through `run.yaml` or an administrative API.
---

### User-Controlled Guardrails

**Status:** Feature Request

**Issue:** [#3325](https://github.com/llamastack/llama-stack/issues/3325)

OpenAI has not released a way for users to configure their own guardrails. However, Llama Stack users may want this capability to complement or replace global guardrails. This could be implemented as a non-breaking, additive difference from the OpenAI API.
---

### MCP Elicitations

**Status:** Unknown

Elicitations allow MCP servers to request additional information from users through the client during interactions (e.g., a tool requesting a username before proceeding). See the [MCP specification](https://modelcontextprotocol.io/specification/draft/client/elicitation) for details.

**Open Questions:**
- Does this work in OpenAI's Responses API reference implementation?
- If not, is there a reasonable way to make it work within the API as is, or would the API need to change?
- Does this work in Llama Stack?
---

### MCP Sampling

**Status:** Unknown

Sampling allows MCP tools to query the generative AI model. See the [MCP specification](https://modelcontextprotocol.io/specification/draft/client/sampling) for details.

**Open Questions:**
- Does this work in OpenAI's Responses API reference implementation?
- If not, is there a reasonable way to make it work within the API as is, or would the API need to change?
- Does this work in Llama Stack?

---
### Prompt Caching

**Status:** Unknown

OpenAI provides a [prompt caching](https://platform.openai.com/docs/guides/prompt-caching) mechanism in Responses that is enabled for its most recent models.

**Open Questions:**
- Does this work in Llama Stack?
- If not, is there a reasonable way to make it work for inference providers that have this capability by passing the provided `prompt_cache_key` through to the provider? (See the sketch after this list.)
- Is there a reasonable way to make it work for inference providers that lack this capability by doing some form of caching at the Llama Stack layer?
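
A sketch of the client-side usage against OpenAI's API; the cache key value is an arbitrary example:

```python
from openai import OpenAI

client = OpenAI()
shared_prefix = "You are a support assistant for ExampleCorp. Policies: ..."
response = client.responses.create(
    model="gpt-4.1",
    input=shared_prefix + "\nHow do I reset my password?",
    prompt_cache_key="support-bot-v1",  # groups requests that share a long prefix
)
```
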
---

### Parallel Tool Calls

**Status:** Rumored Issue

There are reports that the `parallel_tool_calls` parameter may not work correctly. This needs verification, and a ticket should be opened if the problem is confirmed.
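
The parameter in question, as it appears in a request (the function tool is a hypothetical example):

```python
from openai import OpenAI

client = OpenAI()
weather_tool = {
    "type": "function",
    "name": "get_weather",  # hypothetical function tool
    "description": "Get the current weather for a city",
    "parameters": {
        "type": "object",
        "properties": {"city": {"type": "string"}},
        "required": ["city"],
    },
}
response = client.responses.create(
    model="gpt-4.1",
    input="Check the weather in Paris and in Tokyo.",
    tools=[weather_tool],
    parallel_tool_calls=False,  # force at most one tool call per model turn
)
```
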
---

## Resolved Issues

The following limitations have been addressed in recent releases:

### MCP and Function Tools with No Arguments

**Status:** ✅ Resolved

MCP and function tools now work correctly even when they take no arguments.
---

### `require_approval` Parameter for MCP Tools

**Status:** ✅ Resolved

The `require_approval` parameter for MCP tools in the Responses API now works correctly.
---

### MCP Tools with Array-Type Arguments

**Status:** ✅ Resolved

**Fixed in:** [#3003](https://github.com/llamastack/llama-stack/pull/3003) (Agent API), [#3602](https://github.com/llamastack/llama-stack/pull/3602) (Responses API)

MCP tools now correctly handle array-type arguments in both the Agent API and the Responses API.
@ -1,7 +1,4 @@
---
description: "Safety

OpenAI-compatible Moderations API."
sidebar_label: Safety
title: Safety
---
@ -10,8 +7,4 @@ title: Safety

## Overview

Safety

OpenAI-compatible Moderations API.

This section contains documentation for all available providers for the **safety** API.
@ -14,8 +14,6 @@ AWS Bedrock safety provider for content moderation using AWS's safety services.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `aws_access_key_id` | `str \| None` | No | | The AWS access key to use. Defaults to the AWS_ACCESS_KEY_ID environment variable. |
| `aws_secret_access_key` | `str \| None` | No | | The AWS secret access key to use. Defaults to the AWS_SECRET_ACCESS_KEY environment variable. |
| `aws_session_token` | `str \| None` | No | | The AWS session token to use. Defaults to the AWS_SESSION_TOKEN environment variable. |
@ -16,12 +16,14 @@ Meta's reference implementation of telemetry and observability using OpenTelemet
|-------|------|----------|---------|-------------|
| `otel_exporter_otlp_endpoint` | `str \| None` | No | | The OpenTelemetry collector endpoint URL (base URL for traces, metrics, and logs). If not set, the SDK will use the OTEL_EXPORTER_OTLP_ENDPOINT environment variable. |
| `service_name` | `<class 'str'>` | No | | The service name to use for telemetry |
| `sinks` | `list[inline.telemetry.meta_reference.config.TelemetrySink]` | No | [] | List of telemetry sinks to enable (possible values: otel_trace, otel_metric, console) |
| `sinks` | `list[inline.telemetry.meta_reference.config.TelemetrySink]` | No | [<TelemetrySink.SQLITE: 'sqlite'>] | List of telemetry sinks to enable (possible values: otel_trace, otel_metric, sqlite, console) |
| `sqlite_db_path` | `<class 'str'>` | No | ~/.llama/runtime/trace_store.db | The path to the SQLite database to use for storing traces |

## Sample Configuration

```yaml
service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
sinks: ${env.TELEMETRY_SINKS:=}
sinks: ${env.TELEMETRY_SINKS:=sqlite}
sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/trace_store.db
otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}
```
@ -79,13 +79,13 @@ See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introducti
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `db_path` | `<class 'str'>` | No | | |
| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend |

## Sample Configuration

```yaml
db_path: ${env.CHROMADB_PATH}
persistence:
  namespace: vector_io::chroma
  backend: kv_default
kvstore:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/chroma_inline_registry.db
```
@ -95,12 +95,12 @@ more details about Faiss in general.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | |

## Sample Configuration

```yaml
persistence:
  namespace: vector_io::faiss
  backend: kv_default
kvstore:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/faiss_store.db
```
@ -14,14 +14,14 @@ Meta's reference implementation of a vector database.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | |

## Sample Configuration

```yaml
persistence:
  namespace: vector_io::faiss
  backend: kv_default
kvstore:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/faiss_store.db
```

## Deprecation Notice
@ -17,14 +17,14 @@ Please refer to the remote provider documentation.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `db_path` | `<class 'str'>` | No | | |
| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend (SQLite only for now) |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) |
| `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server |

## Sample Configuration

```yaml
db_path: ${env.MILVUS_DB_PATH:=~/.llama/dummy}/milvus.db
persistence:
  namespace: vector_io::milvus
  backend: kv_default
kvstore:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/milvus_registry.db
```
@ -98,13 +98,13 @@ See the [Qdrant documentation](https://qdrant.tech/documentation/) for more deta
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `path` | `<class 'str'>` | No | | |
| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | |

## Sample Configuration

```yaml
path: ${env.QDRANT_PATH:=~/.llama/~/.llama/dummy}/qdrant.db
persistence:
  namespace: vector_io::qdrant
  backend: kv_default
kvstore:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/qdrant_registry.db
```
@ -408,13 +408,13 @@ See [sqlite-vec's GitHub repo](https://github.com/asg017/sqlite-vec/tree/main) f
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `db_path` | `<class 'str'>` | No | | Path to the SQLite database file |
| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend (SQLite only for now) |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) |

## Sample Configuration

```yaml
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec.db
persistence:
  namespace: vector_io::sqlite_vec
  backend: kv_default
kvstore:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec_registry.db
```