Merge branch 'llamastack:main' into langchain_llamastack

commit b785ab1579
Author: Omar Abdelwahab, 2025-10-24 15:10:36 -07:00 (committed by GitHub)
2498 changed files with 1150580 additions and 99046 deletions

.dockerignore (new file)

@@ -0,0 +1,19 @@
.venv
__pycache__
*.pyc
*.pyo
*.pyd
*.so
.git
.gitignore
htmlcov*
.coverage
coverage*
.cache
.mypy_cache
.pytest_cache
.ruff_cache
uv.lock
node_modules
build
/tmp

.gitattributes (new file)

@@ -0,0 +1 @@
tests/**/recordings/** linguist-generated=true

.github/CODEOWNERS

@@ -2,4 +2,4 @@
 # These owners will be the default owners for everything in
 # the repo. Unless a later match takes precedence,
-* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning @reluctantfuturist @mattf @slekkala1
+* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning @reluctantfuturist @mattf @slekkala1 @franciscojavierarceo

@@ -2,7 +2,7 @@ blank_issues_enabled: false
 contact_links:
 - name: Have you read the docs?
-url: https://llamastack.github.io/latest/providers/external/index.html
+url: https://llamastack.github.io/providers/external/index.html
 about: Much help can be found in the docs
 - name: Start a discussion
 url: https://github.com/llamastack/llama-stack/discussions/new/

.github/TRIAGERS.md

@@ -1,2 +1 @@
 # This file documents Triage members in the Llama Stack community
-@franciscojavierarceo

@@ -54,6 +54,10 @@ runs:
 SCRIPT_ARGS="$SCRIPT_ARGS --pattern ${{ inputs.pattern }}"
 fi
+echo "=== Running command ==="
+echo "uv run --no-sync ./scripts/integration-tests.sh $SCRIPT_ARGS"
+echo ""
 uv run --no-sync ./scripts/integration-tests.sh $SCRIPT_ARGS | tee pytest-${{ inputs.inference-mode }}.log

@@ -62,11 +66,11 @@ runs:
 shell: bash
 run: |
 echo "Checking for recording changes"
-git status --porcelain tests/integration/recordings/
+git status --porcelain tests/integration/
-if [[ -n $(git status --porcelain tests/integration/recordings/) ]]; then
+if [[ -n $(git status --porcelain tests/integration/) ]]; then
 echo "New recordings detected, committing and pushing"
-git add tests/integration/recordings/
+git add tests/integration/
 git commit -m "Recordings update from CI (suite: ${{ inputs.suite }})"
 git fetch origin ${{ github.ref_name }}

@@ -78,11 +82,13 @@ runs:
 echo "No recording changes"
 fi
-- name: Write inference logs to file
+- name: Write docker logs to file
 if: ${{ always() }}
 shell: bash
 run: |
-sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log || true
+# Ollama logs (if ollama container exists)
+sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log 2>&1 || true
+# Note: distro container logs are now dumped in integration-tests.sh before container is removed
 - name: Upload logs
 if: ${{ always() }}
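The note above refers to a change in `scripts/integration-tests.sh` that is not part of this hunk. A minimal sketch of that pattern, with the container name and log path as assumptions rather than values taken from this diff:

```bash
# Hypothetical sketch: dump a distro container's logs before removing it,
# mirroring what the note says integration-tests.sh now does.
CONTAINER_NAME="llama-stack-distro"   # assumed name, for illustration only
sudo docker logs "$CONTAINER_NAME" > "${CONTAINER_NAME}.log" 2>&1 || true
sudo docker rm -f "$CONTAINER_NAME" || true
```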

@@ -57,7 +57,7 @@ runs:
 echo "Building Llama Stack"
 LLAMA_STACK_DIR=. \
-uv run --no-sync llama stack build --template ci-tests --image-type venv
+uv run --no-sync llama stack list-deps ci-tests | xargs -L1 uv pip install
 - name: Configure git for commits
 shell: bash

@@ -12,7 +12,9 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a tabl
 | Integration Tests (Replay) | [integration-tests.yml](integration-tests.yml) | Run the integration test suites from tests/integration in replay mode |
 | Vector IO Integration Tests | [integration-vector-io-tests.yml](integration-vector-io-tests.yml) | Run the integration test suite with various VectorIO providers |
 | Pre-commit | [pre-commit.yml](pre-commit.yml) | Run pre-commit checks |
+| Pre-commit Bot | [precommit-trigger.yml](precommit-trigger.yml) | Pre-commit bot for PR |
 | Test Llama Stack Build | [providers-build.yml](providers-build.yml) | Test llama stack build |
+| Test llama stack list-deps | [providers-list-deps.yml](providers-list-deps.yml) | Test llama stack list-deps |
 | Python Package Build Test | [python-build-test.yml](python-build-test.yml) | Test building the llama-stack PyPI project |
 | Integration Tests (Record) | [record-integration-tests.yml](record-integration-tests.yml) | Run the integration test suite from tests/integration |
 | Check semantic PR titles | [semantic-pr.yml](semantic-pr.yml) | Ensure that PR titles follow the conventional commit spec |

@@ -21,4 +23,3 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a tabl
 | Test External API and Providers | [test-external.yml](test-external.yml) | Test the External API and Provider mechanisms |
 | UI Tests | [ui-unit-tests.yml](ui-unit-tests.yml) | Run the UI test suite |
 | Unit Tests | [unit-tests.yml](unit-tests.yml) | Run the unit test suite |
-| Update ReadTheDocs | [update-readthedocs.yml](update-readthedocs.yml) | Update the Llama Stack ReadTheDocs site |

@@ -1,6 +1,11 @@
 # API Conformance Tests
 # This workflow ensures that API changes maintain backward compatibility and don't break existing integrations
 # It runs schema validation and OpenAPI diff checks to catch breaking changes early
+#
+# The workflow handles both monolithic and split API specifications:
+# - If split specs exist (stable/experimental/deprecated), they are stitched together for comparison
+# - If only monolithic spec exists, it is used directly
+# This allows for clean API organization while maintaining robust conformance testing
 name: API Conformance Tests

@@ -11,10 +16,13 @@ on:
 branches: [ main ]
 pull_request:
 branches: [ main ]
-types: [opened, synchronize, reopened]
+types: [opened, synchronize, reopened, edited]
 paths:
-- 'docs/_static/llama-stack-spec.yaml'
-- 'docs/_static/llama-stack-spec.html'
+- 'docs/static/llama-stack-spec.yaml' # Legacy monolithic spec
+- 'docs/static/stable-llama-stack-spec.yaml' # Stable APIs spec
+- 'docs/static/experimental-llama-stack-spec.yaml' # Experimental APIs spec
+- 'docs/static/deprecated-llama-stack-spec.yaml' # Deprecated APIs spec
+- 'docs/static/llama-stack-spec.html' # Legacy HTML spec
 - '.github/workflows/conformance.yml' # This workflow itself
 concurrency:

@@ -27,14 +35,31 @@ jobs:
 check-schema-compatibility:
 runs-on: ubuntu-latest
 steps:
-# Using specific version 4.1.7 because 5.0.0 fails when trying to run this locally using `act`
-# This ensures consistent behavior between local testing and CI
 - name: Checkout PR Code
 uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+with:
+fetch-depth: 0
+# Check if we should skip conformance testing due to breaking changes
+- name: Check if conformance test should be skipped
+id: skip-check
+env:
+PR_TITLE: ${{ github.event.pull_request.title }}
+run: |
+# Skip if title contains "!:" indicating breaking change (like "feat!:")
+if [[ "$PR_TITLE" == *"!:"* ]]; then
+echo "skip=true" >> $GITHUB_OUTPUT
+exit 0
+fi
+# Get all commits in this PR and check for BREAKING CHANGE footer
+git log --format="%B" ${{ github.event.pull_request.base.sha }}..${{ github.event.pull_request.head.sha }} | \
+grep -q "BREAKING CHANGE:" && echo "skip=true" >> $GITHUB_OUTPUT || echo "skip=false" >> $GITHUB_OUTPUT
+shell: bash
 # Checkout the base branch to compare against (usually main)
 # This allows us to diff the current changes against the previous state
 - name: Checkout Base Branch
+if: steps.skip-check.outputs.skip != 'true'
 uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
 with:
 ref: ${{ github.event.pull_request.base.ref }}

@@ -42,30 +67,78 @@ jobs:
 # Cache oasdiff to avoid checksum failures and speed up builds
 - name: Cache oasdiff
+if: steps.skip-check.outputs.skip != 'true'
 id: cache-oasdiff
-uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809
+uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830
 with:
 path: ~/oasdiff
 key: oasdiff-${{ runner.os }}
 # Install oasdiff: https://github.com/oasdiff/oasdiff, a tool for detecting breaking changes in OpenAPI specs.
 - name: Install oasdiff
-if: steps.cache-oasdiff.outputs.cache-hit != 'true'
+if: steps.skip-check.outputs.skip != 'true' && steps.cache-oasdiff.outputs.cache-hit != 'true'
 run: |
 curl -fsSL https://raw.githubusercontent.com/oasdiff/oasdiff/main/install.sh | sh
 cp /usr/local/bin/oasdiff ~/oasdiff
 # Setup cached oasdiff
 - name: Setup cached oasdiff
-if: steps.cache-oasdiff.outputs.cache-hit == 'true'
+if: steps.skip-check.outputs.skip != 'true' && steps.cache-oasdiff.outputs.cache-hit == 'true'
 run: |
 sudo cp ~/oasdiff /usr/local/bin/oasdiff
 sudo chmod +x /usr/local/bin/oasdiff
+# Install yq for YAML processing
+- name: Install yq
+run: |
+sudo wget -qO /usr/local/bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64
+sudo chmod +x /usr/local/bin/yq
+# Verify API specs exist for conformance testing
+- name: Check API Specs
+if: steps.skip-check.outputs.skip != 'true'
+run: |
+echo "Checking for API specification files..."
+# Check current branch
+if [ -f "docs/static/stable-llama-stack-spec.yaml" ]; then
+echo "✓ Found stable API spec in current branch"
+CURRENT_SPEC="docs/static/stable-llama-stack-spec.yaml"
+elif [ -f "docs/static/llama-stack-spec.yaml" ]; then
+echo "✓ Found monolithic API spec in current branch"
+CURRENT_SPEC="docs/static/llama-stack-spec.yaml"
+else
+echo "❌ No API specs found in current branch"
+exit 1
+fi
+# Check base branch
+if [ -f "base/docs/static/stable-llama-stack-spec.yaml" ]; then
+echo "✓ Found stable API spec in base branch"
+BASE_SPEC="base/docs/static/stable-llama-stack-spec.yaml"
+elif [ -f "base/docs/static/llama-stack-spec.yaml" ]; then
+echo "✓ Found monolithic API spec in base branch"
+BASE_SPEC="base/docs/static/llama-stack-spec.yaml"
+else
+echo "❌ No API specs found in base branch"
+exit 1
+fi
+# Export for next step
+echo "BASE_SPEC=${BASE_SPEC}" >> $GITHUB_ENV
+echo "CURRENT_SPEC=${CURRENT_SPEC}" >> $GITHUB_ENV
+echo "Will compare: ${BASE_SPEC} -> ${CURRENT_SPEC}"
 # Run oasdiff to detect breaking changes in the API specification
 # This step will fail if incompatible changes are detected, preventing breaking changes from being merged
 - name: Run OpenAPI Breaking Change Diff
+if: steps.skip-check.outputs.skip != 'true'
 run: |
-oasdiff breaking --fail-on ERR base/docs/_static/llama-stack-spec.yaml docs/_static/llama-stack-spec.yaml --match-path '^/v1/openai/v1' \
---match-path '^/v1/vector-io' \
---match-path '^/v1/vector-dbs'
+oasdiff breaking --fail-on ERR $BASE_SPEC $CURRENT_SPEC --match-path '^/v1/'
+# Report when test is skipped
+- name: Report skip reason
+if: steps.skip-check.outputs.skip == 'true'
+run: |
+echo "Conformance test skipped due to breaking change indicator"

@@ -30,8 +30,11 @@ jobs:
 - name: Build a single provider
 run: |
-USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync \
-llama stack build --template starter --image-type container --image-name test
+docker build . \
+-f containers/Containerfile \
+--build-arg INSTALL_MODE=editable \
+--build-arg DISTRO_NAME=starter \
+--tag llama-stack:starter-ci
 - name: Run installer end-to-end
 run: |
@@ -73,6 +73,24 @@ jobs:
 image_name: kube
 apis: []
 providers: {}
+storage:
+backends:
+kv_default:
+type: kv_sqlite
+db_path: $run_dir/kvstore.db
+sql_default:
+type: sql_sqlite
+db_path: $run_dir/sql_store.db
+stores:
+metadata:
+namespace: registry
+backend: kv_default
+inference:
+table_name: inference_store
+backend: sql_default
+conversations:
+table_name: openai_conversations
+backend: sql_default
 server:
 port: 8321
 EOF

@@ -84,13 +102,16 @@ jobs:
 yq eval '.server.auth.provider_config.jwks.token = "${{ env.TOKEN }}"' -i $run_dir/run.yaml
 cat $run_dir/run.yaml
-nohup uv run llama stack run $run_dir/run.yaml --image-type venv > server.log 2>&1 &
+# avoid line breaks in the server log, especially because we grep it below.
+export LLAMA_STACK_LOG_WIDTH=200
+nohup uv run llama stack run $run_dir/run.yaml > server.log 2>&1 &
 - name: Wait for Llama Stack server to be ready
 run: |
 echo "Waiting for Llama Stack server..."
 for i in {1..30}; do
-if curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://localhost:8321/v1/health | grep -q "OK"; then
+# Note: /v1/health does not require authentication
+if curl -s -L http://localhost:8321/v1/health | grep -q "OK"; then
 echo "Llama Stack server is up!"
 if grep -q "Enabling authentication with provider: ${{ matrix.auth-provider }}" server.log; then
 echo "Llama Stack server is configured to use ${{ matrix.auth-provider }} auth"

@@ -109,4 +130,27 @@ jobs:
 - name: Test auth
 run: |
+echo "Testing /v1/version without token (should succeed)..."
+if curl -s -L -o /dev/null -w "%{http_code}" http://127.0.0.1:8321/v1/version | grep -q "200"; then
+echo "/v1/version accessible without token (200)"
+else
+echo "/v1/version returned non-200 status without token"
+exit 1
+fi
+echo "Testing /v1/providers without token (should fail with 401)..."
+if curl -s -L -o /dev/null -w "%{http_code}" http://127.0.0.1:8321/v1/providers | grep -q "401"; then
+echo "/v1/providers blocked without token (401)"
+else
+echo "/v1/providers did not return 401 without token"
+exit 1
+fi
+echo "Testing /v1/providers with valid token (should succeed)..."
 curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers | jq
+if [ $? -eq 0 ]; then
+echo "/v1/providers accessible with valid token"
+else
+echo "/v1/providers failed with valid token"
+exit 1
+fi

@@ -42,18 +42,27 @@ jobs:
 run-replay-mode-tests:
 runs-on: ubuntu-latest
-name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, {4})', matrix.client-type, matrix.setup, matrix.python-version, matrix.client-version, matrix.suite) }}
+name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, {4})', matrix.client-type, matrix.config.setup, matrix.python-version, matrix.client-version, matrix.config.suite) }}
 strategy:
 fail-fast: false
 matrix:
-client-type: [library, server]
+client-type: [library, docker]
-# Use vllm on weekly schedule, otherwise use test-setup input (defaults to ollama)
-setup: ${{ (github.event.schedule == '1 0 * * 0') && fromJSON('["vllm"]') || fromJSON(format('["{0}"]', github.event.inputs.test-setup || 'ollama')) }}
 # Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
 python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
 client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
-suite: [base, vision]
+# Define (setup, suite) pairs - they are always matched and cannot be independent
+# Weekly schedule (Sun 1 AM): vllm+base
+# Input test-setup=ollama-vision: ollama-vision+vision
+# Default (including test-setup=ollama): ollama+base, ollama-vision+vision, gpt+responses
+config: >-
+${{
+github.event.schedule == '1 0 * * 0'
+&& fromJSON('[{"setup": "vllm", "suite": "base"}]')
+|| github.event.inputs.test-setup == 'ollama-vision'
+&& fromJSON('[{"setup": "ollama-vision", "suite": "vision"}]')
+|| fromJSON('[{"setup": "ollama", "suite": "base"}, {"setup": "ollama-vision", "suite": "vision"}, {"setup": "gpt", "suite": "responses"}]')
+}}
 steps:
 - name: Checkout repository

@@ -64,14 +73,16 @@ jobs:
 with:
 python-version: ${{ matrix.python-version }}
 client-version: ${{ matrix.client-version }}
-setup: ${{ matrix.setup }}
+setup: ${{ matrix.config.setup }}
-suite: ${{ matrix.suite }}
+suite: ${{ matrix.config.suite }}
 inference-mode: 'replay'
 - name: Run tests
 uses: ./.github/actions/run-and-record-tests
+env:
+OPENAI_API_KEY: dummy
 with:
-stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }}
+stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || matrix.client-type == 'server' && 'server:ci-tests' || 'docker:ci-tests' }}
-setup: ${{ matrix.setup }}
+setup: ${{ matrix.config.setup }}
 inference-mode: 'replay'
-suite: ${{ matrix.suite }}
+suite: ${{ matrix.config.suite }}

@@ -144,7 +144,7 @@ jobs:
 - name: Build Llama Stack
 run: |
-uv run --no-sync llama stack build --template ci-tests --image-type venv
+uv run --no-sync llama stack list-deps ci-tests | xargs -L1 uv pip install
 - name: Check Storage and Memory Available Before Tests
 if: ${{ always() }}

@@ -169,8 +169,7 @@ jobs:
 run: |
 uv run --no-sync \
 pytest -sv --stack-config="files=inline::localfs,inference=inline::sentence-transformers,vector_io=${{ matrix.vector-io-provider }}" \
-tests/integration/vector_io \
---embedding-model inline::sentence-transformers/all-MiniLM-L6-v2
+tests/integration/vector_io
 - name: Check Storage and Memory Available After Tests
 if: ${{ always() }}

@@ -37,7 +37,7 @@ jobs:
 .pre-commit-config.yaml
 - name: Set up Node.js
-uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0
+uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
 with:
 node-version: '20'
 cache: 'npm'

.github/workflows/precommit-trigger.yml (new file)

@@ -0,0 +1,227 @@
name: Pre-commit Bot
run-name: Pre-commit bot for PR #${{ github.event.issue.number }}
on:
issue_comment:
types: [created]
jobs:
pre-commit:
# Only run on pull request comments
if: github.event.issue.pull_request && contains(github.event.comment.body, '@github-actions run precommit')
runs-on: ubuntu-latest
permissions:
contents: write
pull-requests: write
steps:
- name: Check comment author and get PR details
id: check_author
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
// Get PR details
const pr = await github.rest.pulls.get({
owner: context.repo.owner,
repo: context.repo.repo,
pull_number: context.issue.number
});
// Check if commenter has write access or is the PR author
const commenter = context.payload.comment.user.login;
const prAuthor = pr.data.user.login;
let hasPermission = false;
// Check if commenter is PR author
if (commenter === prAuthor) {
hasPermission = true;
console.log(`Comment author ${commenter} is the PR author`);
} else {
// Check if commenter has write/admin access
try {
const permission = await github.rest.repos.getCollaboratorPermissionLevel({
owner: context.repo.owner,
repo: context.repo.repo,
username: commenter
});
const level = permission.data.permission;
hasPermission = ['write', 'admin', 'maintain'].includes(level);
console.log(`Comment author ${commenter} has permission: ${level}`);
} catch (error) {
console.log(`Could not check permissions for ${commenter}: ${error.message}`);
}
}
if (!hasPermission) {
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: `❌ @${commenter} You don't have permission to trigger pre-commit. Only PR authors or repository collaborators can run this command.`
});
core.setFailed(`User ${commenter} does not have permission`);
return;
}
// Save PR info for later steps
core.setOutput('pr_number', context.issue.number);
core.setOutput('pr_head_ref', pr.data.head.ref);
core.setOutput('pr_head_sha', pr.data.head.sha);
core.setOutput('pr_head_repo', pr.data.head.repo.full_name);
core.setOutput('pr_base_ref', pr.data.base.ref);
core.setOutput('is_fork', pr.data.head.repo.full_name !== context.payload.repository.full_name);
core.setOutput('authorized', 'true');
- name: React to comment
if: steps.check_author.outputs.authorized == 'true'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.reactions.createForIssueComment({
owner: context.repo.owner,
repo: context.repo.repo,
comment_id: context.payload.comment.id,
content: 'rocket'
});
- name: Comment starting
if: steps.check_author.outputs.authorized == 'true'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: ${{ steps.check_author.outputs.pr_number }},
body: `⏳ Running [pre-commit hooks](https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}) on PR #${{ steps.check_author.outputs.pr_number }}...`
});
- name: Checkout PR branch (same-repo)
if: steps.check_author.outputs.authorized == 'true' && steps.check_author.outputs.is_fork == 'false'
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
ref: ${{ steps.check_author.outputs.pr_head_ref }}
fetch-depth: 0
token: ${{ secrets.GITHUB_TOKEN }}
- name: Checkout PR branch (fork)
if: steps.check_author.outputs.authorized == 'true' && steps.check_author.outputs.is_fork == 'true'
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
repository: ${{ steps.check_author.outputs.pr_head_repo }}
ref: ${{ steps.check_author.outputs.pr_head_ref }}
fetch-depth: 0
token: ${{ secrets.GITHUB_TOKEN }}
- name: Verify checkout
if: steps.check_author.outputs.authorized == 'true'
run: |
echo "Current SHA: $(git rev-parse HEAD)"
echo "Expected SHA: ${{ steps.check_author.outputs.pr_head_sha }}"
if [[ "$(git rev-parse HEAD)" != "${{ steps.check_author.outputs.pr_head_sha }}" ]]; then
echo "::error::Checked out SHA does not match expected SHA"
exit 1
fi
- name: Set up Python
if: steps.check_author.outputs.authorized == 'true'
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
python-version: '3.12'
cache: pip
cache-dependency-path: |
**/requirements*.txt
.pre-commit-config.yaml
- name: Set up Node.js
if: steps.check_author.outputs.authorized == 'true'
uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
with:
node-version: '20'
cache: 'npm'
cache-dependency-path: 'llama_stack/ui/'
- name: Install npm dependencies
if: steps.check_author.outputs.authorized == 'true'
run: npm ci
working-directory: llama_stack/ui
- name: Run pre-commit
if: steps.check_author.outputs.authorized == 'true'
id: precommit
uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
continue-on-error: true
env:
SKIP: no-commit-to-branch
RUFF_OUTPUT_FORMAT: github
- name: Check for changes
if: steps.check_author.outputs.authorized == 'true'
id: changes
run: |
if ! git diff --exit-code || [ -n "$(git ls-files --others --exclude-standard)" ]; then
echo "has_changes=true" >> $GITHUB_OUTPUT
echo "Changes detected after pre-commit"
else
echo "has_changes=false" >> $GITHUB_OUTPUT
echo "No changes after pre-commit"
fi
- name: Commit and push changes
if: steps.check_author.outputs.authorized == 'true' && steps.changes.outputs.has_changes == 'true'
run: |
git config --local user.email "github-actions[bot]@users.noreply.github.com"
git config --local user.name "github-actions[bot]"
git add -A
git commit -m "style: apply pre-commit fixes
🤖 Applied by @github-actions bot via pre-commit workflow"
# Push changes
git push origin HEAD:${{ steps.check_author.outputs.pr_head_ref }}
- name: Comment success with changes
if: steps.check_author.outputs.authorized == 'true' && steps.changes.outputs.has_changes == 'true'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: ${{ steps.check_author.outputs.pr_number }},
body: `✅ Pre-commit hooks completed successfully!\n\n🔧 Changes have been committed and pushed to the PR branch.`
});
- name: Comment success without changes
if: steps.check_author.outputs.authorized == 'true' && steps.changes.outputs.has_changes == 'false' && steps.precommit.outcome == 'success'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: ${{ steps.check_author.outputs.pr_number }},
body: `✅ Pre-commit hooks passed!\n\n✨ No changes needed - your code is already formatted correctly.`
});
- name: Comment failure
if: failure()
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: ${{ steps.check_author.outputs.pr_number }},
body: `❌ Pre-commit workflow failed!\n\nPlease check the [workflow logs](https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}) for details.`
});

@@ -14,6 +14,8 @@ on:
 - '.github/workflows/providers-build.yml'
 - 'llama_stack/distributions/**'
 - 'pyproject.toml'
+- 'containers/Containerfile'
+- '.dockerignore'
 pull_request:
 paths:

@@ -24,6 +26,8 @@ on:
 - '.github/workflows/providers-build.yml'
 - 'llama_stack/distributions/**'
 - 'pyproject.toml'
+- 'containers/Containerfile'
+- '.dockerignore'
 concurrency:
 group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}

@@ -60,15 +64,19 @@ jobs:
 - name: Install dependencies
 uses: ./.github/actions/setup-runner
-- name: Print build dependencies
+- name: Install distribution into venv
+if: matrix.image-type == 'venv'
 run: |
-uv run llama stack build --distro ${{ matrix.distro }} --image-type ${{ matrix.image-type }} --image-name test --print-deps-only
+uv run llama stack list-deps ${{ matrix.distro }} | xargs -L1 uv pip install
-- name: Run Llama Stack Build
+- name: Build container image
+if: matrix.image-type == 'container'
 run: |
-# USE_COPY_NOT_MOUNT is set to true since mounting is not supported by docker buildx, we use COPY instead
-# LLAMA_STACK_DIR is set to the current directory so we are building from the source
-USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --distro ${{ matrix.distro }} --image-type ${{ matrix.image-type }} --image-name test
+docker build . \
+-f containers/Containerfile \
+--build-arg INSTALL_MODE=editable \
+--build-arg DISTRO_NAME=${{ matrix.distro }} \
+--tag llama-stack:${{ matrix.distro }}-ci
 - name: Print dependencies in the image
 if: matrix.image-type == 'venv'

@@ -86,8 +94,8 @@ jobs:
 - name: Build a single provider
 run: |
-USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --image-type venv --image-name test --providers inference=remote::ollama
+uv pip install -e .
+uv run --no-sync llama stack list-deps --providers inference=remote::ollama | xargs -L1 uv pip install
 build-custom-container-distribution:
 runs-on: ubuntu-latest
 steps:

@@ -97,11 +105,16 @@ jobs:
 - name: Install dependencies
 uses: ./.github/actions/setup-runner
-- name: Build a single provider
+- name: Build container image
 run: |
-yq -i '.image_type = "container"' llama_stack/distributions/ci-tests/build.yaml
-yq -i '.image_name = "test"' llama_stack/distributions/ci-tests/build.yaml
-USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config llama_stack/distributions/ci-tests/build.yaml
+BASE_IMAGE=$(yq -r '.distribution_spec.container_image // "python:3.12-slim"' llama_stack/distributions/ci-tests/build.yaml)
+docker build . \
+-f containers/Containerfile \
+--build-arg INSTALL_MODE=editable \
+--build-arg DISTRO_NAME=ci-tests \
+--build-arg BASE_IMAGE="$BASE_IMAGE" \
+--build-arg RUN_CONFIG_PATH=/workspace/llama_stack/distributions/ci-tests/run.yaml \
+-t llama-stack:ci-tests
 - name: Inspect the container image entrypoint
 run: |

@@ -112,7 +125,7 @@ jobs:
 fi
 entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
 echo "Entrypoint: $entrypoint"
-if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then
+if [ "$entrypoint" != "[/usr/local/bin/llama-stack-entrypoint.sh]" ]; then
 echo "Entrypoint is not correct"
 exit 1
 fi

@@ -129,17 +142,19 @@ jobs:
 - name: Pin distribution to UBI9 base
 run: |
 yq -i '
-.image_type = "container" |
-.image_name = "ubi9-test" |
 .distribution_spec.container_image = "registry.access.redhat.com/ubi9:latest"
 ' llama_stack/distributions/ci-tests/build.yaml
-- name: Build dev container (UBI9)
+- name: Build UBI9 container image
-env:
-USE_COPY_NOT_MOUNT: "true"
-LLAMA_STACK_DIR: "."
 run: |
-uv run llama stack build --config llama_stack/distributions/ci-tests/build.yaml
+BASE_IMAGE=$(yq -r '.distribution_spec.container_image // "registry.access.redhat.com/ubi9:latest"' llama_stack/distributions/ci-tests/build.yaml)
+docker build . \
+-f containers/Containerfile \
+--build-arg INSTALL_MODE=editable \
+--build-arg DISTRO_NAME=ci-tests \
+--build-arg BASE_IMAGE="$BASE_IMAGE" \
+--build-arg RUN_CONFIG_PATH=/workspace/llama_stack/distributions/ci-tests/run.yaml \
+-t llama-stack:ci-tests-ubi9
 - name: Inspect UBI9 image
 run: |

@@ -150,7 +165,7 @@ jobs:
 fi
 entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
 echo "Entrypoint: $entrypoint"
-if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then
+if [ "$entrypoint" != "[/usr/local/bin/llama-stack-entrypoint.sh]" ]; then
 echo "Entrypoint is not correct"
 exit 1
 fi

.github/workflows/providers-list-deps.yml (new file)

@@ -0,0 +1,105 @@
name: Test llama stack list-deps
run-name: Test llama stack list-deps
on:
push:
branches:
- main
paths:
- 'llama_stack/cli/stack/list_deps.py'
- 'llama_stack/cli/stack/_list_deps.py'
- 'llama_stack/core/build.*'
- 'llama_stack/core/*.sh'
- '.github/workflows/providers-list-deps.yml'
- 'llama_stack/templates/**'
- 'pyproject.toml'
pull_request:
paths:
- 'llama_stack/cli/stack/list_deps.py'
- 'llama_stack/cli/stack/_list_deps.py'
- 'llama_stack/core/build.*'
- 'llama_stack/core/*.sh'
- '.github/workflows/providers-list-deps.yml'
- 'llama_stack/templates/**'
- 'pyproject.toml'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
generate-matrix:
runs-on: ubuntu-latest
outputs:
distros: ${{ steps.set-matrix.outputs.distros }}
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Generate Distribution List
id: set-matrix
run: |
distros=$(ls llama_stack/distributions/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
echo "distros=$distros" >> "$GITHUB_OUTPUT"
list-deps:
needs: generate-matrix
runs-on: ubuntu-latest
strategy:
matrix:
distro: ${{ fromJson(needs.generate-matrix.outputs.distros) }}
image-type: [venv, container]
fail-fast: false # We want to run all jobs even if some fail
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Print dependencies
run: |
uv run llama stack list-deps ${{ matrix.distro }}
- name: Install Distro using llama stack list-deps
run: |
# USE_COPY_NOT_MOUNT is set to true since mounting is not supported by docker buildx, we use COPY instead
# LLAMA_STACK_DIR is set to the current directory so we are building from the source
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack list-deps ${{ matrix.distro }} | xargs -L1 uv pip install
- name: Print dependencies in the image
if: matrix.image-type == 'venv'
run: |
uv pip list
show-single-provider:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Show a single provider
run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack list-deps --providers inference=remote::ollama
list-deps-from-config:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: list-deps from Config
env:
USE_COPY_NOT_MOUNT: "true"
LLAMA_STACK_DIR: "."
run: |
uv run llama stack list-deps llama_stack/distributions/ci-tests/build.yaml

@@ -24,7 +24,7 @@ jobs:
 uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
 - name: Install uv
-uses: astral-sh/setup-uv@b75a909f75acd358c2196fb9a5f1299a9a8868a4 # v6.7.0
+uses: astral-sh/setup-uv@3259c6206f993105e3a61b142c2d97bf4b9ef83d # v7.1.0
 with:
 python-version: ${{ matrix.python-version }}
 activate-environment: true

@@ -43,7 +43,5 @@ jobs:
 uv pip list
 uv pip show llama-stack
 command -v llama
-llama model prompt-format -m Llama3.2-90B-Vision-Instruct
-llama model list
 llama stack list-apis
 llama stack list-providers inference

@@ -61,6 +61,9 @@ jobs:
 - name: Run and record tests
 uses: ./.github/actions/run-and-record-tests
+env:
+# Set OPENAI_API_KEY if using gpt setup
+OPENAI_API_KEY: ${{ inputs.test-setup == 'gpt' && secrets.OPENAI_API_KEY || '' }}
 with:
 stack-config: 'server:ci-tests' # recording must be done with server since more tests are run
 setup: ${{ inputs.test-setup || 'ollama' }}

@@ -24,7 +24,7 @@ jobs:
 runs-on: ubuntu-latest
 steps:
 - name: Stale Action
-uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
+uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
 with:
 stale-issue-label: 'stale'
 stale-issue-message: >

@@ -46,9 +46,9 @@ jobs:
 yq -i '.image_type = "${{ matrix.image-type }}"' tests/external/ramalama-stack/run.yaml
 cat tests/external/ramalama-stack/run.yaml
-- name: Build distro from config file
+- name: Install distribution dependencies
 run: |
-USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external/ramalama-stack/build.yaml
+uv run llama stack list-deps tests/external/ramalama-stack/build.yaml | xargs -L1 uv pip install
 - name: Start Llama Stack server in background
 if: ${{ matrix.image-type }} == 'venv'

@@ -59,7 +59,7 @@ jobs:
 # Use the virtual environment created by the build step (name comes from build config)
 source ramalama-stack-test/bin/activate
 uv pip list
-nohup llama stack run tests/external/ramalama-stack/run.yaml --image-type ${{ matrix.image-type }} > server.log 2>&1 &
+nohup llama stack run tests/external/ramalama-stack/run.yaml > server.log 2>&1 &
 - name: Wait for Llama Stack server to be ready
 run: |

@@ -44,11 +44,14 @@ jobs:
 - name: Print distro dependencies
 run: |
-USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml --print-deps-only
+uv run --no-sync llama stack list-deps tests/external/build.yaml
 - name: Build distro from config file
 run: |
-USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml
+uv venv ci-test
+source ci-test/bin/activate
+uv pip install -e .
+LLAMA_STACK_LOGGING=all=CRITICAL llama stack list-deps tests/external/build.yaml | xargs -L1 uv pip install
 - name: Start Llama Stack server in background
 if: ${{ matrix.image-type }} == 'venv'

@@ -59,7 +62,7 @@ jobs:
 # Use the virtual environment created by the build step (name comes from build config)
 source ci-test/bin/activate
 uv pip list
-nohup llama stack run tests/external/run-byoa.yaml --image-type ${{ matrix.image-type }} > server.log 2>&1 &
+nohup llama stack run tests/external/run-byoa.yaml > server.log 2>&1 &
 - name: Wait for Llama Stack server to be ready
 run: |

@@ -29,7 +29,7 @@ jobs:
 uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
 - name: Setup Node.js
-uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0
+uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
 with:
 node-version: ${{ matrix.node-version }}
 cache: 'npm'

.github/workflows/update-readthedocs.yml (deleted file)

@@ -1,70 +0,0 @@
name: Update ReadTheDocs
run-name: Update the Llama Stack ReadTheDocs site
on:
workflow_dispatch:
inputs:
branch:
description: 'RTD version to update'
required: false
default: 'latest'
push:
branches:
- main
paths:
- 'docs/**'
- 'pyproject.toml'
- '.github/workflows/update-readthedocs.yml'
tags:
- '*'
pull_request:
branches:
- main
paths:
- 'docs/**'
- 'pyproject.toml'
- '.github/workflows/update-readthedocs.yml'
concurrency:
group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
cancel-in-progress: true
jobs:
update-readthedocs:
runs-on: ubuntu-latest
env:
TOKEN: ${{ secrets.READTHEDOCS_TOKEN }}
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Build HTML
run: |
cd docs
uv run make html
- name: Trigger ReadTheDocs build
if: github.event_name != 'pull_request'
run: |
if [ -z "$TOKEN" ]; then
echo "READTHEDOCS_TOKEN is not set"
exit 1
fi
response=$(curl -X POST \
-H "Content-Type: application/json" \
-d "{
\"token\": \"$TOKEN\",
\"version\": \"$GITHUB_REF_NAME\"
}" \
https://readthedocs.org/api/v2/webhook/llama-stack/289768/)
echo "Response: $response"
if [ $(echo $response | jq -r '.build_triggered') != 'true' ]; then
echo "Failed to trigger ReadTheDocs build"
exit 1
fi

.gitignore

@@ -18,7 +18,6 @@ Package.resolved
 .venv/
 .vscode
 _build
-docs/src
 # Sample tool-calling datasets generated by NVIDIA notebooks
 docs/notebooks/nvidia/tool_calling/sample_data/
 pyrightconfig.json

@@ -30,3 +29,6 @@ AGENTS.md
 server.log
 CLAUDE.md
 .claude/
+docs/.docusaurus/
+docs/node_modules/
+docs/static/imported-files/

.readthedocs.yaml (deleted file)

@@ -1,25 +0,0 @@
# .readthedocs.yaml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
# Required
version: 2
# Build documentation in the "docs/" directory with Sphinx
sphinx:
configuration: docs/source/conf.py
# Set the OS, Python version and other tools you might need
build:
os: ubuntu-22.04
tools:
python: "3.12"
jobs:
pre_create_environment:
- asdf plugin add uv
- asdf install uv latest
- asdf global uv latest
create_environment:
- uv venv "${READTHEDOCS_VIRTUALENV_PATH}"
install:
- UV_PROJECT_ENVIRONMENT="${READTHEDOCS_VIRTUALENV_PATH}" uv sync --frozen --group docs

@@ -11,14 +11,17 @@ You can install the dependencies by running:
 ```bash
 cd llama-stack
+uv venv --python 3.12
 uv sync --group dev
 uv pip install -e .
 source .venv/bin/activate
 ```
 ```{note}
-You can use a specific version of Python with `uv` by adding the `--python <version>` flag (e.g. `--python 3.12`).
-Otherwise, `uv` will automatically select a Python version according to the `requires-python` section of the `pyproject.toml`.
+If you are making changes to Llama Stack, it is essential that you use Python 3.12 as shown above.
+Llama Stack can work with Python 3.13 but the pre-commit hooks used to validate code changes only work with Python 3.12.
+If you don't specify a Python version, `uv` will automatically select a Python version according to the `requires-python`
+section of the `pyproject.toml`, which is fine for running Llama Stack but not for committing changes.
 For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/).
 ```

@@ -42,17 +45,22 @@ uv run --env-file .env -- pytest -v tests/integration/inference/test_text_infere
 We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running:
 ```bash
+uv pip install pre-commit==4.3.0
 uv run pre-commit install
 ```
-After that, pre-commit hooks will run automatically before each commit.
+Note that the only version of pre-commit that works with the Llama Stack continuous integration is `4.3.0` so it is essential that you pull
+that specific version as shown above. Once you have run these commands, pre-commit hooks will run automatically before each commit.
-Alternatively, if you don't want to install the pre-commit hooks, you can run the checks manually by running:
+Alternatively, if you don't want to install the pre-commit hooks (or if you want to check if your changes are ready before committing),
+you can run the checks manually by running:
 ```bash
-uv run pre-commit run --all-files
+uv run pre-commit run --all-files -v
 ```
+The `-v` (verbose) parameter is optional but often helpful for getting more information about any issues that the pre-commit checks identify.
 ```{caution}
 Before pushing your changes, make sure that the pre-commit hooks have passed successfully.
 ```

@@ -61,7 +69,7 @@ Before pushing your changes, make sure that the pre-commit hooks have passed suc
 We actively welcome your pull requests. However, please read the following. This is heavily inspired by [Ghostty](https://github.com/ghostty-org/ghostty/blob/main/CONTRIBUTING.md).
-If in doubt, please open a [discussion](https://github.com/meta-llama/llama-stack/discussions); we can always convert that to an issue later.
+If in doubt, please open a [discussion](https://github.com/llamastack/llama-stack/discussions); we can always convert that to an issue later.
 ### Issues
 We use GitHub issues to track public bugs. Please ensure your description is

@@ -83,6 +91,7 @@ If you are new to the project, start by looking at the issues tagged with "good
 leave a comment on the issue and a triager will assign it to you.
 Please avoid picking up too many issues at once. This helps you stay focused and ensures that others in the community also have opportunities to contribute.
+- Try to work on only 1-2 issues at a time, especially if you're still getting familiar with the codebase.
 - Before taking an issue, check if it's already assigned or being actively discussed.
 - If you're blocked or can't continue with an issue, feel free to unassign yourself or leave a comment so others can step in.

@@ -158,17 +167,22 @@ under the LICENSE file in the root directory of this source tree.
 Some tips about common tasks you work on while contributing to Llama Stack:
-### Using `llama stack build`
+### Installing dependencies of distributions
-Building a stack image will use the production version of the `llama-stack` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_STACK_CLIENT_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands.
+When installing dependencies for a distribution, you can use `llama stack list-deps` to view and install the required packages.
 Example:
 ```bash
 cd work/
-git clone https://github.com/meta-llama/llama-stack.git
-git clone https://github.com/meta-llama/llama-stack-client-python.git
+git clone https://github.com/llamastack/llama-stack.git
+git clone https://github.com/llamastack/llama-stack-client-python.git
 cd llama-stack
-LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama stack build --distro <...>
+# Show dependencies for a distribution
+llama stack list-deps <distro-name>
+# Install dependencies
+llama stack list-deps <distro-name> | xargs -L1 uv pip install
 ```
 ### Updating distribution configurations

@@ -187,14 +201,17 @@ Note that the provider "description" field will be used to generate the provider
 ### Building the Documentation
-If you are making changes to the documentation at [https://llamastack.github.io/latest/](https://llamastack.github.io/latest/), you can use the following command to build the documentation and preview your changes. You will need [Sphinx](https://www.sphinx-doc.org/en/master/) and the readthedocs theme.
+If you are making changes to the documentation at [https://llamastack.github.io/](https://llamastack.github.io/), you can use the following command to build the documentation and preview your changes.
 ```bash
-# This rebuilds the documentation pages.
-uv run --group docs make -C docs/ html
+# This rebuilds the documentation pages and the OpenAPI spec.
+cd docs/
+npm install
+npm run gen-api-docs all
+npm run build
-# This will start a local server (usually at http://127.0.0.1:8000) that automatically rebuilds and refreshes when you make changes to the documentation.
-uv run --group docs sphinx-autobuild docs/source docs/build/html --write-all
+# This will start a local server (usually at http://127.0.0.1:3000).
+npm run serve
 ```
 ### Update API Documentation

@@ -205,4 +222,4 @@ If you modify or add new API endpoints, update the API documentation accordingly
 uv run ./docs/openapi_generator/run_openapi_generator.sh
 ```
-The generated API documentation will be available in `docs/_static/`. Make sure to review the changes before committing.
+The generated API schema will be available in `docs/static/`. Make sure to review the changes before committing.

@@ -4,6 +4,8 @@ include llama_stack/models/llama/llama4/tokenizer.model
 include llama_stack/core/*.sh
 include llama_stack/cli/scripts/*.sh
 include llama_stack/distributions/*/*.yaml
-include llama_stack/providers/tests/test_cases/inference/*.json
+exclude llama_stack/distributions/ci-tests
+include tests/integration/test_cases/inference/*.json
 include llama_stack/models/llama/*/*.md
 include llama_stack/tests/integration/*.jpg
+prune llama_stack/distributions/ci-tests

View file

@ -7,7 +7,7 @@
[![Unit Tests](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml?query=branch%3Amain) [![Unit Tests](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml?query=branch%3Amain)
[![Integration Tests](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml?query=branch%3Amain) [![Integration Tests](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml?query=branch%3Amain)
[**Quick Start**](https://llamastack.github.io/latest/getting_started/index.html) | [**Documentation**](https://llamastack.github.io/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack) [**Quick Start**](https://llamastack.github.io/docs/getting_started/quickstart) | [**Documentation**](https://llamastack.github.io/docs) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)
### ✨🎉 Llama 4 Support 🎉✨ ### ✨🎉 Llama 4 Support 🎉✨
@ -25,10 +25,13 @@ pip install -U llama_stack
MODEL="Llama-4-Scout-17B-16E-Instruct" MODEL="Llama-4-Scout-17B-16E-Instruct"
# get meta url from llama.com # get meta url from llama.com
llama model download --source meta --model-id $MODEL --meta-url <META_URL> huggingface-cli download meta-llama/$MODEL --local-dir ~/.llama/$MODEL
# install dependencies for the distribution
llama stack list-deps meta-reference-gpu | xargs -L1 uv pip install
# start a llama stack server
INFERENCE_MODEL=meta-llama/$MODEL llama stack run meta-reference-gpu
# install client to interact with the server
pip install llama-stack-client
@ -43,10 +46,21 @@ inference chat-completion \
--model-id meta-llama/$MODEL \
--message "write a haiku for meta's llama 4 models"
OpenAIChatCompletion(
    ...
    choices=[
        OpenAIChatCompletionChoice(
            finish_reason='stop',
            index=0,
            message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(
                role='assistant',
                content='...**Silent minds awaken,** \n**Whispers of billions of words,** \n**Reasoning breaks the night.** \n\n— \n*This haiku blends the essence of LLaMA 4\'s capabilities with nature-inspired metaphor, evoking its vast training data and transformative potential.*',
                ...
            ),
            ...
        )
    ],
    ...
)
```
### Python SDK
@ -59,14 +73,14 @@ model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
prompt = "Write a haiku about coding"
print(f"User> {prompt}")
response = client.chat.completions.create(
    model=model_id,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ],
)
print(f"Assistant> {response.choices[0].message.content}")
```
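Continuing the snippet above, the same call can also stream tokens as they are generated. This is a minimal sketch, assuming the client mirrors the OpenAI-style `stream=True` flag and delta-style chunks; it reuses `client`, `model_id`, and `prompt` from the example above:
```python
# Hypothetical streaming variant of the call above (OpenAI-compatible interface assumed).
stream = client.chat.completions.create(
    model=model_id,
    messages=[{"role": "user", "content": prompt}],
    stream=True,
)
for chunk in stream:
    # Each chunk carries an incremental delta; print tokens as they arrive.
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
print()
```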
As more providers start supporting Llama 4, you can use them in Llama Stack as well. We are adding to the list. Stay tuned!
@ -78,14 +92,14 @@ As more providers start supporting Llama 4, you can use them in Llama Stack as w
To try Llama Stack locally, run:
```bash
curl -LsSf https://github.com/llamastack/llama-stack/raw/main/scripts/install.sh | bash
```
### Overview
Llama Stack standardizes the core building blocks that simplify AI application development. It codifies best practices across the Llama ecosystem. More specifically, it provides
- **Unified API layer** for Inference, RAG, Agents, Tools, Safety, Evals.
- **Plugin architecture** to support the rich ecosystem of different API implementations in various environments, including local development, on-premises, cloud, and mobile.
- **Prepackaged verified distributions** which offer a one-stop solution for developers to get started quickly and reliably in any environment.
- **Multiple developer interfaces** like CLI and SDKs for Python, Typescript, iOS, and Android.
@ -109,38 +123,38 @@ By reducing friction and complexity, Llama Stack empowers developers to focus on
### API Providers
Here is a list of the various API providers and available distributions that can help developers get started easily with Llama Stack.
Please check out the [full list](https://llamastack.github.io/docs/providers).
| API Provider Builder | Environments | Agents | Inference | VectorIO | Safety | Post Training | Eval | DatasetIO |
|:--------------------:|:------------:|:------:|:---------:|:--------:|:------:|:-------------:|:----:|:--------:|
| Meta Reference | Single Node | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| SambaNova | Hosted | | ✅ | | ✅ | | | |
| Cerebras | Hosted | | ✅ | | | | | |
| Fireworks | Hosted | ✅ | ✅ | ✅ | | | | |
| AWS Bedrock | Hosted | | ✅ | | ✅ | | | |
| Together | Hosted | ✅ | ✅ | | ✅ | | | |
| Groq | Hosted | | ✅ | | | | | |
| Ollama | Single Node | | ✅ | | | | | |
| TGI | Hosted/Single Node | | ✅ | | | | | |
| NVIDIA NIM | Hosted/Single Node | | ✅ | | ✅ | | | |
| ChromaDB | Hosted/Single Node | | | ✅ | | | | |
| Milvus | Hosted/Single Node | | | ✅ | | | | |
| Qdrant | Hosted/Single Node | | | ✅ | | | | |
| Weaviate | Hosted/Single Node | | | ✅ | | | | |
| SQLite-vec | Single Node | | | ✅ | | | | |
| PG Vector | Single Node | | | ✅ | | | | |
| PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | | | |
| vLLM | Single Node | | ✅ | | | | | |
| OpenAI | Hosted | | ✅ | | | | | |
| Anthropic | Hosted | | ✅ | | | | | |
| Gemini | Hosted | | ✅ | | | | | |
| WatsonX | Hosted | | ✅ | | | | | |
| HuggingFace | Single Node | | | | | ✅ | | ✅ |
| TorchTune | Single Node | | | | | ✅ | | |
| NVIDIA NEMO | Hosted | | ✅ | ✅ | | ✅ | ✅ | ✅ |
| NVIDIA | Hosted | | | | | ✅ | ✅ | ✅ |
> **Note**: Additional providers are available through external packages. See [External Providers](https://llamastack.github.io/docs/providers/external) documentation.
### Distributions
View file
@ -26,6 +26,7 @@ The benchmark suite measures critical performance indicators:
- **Throughput**: Requests per second under sustained load
- **Latency Distribution**: P50, P95, P99 response times
- **Time to First Token (TTFT)**: Critical for streaming applications
- **Inter-Token Latency (ITL)**: Token generation speed for streaming
- **Error Rates**: Request failures and timeout analysis
This data enables data-driven architectural decisions and performance optimization efforts.
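To make the two streaming metrics concrete, the illustrative sketch below (not part of the benchmark suite) shows how TTFT and mean ITL can be derived from the arrival timestamps of streamed tokens relative to the request start time:
```python
# Illustrative only: derive TTFT and mean ITL from token arrival timestamps.
def streaming_latency_metrics(request_start: float, token_times: list[float]) -> dict[str, float]:
    """request_start and token_times are absolute timestamps in seconds."""
    if not token_times:
        return {"ttft_ms": float("nan"), "itl_ms": float("nan")}
    ttft = token_times[0] - request_start
    # Inter-token latency: average gap between consecutive streamed tokens.
    gaps = [later - earlier for earlier, later in zip(token_times, token_times[1:])]
    itl = sum(gaps) / len(gaps) if gaps else 0.0
    return {"ttft_ms": ttft * 1000, "itl_ms": itl * 1000}

# Example: first token 40 ms after the request, then one token every ~13 ms.
print(streaming_latency_metrics(0.0, [0.040, 0.053, 0.066, 0.079]))
# -> approximately {'ttft_ms': 40.0, 'itl_ms': 13.0}
```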
@ -49,49 +50,148 @@ kubectl get pods
# Should see: llama-stack-benchmark-server, vllm-server, etc.
```
## Benchmark Results
We use [GuideLLM](https://github.com/neuralmagic/guidellm) against our k8s deployment for comprehensive performance testing.
### Performance - 1 vLLM Replica
We vary the number of Llama Stack replicas with 1 vLLM replica and compare performance below.
![Performance - 1 vLLM Replica](results/vllm_replica1_benchmark_results.png)
For full results see the `benchmarking/k8s-benchmark/results/` directory.
## Quick Start
Follow the instructions below to run benchmarks similar to the ones above.
### Comprehensive Benchmark Suite
**Run all benchmarks with different cluster configurations:**
```bash
./scripts/run-all-benchmarks.sh
```
This script will automatically:
- Scale deployments to different configurations
- Run benchmarks for each setup
- Generate output files with meaningful names that include setup information
### Individual Benchmarks
**Benchmark Llama Stack (runs against current cluster setup):**
```bash
./scripts/run-guidellm-benchmark.sh --target stack
```
**Benchmark vLLM direct (runs against current cluster setup):**
```bash
./scripts/run-guidellm-benchmark.sh --target vllm
```
**Benchmark with custom parameters:**
```bash
./scripts/run-guidellm-benchmark.sh --target stack --max-seconds 120 --prompt-tokens 1024 --output-tokens 512
```
**Benchmark with custom output file:**
```bash
./scripts/run-guidellm-benchmark.sh --target stack --output-file results/my-custom-benchmark.txt
```
### Generating Charts
Once the benchmarks are run, you can generate performance charts from benchmark results:
```bash
uv run ./scripts/generate_charts.py
```
This loads runs in the `results/` directory and creates visualizations comparing different configurations and replica counts.
## Benchmark Workflow
The benchmark suite is organized into two main scripts with distinct responsibilities:
### 1. `run-all-benchmarks.sh` - Orchestration & Scaling
- **Purpose**: Manages different cluster configurations and orchestrates benchmark runs
- **Responsibilities**:
- Scales Kubernetes deployments (vLLM replicas, Stack replicas, worker counts)
- Runs benchmarks for each configuration
- Generates meaningful output filenames with setup information
- **Use case**: Running comprehensive performance testing across multiple configurations
### 2. `run-guidellm-benchmark.sh` - Single Benchmark Execution
- **Purpose**: Executes a single benchmark against the current cluster state
- **Responsibilities**:
- Runs GuideLLM benchmark with configurable parameters
- Accepts custom output file paths
- No cluster scaling - benchmarks current deployment state
- **Use case**: Testing specific configurations or custom scenarios
### Typical Workflow
1. **Comprehensive Testing**: Use `run-all-benchmarks.sh` to automatically test multiple configurations
2. **Custom Testing**: Use `run-guidellm-benchmark.sh` for specific parameter testing or manual cluster configurations
3. **Analysis**: Use `generate_charts.py` to visualize results from either approach
## Command Reference
### run-all-benchmarks.sh
Orchestrates multiple benchmark runs with different cluster configurations. This script:
- Automatically scales deployments before each benchmark
- Runs benchmarks against the configured cluster setup
- Generates meaningfully named output files
```bash
./scripts/run-all-benchmarks.sh
```
**Configuration**: Edit the `configs` array in the script to customize benchmark configurations:
```bash
# Each line: (target, stack_replicas, vllm_replicas, stack_workers)
configs=(
"stack 1 1 1"
"stack 1 1 2"
"stack 1 1 4"
"vllm 1 1 -"
)
```
**Output files**: Generated with setup information in filename:
- Stack: `guidellm-benchmark-stack-s{replicas}-sw{workers}-v{vllm_replicas}-{timestamp}.txt`
- vLLM: `guidellm-benchmark-vllm-v{vllm_replicas}-{timestamp}.txt`
### run-guidellm-benchmark.sh Options
Runs a single benchmark against the current cluster setup (no scaling).
```bash
./scripts/run-guidellm-benchmark.sh [options]
Options:
-t, --target <stack|vllm> Target to benchmark (default: stack)
-s, --max-seconds <seconds> Maximum duration in seconds (default: 60)
-p, --prompt-tokens <tokens> Number of prompt tokens (default: 512)
-o, --output-tokens <tokens> Number of output tokens (default: 256)
-r, --rate-type <type> Rate type (default: concurrent)
-c, --rate Rate (default: 1,2,4,8,16,32,64,128)
--output-file <path> Output file path (default: auto-generated)
--stack-deployment <name> Name of the stack deployment (default: llama-stack-benchmark-server)
--vllm-deployment <name> Name of the vllm deployment (default: vllm-server)
--stack-url <url> URL of the stack service (default: http://llama-stack-benchmark-service:8323/v1/openai)
-h, --help Show help message
Examples:
./scripts/run-guidellm-benchmark.sh --target vllm # Benchmark vLLM direct
./scripts/run-guidellm-benchmark.sh --target stack # Benchmark Llama Stack (default)
./scripts/run-guidellm-benchmark.sh -t vllm -s 60 -p 512 -o 256 # vLLM with custom parameters
./scripts/run-guidellm-benchmark.sh --output-file results/my-benchmark.txt # Specify custom output file
./scripts/run-guidellm-benchmark.sh --stack-deployment my-stack-server # Use custom stack deployment name
```
## Local Testing
@ -100,55 +200,30 @@ Examples:
For local development without Kubernetes:
**1. (Optional) Start Mock OpenAI server:**
```bash
uv run python openai-mock-server.py --port 8080
```
**2. Run benchmark against mock server:**
```bash
uv run python benchmark.py \
--base-url http://localhost:8080/v1 \
--model mock-inference \
--duration 30 \
--concurrent 5
```
**3. Test against local vLLM server:**
```bash
# If you have vLLM running locally on port 8000
uv run python benchmark.py \
--base-url http://localhost:8000/v1 \
--model meta-llama/Llama-3.2-3B-Instruct \
--duration 30 \
--concurrent 5
```
**4. Profile the running server:**
```bash
./profile_running_server.sh
```
### OpenAI Mock Server
There is a simple mock OpenAI server if you don't have an inference provider available.
The `openai-mock-server.py` provides:
- **OpenAI-compatible API** for testing without real models
- **Configurable streaming delay** via `STREAM_DELAY_SECONDS` env var
- **Consistent responses** for reproducible benchmarks
- **Lightweight testing** without GPU requirements
**Mock server usage:**
```bash
uv run python openai-mock-server.py --port 8080
```
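For a quick sanity check against the running mock server, a minimal sketch using the standard `openai` Python client is shown below (assumes `pip install openai`; the model name `mock-inference` is illustrative and the mock server is assumed to accept any model string):
```python
from openai import OpenAI

# Point the standard OpenAI client at the local mock server.
client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")

resp = client.chat.completions.create(
    model="mock-inference",  # illustrative model name; the mock server is assumed to accept any id
    messages=[{"role": "user", "content": "ping"}],
)
print(resp.choices[0].message.content)
```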
**2. Start Stack server:**
```bash
LLAMA_STACK_CONFIG=benchmarking/k8s-benchmark/stack_run_config.yaml uv run uvicorn llama_stack.core.server.server:create_app --port 8321 --workers 4 --factory
```
**3. Run GuideLLM benchmark:**
```bash
GUIDELLM__PREFERRED_ROUTE="chat_completions" uv run guidellm benchmark run \
--target "http://localhost:8321/v1/openai/v1" \
--model "meta-llama/Llama-3.2-3B-Instruct" \
--rate-type sweep \
--max-seconds 60 \
--data "prompt_tokens=256,output_tokens=128" --output-path='output.html'
```
View file
@ -1,265 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""
Simple benchmark script for Llama Stack with OpenAI API compatibility.
"""
import argparse
import asyncio
import os
import random
import statistics
import time
import aiohttp
class BenchmarkStats:
def __init__(self):
self.response_times = []
self.ttft_times = []
self.chunks_received = []
self.errors = []
self.success_count = 0
self.total_requests = 0
self.concurrent_users = 0
self.start_time = None
self.end_time = None
self._lock = asyncio.Lock()
async def add_result(self, response_time: float, chunks: int, ttft: float = None, error: str = None):
async with self._lock:
self.total_requests += 1
if error:
self.errors.append(error)
else:
self.success_count += 1
self.response_times.append(response_time)
self.chunks_received.append(chunks)
if ttft is not None:
self.ttft_times.append(ttft)
def print_summary(self):
if not self.response_times:
print("No successful requests to report")
if self.errors:
print(f"Total errors: {len(self.errors)}")
print("First 5 errors:")
for error in self.errors[:5]:
print(f" {error}")
return
total_time = self.end_time - self.start_time
success_rate = (self.success_count / self.total_requests) * 100
print(f"\n{'=' * 60}")
print("BENCHMARK RESULTS")
print("\nResponse Time Statistics:")
print(f" Mean: {statistics.mean(self.response_times):.3f}s")
print(f" Median: {statistics.median(self.response_times):.3f}s")
print(f" Min: {min(self.response_times):.3f}s")
print(f" Max: {max(self.response_times):.3f}s")
if len(self.response_times) > 1:
print(f" Std Dev: {statistics.stdev(self.response_times):.3f}s")
percentiles = [50, 90, 95, 99]
sorted_times = sorted(self.response_times)
print("\nPercentiles:")
for p in percentiles:
idx = int(len(sorted_times) * p / 100) - 1
idx = max(0, min(idx, len(sorted_times) - 1))
print(f" P{p}: {sorted_times[idx]:.3f}s")
if self.ttft_times:
print("\nTime to First Token (TTFT) Statistics:")
print(f" Mean: {statistics.mean(self.ttft_times):.3f}s")
print(f" Median: {statistics.median(self.ttft_times):.3f}s")
print(f" Min: {min(self.ttft_times):.3f}s")
print(f" Max: {max(self.ttft_times):.3f}s")
if len(self.ttft_times) > 1:
print(f" Std Dev: {statistics.stdev(self.ttft_times):.3f}s")
sorted_ttft = sorted(self.ttft_times)
print("\nTTFT Percentiles:")
for p in percentiles:
idx = int(len(sorted_ttft) * p / 100) - 1
idx = max(0, min(idx, len(sorted_ttft) - 1))
print(f" P{p}: {sorted_ttft[idx]:.3f}s")
if self.chunks_received:
print("\nStreaming Statistics:")
print(f" Mean chunks per response: {statistics.mean(self.chunks_received):.1f}")
print(f" Total chunks received: {sum(self.chunks_received)}")
print(f"{'=' * 60}")
print(f"Total time: {total_time:.2f}s")
print(f"Concurrent users: {self.concurrent_users}")
print(f"Total requests: {self.total_requests}")
print(f"Successful requests: {self.success_count}")
print(f"Failed requests: {len(self.errors)}")
print(f"Success rate: {success_rate:.1f}%")
print(f"Requests per second: {self.success_count / total_time:.2f}")
if self.errors:
print("\nErrors (showing first 5):")
for error in self.errors[:5]:
print(f" {error}")
class LlamaStackBenchmark:
def __init__(self, base_url: str, model_id: str):
self.base_url = base_url.rstrip("/")
self.model_id = model_id
self.headers = {"Content-Type": "application/json"}
self.test_messages = [
[{"role": "user", "content": "Hi"}],
[{"role": "user", "content": "What is the capital of France?"}],
[{"role": "user", "content": "Explain quantum physics in simple terms."}],
[{"role": "user", "content": "Write a short story about a robot learning to paint."}],
[
{"role": "user", "content": "What is machine learning?"},
{"role": "assistant", "content": "Machine learning is a subset of AI..."},
{"role": "user", "content": "Can you give me a practical example?"},
],
]
async def make_async_streaming_request(self) -> tuple[float, int, float | None, str | None]:
"""Make a single async streaming chat completion request."""
messages = random.choice(self.test_messages)
payload = {"model": self.model_id, "messages": messages, "stream": True, "max_tokens": 100}
start_time = time.time()
chunks_received = 0
ttft = None
error = None
session = aiohttp.ClientSession()
try:
async with session.post(
f"{self.base_url}/chat/completions",
headers=self.headers,
json=payload,
timeout=aiohttp.ClientTimeout(total=30),
) as response:
if response.status == 200:
async for line in response.content:
if line:
line_str = line.decode("utf-8").strip()
if line_str.startswith("data: "):
chunks_received += 1
if ttft is None:
ttft = time.time() - start_time
if line_str == "data: [DONE]":
break
if chunks_received == 0:
error = "No streaming chunks received"
else:
text = await response.text()
error = f"HTTP {response.status}: {text[:100]}"
except Exception as e:
error = f"Request error: {str(e)}"
finally:
await session.close()
response_time = time.time() - start_time
return response_time, chunks_received, ttft, error
async def run_benchmark(self, duration: int, concurrent_users: int) -> BenchmarkStats:
"""Run benchmark using async requests for specified duration."""
stats = BenchmarkStats()
stats.concurrent_users = concurrent_users
stats.start_time = time.time()
print(f"Starting benchmark: {duration}s duration, {concurrent_users} concurrent users")
print(f"Target URL: {self.base_url}/chat/completions")
print(f"Model: {self.model_id}")
connector = aiohttp.TCPConnector(limit=concurrent_users)
async with aiohttp.ClientSession(connector=connector):
async def worker(worker_id: int):
"""Worker that sends requests sequentially until canceled."""
request_count = 0
while True:
try:
response_time, chunks, ttft, error = await self.make_async_streaming_request()
await stats.add_result(response_time, chunks, ttft, error)
request_count += 1
except asyncio.CancelledError:
break
except Exception as e:
await stats.add_result(0, 0, None, f"Worker {worker_id} error: {str(e)}")
# Progress reporting task
async def progress_reporter():
last_report_time = time.time()
while True:
try:
await asyncio.sleep(1) # Report every second
if time.time() >= last_report_time + 10: # Report every 10 seconds
elapsed = time.time() - stats.start_time
print(
f"Completed: {stats.total_requests} requests in {elapsed:.1f}s, RPS: {stats.total_requests / elapsed:.1f}"
)
last_report_time = time.time()
except asyncio.CancelledError:
break
# Spawn concurrent workers
tasks = [asyncio.create_task(worker(i)) for i in range(concurrent_users)]
progress_task = asyncio.create_task(progress_reporter())
tasks.append(progress_task)
# Wait for duration then cancel all tasks
await asyncio.sleep(duration)
for task in tasks:
task.cancel()
# Wait for all tasks to complete
await asyncio.gather(*tasks, return_exceptions=True)
stats.end_time = time.time()
return stats
def main():
parser = argparse.ArgumentParser(description="Llama Stack Benchmark Tool")
parser.add_argument(
"--base-url",
default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)",
)
parser.add_argument(
"--model", default=os.getenv("INFERENCE_MODEL", "test-model"), help="Model ID to use for requests"
)
parser.add_argument("--duration", type=int, default=60, help="Duration in seconds to run benchmark (default: 60)")
parser.add_argument("--concurrent", type=int, default=10, help="Number of concurrent users (default: 10)")
args = parser.parse_args()
benchmark = LlamaStackBenchmark(args.base_url, args.model)
try:
stats = asyncio.run(benchmark.run_benchmark(args.duration, args.concurrent))
stats.print_summary()
except KeyboardInterrupt:
print("\nBenchmark interrupted by user")
except Exception as e:
print(f"Benchmark failed: {e}")
if __name__ == "__main__":
main()
View file
@ -1,52 +0,0 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Script to profile an already running Llama Stack server
# Usage: ./profile_running_server.sh [duration_seconds] [output_file]
DURATION=${1:-60} # Default 60 seconds
OUTPUT_FILE=${2:-"llama_stack_profile"} # Default output file
echo "Looking for running Llama Stack server..."
# Find the server PID
SERVER_PID=$(ps aux | grep "llama_stack.core.server.server" | grep -v grep | awk '{print $2}' | head -1)
if [ -z "$SERVER_PID" ]; then
echo "Error: No running Llama Stack server found"
echo "Please start your server first with:"
echo "LLAMA_STACK_LOGGING=\"all=ERROR\" MOCK_INFERENCE_URL=http://localhost:8080 SAFETY_MODEL=llama-guard3:1b uv run --with llama-stack python -m llama_stack.core.server.server docs/source/distributions/k8s-benchmark/stack_run_config.yaml"
exit 1
fi
echo "Found Llama Stack server with PID: $SERVER_PID"
# Start py-spy profiling
echo "Starting py-spy profiling for ${DURATION} seconds..."
echo "Output will be saved to: ${OUTPUT_FILE}.svg"
echo ""
echo "You can now run your load test..."
echo ""
# Get the full path to py-spy
PYSPY_PATH=$(which py-spy)
# Check if running as root, if not, use sudo
if [ "$EUID" -ne 0 ]; then
echo "py-spy requires root permissions on macOS. Running with sudo..."
sudo "$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d ${DURATION} -p $SERVER_PID
else
"$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d ${DURATION} -p $SERVER_PID
fi
echo ""
echo "Profiling completed! Results saved to: ${OUTPUT_FILE}.svg"
echo ""
echo "To view the flame graph:"
echo "open ${OUTPUT_FILE}.svg"
View file
@ -0,0 +1,171 @@
Collecting uv
Downloading uv-0.8.19-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading uv-0.8.19-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (20.9 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 20.9/20.9 MB 144.3 MB/s eta 0:00:00
Installing collected packages: uv
Successfully installed uv-0.8.19
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: pip install --upgrade pip
Using Python 3.11.13 environment at: /usr/local
Resolved 61 packages in 551ms
Downloading pillow (6.3MiB)
Downloading hf-xet (3.0MiB)
Downloading tokenizers (3.1MiB)
Downloading pygments (1.2MiB)
Downloading pandas (11.8MiB)
Downloading aiohttp (1.7MiB)
Downloading pydantic-core (1.9MiB)
Downloading numpy (16.2MiB)
Downloading transformers (11.1MiB)
Downloading pyarrow (40.8MiB)
Downloading pydantic-core
Downloading aiohttp
Downloading tokenizers
Downloading hf-xet
Downloading pygments
Downloading pillow
Downloading numpy
Downloading pandas
Downloading transformers
Downloading pyarrow
Prepared 61 packages in 1.23s
Installed 61 packages in 114ms
+ aiohappyeyeballs==2.6.1
+ aiohttp==3.12.15
+ aiosignal==1.4.0
+ annotated-types==0.7.0
+ anyio==4.10.0
+ attrs==25.3.0
+ certifi==2025.8.3
+ charset-normalizer==3.4.3
+ click==8.1.8
+ datasets==4.1.1
+ dill==0.4.0
+ filelock==3.19.1
+ frozenlist==1.7.0
+ fsspec==2025.9.0
+ ftfy==6.3.1
+ guidellm==0.3.0
+ h11==0.16.0
+ h2==4.3.0
+ hf-xet==1.1.10
+ hpack==4.1.0
+ httpcore==1.0.9
+ httpx==0.28.1
+ huggingface-hub==0.35.0
+ hyperframe==6.1.0
+ idna==3.10
+ loguru==0.7.3
+ markdown-it-py==4.0.0
+ mdurl==0.1.2
+ multidict==6.6.4
+ multiprocess==0.70.16
+ numpy==2.3.3
+ packaging==25.0
+ pandas==2.3.2
+ pillow==11.3.0
+ propcache==0.3.2
+ protobuf==6.32.1
+ pyarrow==21.0.0
+ pydantic==2.11.9
+ pydantic-core==2.33.2
+ pydantic-settings==2.10.1
+ pygments==2.19.2
+ python-dateutil==2.9.0.post0
+ python-dotenv==1.1.1
+ pytz==2025.2
+ pyyaml==6.0.2
+ regex==2025.9.18
+ requests==2.32.5
+ rich==14.1.0
+ safetensors==0.6.2
+ six==1.17.0
+ sniffio==1.3.1
+ tokenizers==0.22.1
+ tqdm==4.67.1
+ transformers==4.56.2
+ typing-extensions==4.15.0
+ typing-inspection==0.4.1
+ tzdata==2025.2
+ urllib3==2.5.0
+ wcwidth==0.2.14
+ xxhash==3.5.0
+ yarl==1.20.1
Using Python 3.11.13 environment at: /usr/local
Audited 1 package in 3ms
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
Creating backend...
Backend openai_http connected to http://llama-stack-benchmark-service:8323/v1/openai for model meta-llama/Llama-3.2-3B-Instruct.
Creating request loader...
Created loader with 1000 unique requests from prompt_tokens=512,output_tokens=256.
╭─ Benchmarks ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ [17:34:30] ⠋ 100% concurrent@1 (complete) Req: 0.3 req/s, 3.32s Lat, 1.0 Conc, 18 Comp, 1 Inc, 0 Err │
│ Tok: 74.0 gen/s, 238.6 tot/s, 40.2ms TTFT, 13.4ms ITL, 546 Prompt, 246 Gen │
│ [17:35:35] ⠋ 100% concurrent@2 (complete) Req: 0.6 req/s, 3.46s Lat, 2.0 Conc, 34 Comp, 2 Inc, 0 Err │
│ Tok: 139.6 gen/s, 454.0 tot/s, 48.0ms TTFT, 14.1ms ITL, 546 Prompt, 243 Gen │
│ [17:36:40] ⠋ 100% concurrent@4 (complete) Req: 1.1 req/s, 3.44s Lat, 3.9 Conc, 68 Comp, 4 Inc, 0 Err │
│ Tok: 273.2 gen/s, 900.4 tot/s, 50.7ms TTFT, 14.3ms ITL, 546 Prompt, 238 Gen │
│ [17:37:45] ⠋ 100% concurrent@8 (complete) Req: 2.2 req/s, 3.55s Lat, 7.7 Conc, 129 Comp, 8 Inc, 0 Err │
│ Tok: 519.1 gen/s, 1699.8 tot/s, 66.0ms TTFT, 14.6ms ITL, 547 Prompt, 240 Gen │
│ [17:38:50] ⠋ 100% concurrent@16 (complete) Req: 4.1 req/s, 3.76s Lat, 15.5 Conc, 247 Comp, 16 Inc, 0 Err │
│ Tok: 1005.5 gen/s, 3256.7 tot/s, 101.0ms TTFT, 15.0ms ITL, 547 Prompt, 244 Gen │
│ [17:39:56] ⠋ 100% concurrent@32 (complete) Req: 8.1 req/s, 3.84s Lat, 30.9 Conc, 483 Comp, 32 Inc, 0 Err │
│ Tok: 1926.3 gen/s, 6327.2 tot/s, 295.7ms TTFT, 14.8ms ITL, 547 Prompt, 239 Gen │
│ [17:41:03] ⠋ 100% concurrent@64 (complete) Req: 9.9 req/s, 6.05s Lat, 59.7 Conc, 576 Comp, 58 Inc, 0 Err │
│ Tok: 2381.0 gen/s, 7774.5 tot/s, 1196.2ms TTFT, 20.2ms ITL, 547 Prompt, 241 Gen │
│ [17:42:10] ⠋ 100% concurrent@128 (complete) Req: 9.2 req/s, 11.59s Lat, 107.2 Conc, 514 Comp, 117 Inc, 0 Err │
│ Tok: 2233.4 gen/s, 7286.3 tot/s, 2403.9ms TTFT, 38.2ms ITL, 547 Prompt, 242 Gen │
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
Generating... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ (8/8) [ 0:08:41 < 0:00:00 ]
Benchmarks Metadata:
Run id:511a14fd-ba11-4ffa-92ef-7cc23db4dd38
Duration:528.5 seconds
Profile:type=concurrent, strategies=['concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent'], streams=[1, 2, 4, 8, 16, 32, 64, 128]
Args:max_number=None, max_duration=60.0, warmup_number=None, warmup_duration=3.0, cooldown_number=None, cooldown_duration=None
Worker:type_='generative_requests_worker' backend_type='openai_http' backend_target='http://llama-stack-benchmark-service:8323/v1/openai' backend_model='meta-llama/Llama-3.2-3B-Instruct'
backend_info={'max_output_tokens': 16384, 'timeout': 300, 'http2': True, 'follow_redirects': True, 'headers': {}, 'text_completions_path': '/v1/completions', 'chat_completions_path':
'/v1/chat/completions'}
Request Loader:type_='generative_request_loader' data='prompt_tokens=512,output_tokens=256' data_args=None processor='meta-llama/Llama-3.2-3B-Instruct' processor_args=None
Extras:None
Benchmarks Info:
===================================================================================================================================================
Metadata |||| Requests Made ||| Prompt Tok/Req ||| Output Tok/Req ||| Prompt Tok Total||| Output Tok Total||
Benchmark| Start Time| End Time| Duration (s)| Comp| Inc| Err| Comp| Inc| Err| Comp| Inc| Err| Comp| Inc| Err| Comp| Inc| Err
--------------|-----------|---------|-------------|------|-----|-----|------|------|----|------|------|----|-------|------|----|-------|------|----
concurrent@1| 17:34:35| 17:35:35| 60.0| 18| 1| 0| 546.4| 512.0| 0.0| 246.0| 14.0| 0.0| 9835| 512| 0| 4428| 14| 0
concurrent@2| 17:35:40| 17:36:40| 60.0| 34| 2| 0| 546.4| 512.0| 0.0| 242.7| 80.0| 0.0| 18577| 1024| 0| 8253| 160| 0
concurrent@4| 17:36:45| 17:37:45| 60.0| 68| 4| 0| 546.4| 512.0| 0.0| 238.1| 103.2| 0.0| 37156| 2048| 0| 16188| 413| 0
concurrent@8| 17:37:50| 17:38:50| 60.0| 129| 8| 0| 546.7| 512.0| 0.0| 240.3| 180.0| 0.0| 70518| 4096| 0| 31001| 1440| 0
concurrent@16| 17:38:55| 17:39:55| 60.0| 247| 16| 0| 546.6| 512.0| 0.0| 244.1| 142.6| 0.0| 135002| 8192| 0| 60300| 2281| 0
concurrent@32| 17:40:01| 17:41:01| 60.0| 483| 32| 0| 546.5| 512.0| 0.0| 239.2| 123.2| 0.0| 263972| 16384| 0| 115540| 3944| 0
concurrent@64| 17:41:08| 17:42:08| 60.0| 576| 58| 0| 546.6| 512.0| 0.0| 241.3| 13.9| 0.0| 314817| 29696| 0| 138976| 807| 0
concurrent@128| 17:42:15| 17:43:15| 60.0| 514| 117| 0| 546.5| 512.0| 0.0| 241.6| 143.9| 0.0| 280911| 59904| 0| 124160| 16832| 0
===================================================================================================================================================
Benchmarks Stats:
=======================================================================================================================================================
Metadata | Request Stats || Out Tok/sec| Tot Tok/sec| Req Latency (sec) ||| TTFT (ms) ||| ITL (ms) ||| TPOT (ms) ||
Benchmark| Per Second| Concurrency| mean| mean| mean| median| p99| mean| median| p99| mean| median| p99| mean| median| p99
--------------|-----------|------------|------------|------------|------|-------|------|-------|-------|-------|-----|-------|-----|-----|-------|-----
concurrent@1| 0.30| 1.00| 74.0| 238.6| 3.32| 3.43| 3.61| 40.2| 39.3| 51.2| 13.4| 13.3| 14.0| 13.3| 13.2| 13.9
concurrent@2| 0.58| 1.99| 139.6| 454.0| 3.46| 3.64| 3.74| 48.0| 45.8| 72.0| 14.1| 14.1| 14.5| 14.0| 14.0| 14.4
concurrent@4| 1.15| 3.95| 273.2| 900.4| 3.44| 3.69| 3.74| 50.7| 47.2| 118.6| 14.3| 14.3| 14.4| 14.2| 14.2| 14.4
concurrent@8| 2.16| 7.67| 519.1| 1699.8| 3.55| 3.76| 3.87| 66.0| 48.8| 208.2| 14.6| 14.5| 14.8| 14.5| 14.5| 14.8
concurrent@16| 4.12| 15.48| 1005.5| 3256.7| 3.76| 3.90| 4.18| 101.0| 65.6| 396.7| 15.0| 15.0| 15.9| 15.0| 15.0| 15.9
concurrent@32| 8.05| 30.89| 1926.3| 6327.2| 3.84| 4.04| 4.39| 295.7| 265.6| 720.4| 14.8| 14.9| 15.5| 14.8| 14.8| 15.3
concurrent@64| 9.87| 59.74| 2381.0| 7774.5| 6.05| 6.18| 9.94| 1196.2| 1122.5| 4295.3| 20.2| 20.0| 25.8| 20.1| 19.9| 25.8
concurrent@128| 9.25| 107.16| 2233.4| 7286.3| 11.59| 12.04| 14.46| 2403.9| 2322.3| 4001.5| 38.2| 38.5| 53.0| 38.0| 38.3| 52.7
=======================================================================================================================================================
Saving benchmarks report...
Benchmarks report saved to /benchmarks.json
Benchmarking complete.
View file
@ -0,0 +1,171 @@
Collecting uv
Downloading uv-0.8.19-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading uv-0.8.19-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (20.9 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 20.9/20.9 MB 149.3 MB/s eta 0:00:00
Installing collected packages: uv
Successfully installed uv-0.8.19
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: pip install --upgrade pip
Using Python 3.11.13 environment at: /usr/local
Resolved 61 packages in 494ms
Downloading pandas (11.8MiB)
Downloading tokenizers (3.1MiB)
Downloading pygments (1.2MiB)
Downloading aiohttp (1.7MiB)
Downloading transformers (11.1MiB)
Downloading numpy (16.2MiB)
Downloading pillow (6.3MiB)
Downloading pydantic-core (1.9MiB)
Downloading hf-xet (3.0MiB)
Downloading pyarrow (40.8MiB)
Downloading pydantic-core
Downloading aiohttp
Downloading tokenizers
Downloading hf-xet
Downloading pillow
Downloading pygments
Downloading numpy
Downloading pandas
Downloading pyarrow
Downloading transformers
Prepared 61 packages in 1.24s
Installed 61 packages in 126ms
+ aiohappyeyeballs==2.6.1
+ aiohttp==3.12.15
+ aiosignal==1.4.0
+ annotated-types==0.7.0
+ anyio==4.10.0
+ attrs==25.3.0
+ certifi==2025.8.3
+ charset-normalizer==3.4.3
+ click==8.1.8
+ datasets==4.1.1
+ dill==0.4.0
+ filelock==3.19.1
+ frozenlist==1.7.0
+ fsspec==2025.9.0
+ ftfy==6.3.1
+ guidellm==0.3.0
+ h11==0.16.0
+ h2==4.3.0
+ hf-xet==1.1.10
+ hpack==4.1.0
+ httpcore==1.0.9
+ httpx==0.28.1
+ huggingface-hub==0.35.0
+ hyperframe==6.1.0
+ idna==3.10
+ loguru==0.7.3
+ markdown-it-py==4.0.0
+ mdurl==0.1.2
+ multidict==6.6.4
+ multiprocess==0.70.16
+ numpy==2.3.3
+ packaging==25.0
+ pandas==2.3.2
+ pillow==11.3.0
+ propcache==0.3.2
+ protobuf==6.32.1
+ pyarrow==21.0.0
+ pydantic==2.11.9
+ pydantic-core==2.33.2
+ pydantic-settings==2.10.1
+ pygments==2.19.2
+ python-dateutil==2.9.0.post0
+ python-dotenv==1.1.1
+ pytz==2025.2
+ pyyaml==6.0.2
+ regex==2025.9.18
+ requests==2.32.5
+ rich==14.1.0
+ safetensors==0.6.2
+ six==1.17.0
+ sniffio==1.3.1
+ tokenizers==0.22.1
+ tqdm==4.67.1
+ transformers==4.56.2
+ typing-extensions==4.15.0
+ typing-inspection==0.4.1
+ tzdata==2025.2
+ urllib3==2.5.0
+ wcwidth==0.2.14
+ xxhash==3.5.0
+ yarl==1.20.1
Using Python 3.11.13 environment at: /usr/local
Audited 1 package in 3ms
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
Creating backend...
Backend openai_http connected to http://llama-stack-benchmark-service:8323/v1/openai for model meta-llama/Llama-3.2-3B-Instruct.
Creating request loader...
Created loader with 1000 unique requests from prompt_tokens=512,output_tokens=256.
╭─ Benchmarks ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ [17:45:18] ⠋ 100% concurrent@1 (complete) Req: 0.3 req/s, 3.42s Lat, 1.0 Conc, 17 Comp, 1 Inc, 0 Err │
│ Tok: 73.9 gen/s, 233.7 tot/s, 50.2ms TTFT, 13.4ms ITL, 547 Prompt, 253 Gen │
│ [17:46:23] ⠋ 100% concurrent@2 (complete) Req: 0.6 req/s, 3.42s Lat, 2.0 Conc, 34 Comp, 2 Inc, 0 Err │
│ Tok: 134.7 gen/s, 447.4 tot/s, 50.8ms TTFT, 14.3ms ITL, 546 Prompt, 235 Gen │
│ [17:47:28] ⠋ 100% concurrent@4 (complete) Req: 1.1 req/s, 3.55s Lat, 3.9 Conc, 66 Comp, 4 Inc, 0 Err │
│ Tok: 268.7 gen/s, 873.1 tot/s, 54.9ms TTFT, 14.4ms ITL, 547 Prompt, 243 Gen │
│ [17:48:33] ⠋ 100% concurrent@8 (complete) Req: 2.2 req/s, 3.56s Lat, 7.8 Conc, 130 Comp, 8 Inc, 0 Err │
│ Tok: 526.1 gen/s, 1728.4 tot/s, 60.6ms TTFT, 14.7ms ITL, 547 Prompt, 239 Gen │
│ [17:49:38] ⠋ 100% concurrent@16 (complete) Req: 4.1 req/s, 3.79s Lat, 15.7 Conc, 246 Comp, 16 Inc, 0 Err │
│ Tok: 1006.9 gen/s, 3268.6 tot/s, 74.8ms TTFT, 15.3ms ITL, 547 Prompt, 243 Gen │
│ [17:50:44] ⠋ 100% concurrent@32 (complete) Req: 7.8 req/s, 3.95s Lat, 30.9 Conc, 467 Comp, 32 Inc, 0 Err │
│ Tok: 1912.0 gen/s, 6191.6 tot/s, 119.1ms TTFT, 15.7ms ITL, 547 Prompt, 244 Gen │
│ [17:51:50] ⠋ 100% concurrent@64 (complete) Req: 13.0 req/s, 4.75s Lat, 61.8 Conc, 776 Comp, 64 Inc, 0 Err │
│ Tok: 3154.3 gen/s, 10273.3 tot/s, 339.1ms TTFT, 18.3ms ITL, 547 Prompt, 242 Gen │
│ [17:52:58] ⠋ 100% concurrent@128 (complete) Req: 15.1 req/s, 7.82s Lat, 117.7 Conc, 898 Comp, 127 Inc, 0 Err │
│ Tok: 3617.4 gen/s, 11843.9 tot/s, 1393.8ms TTFT, 26.8ms ITL, 547 Prompt, 240 Gen │
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
Generating... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ (8/8) [ 0:08:41 < 0:00:00 ]
Benchmarks Metadata:
Run id:f73d408e-256a-4c32-aa40-05e8d7098b66
Duration:529.2 seconds
Profile:type=concurrent, strategies=['concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent'], streams=[1, 2, 4, 8, 16, 32, 64, 128]
Args:max_number=None, max_duration=60.0, warmup_number=None, warmup_duration=3.0, cooldown_number=None, cooldown_duration=None
Worker:type_='generative_requests_worker' backend_type='openai_http' backend_target='http://llama-stack-benchmark-service:8323/v1/openai' backend_model='meta-llama/Llama-3.2-3B-Instruct'
backend_info={'max_output_tokens': 16384, 'timeout': 300, 'http2': True, 'follow_redirects': True, 'headers': {}, 'text_completions_path': '/v1/completions', 'chat_completions_path':
'/v1/chat/completions'}
Request Loader:type_='generative_request_loader' data='prompt_tokens=512,output_tokens=256' data_args=None processor='meta-llama/Llama-3.2-3B-Instruct' processor_args=None
Extras:None
Benchmarks Info:
=====================================================================================================================================================
Metadata |||| Requests Made ||| Prompt Tok/Req ||| Output Tok/Req ||| Prompt Tok Total||| Output Tok Total ||
Benchmark| Start Time| End Time| Duration (s)| Comp| Inc| Err| Comp| Inc| Err| Comp| Inc| Err| Comp| Inc| Err| Comp| Inc| Err
--------------|-----------|---------|-------------|------|-----|-----|------|------|----|------|------|----|-------|------|----|--------|------|-----
concurrent@1| 17:45:23| 17:46:23| 60.0| 17| 1| 0| 546.6| 512.0| 0.0| 252.8| 136.0| 0.0| 9292| 512| 0| 4298| 136| 0
concurrent@2| 17:46:28| 17:47:28| 60.0| 34| 2| 0| 546.4| 512.0| 0.0| 235.4| 130.0| 0.0| 18577| 1024| 0| 8003| 260| 0
concurrent@4| 17:47:33| 17:48:33| 60.0| 66| 4| 0| 546.5| 512.0| 0.0| 243.0| 97.5| 0.0| 36072| 2048| 0| 16035| 390| 0
concurrent@8| 17:48:38| 17:49:38| 60.0| 130| 8| 0| 546.6| 512.0| 0.0| 239.2| 146.0| 0.0| 71052| 4096| 0| 31090| 1168| 0
concurrent@16| 17:49:43| 17:50:43| 60.0| 246| 16| 0| 546.6| 512.0| 0.0| 243.3| 112.3| 0.0| 134456| 8192| 0| 59862| 1797| 0
concurrent@32| 17:50:49| 17:51:49| 60.0| 467| 32| 0| 546.6| 512.0| 0.0| 244.2| 147.3| 0.0| 255242| 16384| 0| 114038| 4714| 0
concurrent@64| 17:51:55| 17:52:55| 60.0| 776| 64| 0| 546.5| 512.0| 0.0| 242.2| 106.1| 0.0| 424115| 32768| 0| 187916| 6788| 0
concurrent@128| 17:53:03| 17:54:03| 60.0| 898| 127| 0| 546.5| 512.0| 0.0| 240.3| 69.8| 0.0| 490789| 65024| 0| 215810| 8864| 0
=====================================================================================================================================================
Benchmarks Stats:
======================================================================================================================================================
Metadata | Request Stats || Out Tok/sec| Tot Tok/sec| Req Latency (sec)||| TTFT (ms) ||| ITL (ms) ||| TPOT (ms) ||
Benchmark| Per Second| Concurrency| mean| mean| mean| median| p99| mean| median| p99| mean| median| p99| mean| median| p99
--------------|-----------|------------|------------|------------|-----|-------|------|-------|-------|-------|-----|-------|-----|-----|-------|-----
concurrent@1| 0.29| 1.00| 73.9| 233.7| 3.42| 3.45| 3.50| 50.2| 50.9| 62.5| 13.4| 13.4| 13.5| 13.3| 13.3| 13.5
concurrent@2| 0.57| 1.96| 134.7| 447.4| 3.42| 3.67| 4.12| 50.8| 49.2| 79.8| 14.3| 14.2| 15.9| 14.3| 14.2| 15.9
concurrent@4| 1.11| 3.92| 268.7| 873.1| 3.55| 3.72| 3.80| 54.9| 51.7| 101.3| 14.4| 14.4| 14.5| 14.4| 14.4| 14.5
concurrent@8| 2.20| 7.82| 526.1| 1728.4| 3.56| 3.78| 3.93| 60.6| 49.8| 189.5| 14.7| 14.7| 14.8| 14.6| 14.6| 14.8
concurrent@16| 4.14| 15.66| 1006.9| 3268.6| 3.79| 3.94| 4.25| 74.8| 54.3| 328.4| 15.3| 15.3| 16.1| 15.2| 15.2| 16.0
concurrent@32| 7.83| 30.91| 1912.0| 6191.6| 3.95| 4.07| 4.53| 119.1| 80.5| 674.0| 15.7| 15.6| 17.4| 15.7| 15.6| 17.3
concurrent@64| 13.03| 61.85| 3154.3| 10273.3| 4.75| 4.93| 5.43| 339.1| 321.1| 1146.6| 18.3| 18.4| 19.3| 18.2| 18.3| 19.2
concurrent@128| 15.05| 117.71| 3617.4| 11843.9| 7.82| 8.58| 13.35| 1393.8| 1453.0| 5232.2| 26.8| 26.7| 36.0| 26.7| 26.6| 35.9
======================================================================================================================================================
Saving benchmarks report...
Benchmarks report saved to /benchmarks.json
Benchmarking complete.
View file
@ -0,0 +1,171 @@
Collecting uv
Downloading uv-0.8.19-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading uv-0.8.19-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (20.9 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 20.9/20.9 MB 156.8 MB/s eta 0:00:00
Installing collected packages: uv
Successfully installed uv-0.8.19
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: pip install --upgrade pip
Using Python 3.11.13 environment at: /usr/local
Resolved 61 packages in 480ms
Downloading pillow (6.3MiB)
Downloading pydantic-core (1.9MiB)
Downloading pyarrow (40.8MiB)
Downloading aiohttp (1.7MiB)
Downloading numpy (16.2MiB)
Downloading pygments (1.2MiB)
Downloading transformers (11.1MiB)
Downloading pandas (11.8MiB)
Downloading tokenizers (3.1MiB)
Downloading hf-xet (3.0MiB)
Downloading pydantic-core
Downloading aiohttp
Downloading tokenizers
Downloading hf-xet
Downloading pygments
Downloading pillow
Downloading numpy
Downloading pandas
Downloading pyarrow
Downloading transformers
Prepared 61 packages in 1.25s
Installed 61 packages in 126ms
+ aiohappyeyeballs==2.6.1
+ aiohttp==3.12.15
+ aiosignal==1.4.0
+ annotated-types==0.7.0
+ anyio==4.10.0
+ attrs==25.3.0
+ certifi==2025.8.3
+ charset-normalizer==3.4.3
+ click==8.1.8
+ datasets==4.1.1
+ dill==0.4.0
+ filelock==3.19.1
+ frozenlist==1.7.0
+ fsspec==2025.9.0
+ ftfy==6.3.1
+ guidellm==0.3.0
+ h11==0.16.0
+ h2==4.3.0
+ hf-xet==1.1.10
+ hpack==4.1.0
+ httpcore==1.0.9
+ httpx==0.28.1
+ huggingface-hub==0.35.0
+ hyperframe==6.1.0
+ idna==3.10
+ loguru==0.7.3
+ markdown-it-py==4.0.0
+ mdurl==0.1.2
+ multidict==6.6.4
+ multiprocess==0.70.16
+ numpy==2.3.3
+ packaging==25.0
+ pandas==2.3.2
+ pillow==11.3.0
+ propcache==0.3.2
+ protobuf==6.32.1
+ pyarrow==21.0.0
+ pydantic==2.11.9
+ pydantic-core==2.33.2
+ pydantic-settings==2.10.1
+ pygments==2.19.2
+ python-dateutil==2.9.0.post0
+ python-dotenv==1.1.1
+ pytz==2025.2
+ pyyaml==6.0.2
+ regex==2025.9.18
+ requests==2.32.5
+ rich==14.1.0
+ safetensors==0.6.2
+ six==1.17.0
+ sniffio==1.3.1
+ tokenizers==0.22.1
+ tqdm==4.67.1
+ transformers==4.56.2
+ typing-extensions==4.15.0
+ typing-inspection==0.4.1
+ tzdata==2025.2
+ urllib3==2.5.0
+ wcwidth==0.2.14
+ xxhash==3.5.0
+ yarl==1.20.1
Using Python 3.11.13 environment at: /usr/local
Audited 1 package in 4ms
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
Creating backend...
Backend openai_http connected to http://llama-stack-benchmark-service:8323/v1/openai for model meta-llama/Llama-3.2-3B-Instruct.
Creating request loader...
Created loader with 1000 unique requests from prompt_tokens=512,output_tokens=256.
╭─ Benchmarks ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ [17:55:59] ⠋ 100% concurrent@1 (complete) Req: 0.3 req/s, 3.33s Lat, 1.0 Conc, 18 Comp, 1 Inc, 0 Err │
│ Tok: 74.0 gen/s, 238.0 tot/s, 49.6ms TTFT, 13.4ms ITL, 546 Prompt, 246 Gen │
│ [17:57:04] ⠋ 100% concurrent@2 (complete) Req: 0.6 req/s, 3.32s Lat, 1.9 Conc, 35 Comp, 2 Inc, 0 Err │
│ Tok: 137.1 gen/s, 457.5 tot/s, 50.6ms TTFT, 14.0ms ITL, 546 Prompt, 234 Gen │
│ [17:58:09] ⠋ 100% concurrent@4 (complete) Req: 1.2 req/s, 3.42s Lat, 4.0 Conc, 69 Comp, 4 Inc, 0 Err │
│ Tok: 276.7 gen/s, 907.2 tot/s, 52.7ms TTFT, 14.1ms ITL, 547 Prompt, 240 Gen │
│ [17:59:14] ⠋ 100% concurrent@8 (complete) Req: 2.3 req/s, 3.47s Lat, 7.8 Conc, 134 Comp, 8 Inc, 0 Err │
│ Tok: 541.4 gen/s, 1775.4 tot/s, 57.3ms TTFT, 14.3ms ITL, 547 Prompt, 240 Gen │
│ [18:00:19] ⠋ 100% concurrent@16 (complete) Req: 4.3 req/s, 3.60s Lat, 15.6 Conc, 259 Comp, 16 Inc, 0 Err │
│ Tok: 1034.8 gen/s, 3401.7 tot/s, 72.3ms TTFT, 14.8ms ITL, 547 Prompt, 239 Gen │
│ [18:01:25] ⠋ 100% concurrent@32 (complete) Req: 8.4 req/s, 3.69s Lat, 31.1 Conc, 505 Comp, 32 Inc, 0 Err │
│ Tok: 2029.7 gen/s, 6641.5 tot/s, 91.6ms TTFT, 15.0ms ITL, 547 Prompt, 241 Gen │
│ [18:02:31] ⠋ 100% concurrent@64 (complete) Req: 13.6 req/s, 4.50s Lat, 61.4 Conc, 818 Comp, 64 Inc, 0 Err │
│ Tok: 3333.9 gen/s, 10787.0 tot/s, 171.3ms TTFT, 17.8ms ITL, 547 Prompt, 244 Gen │
│ [18:03:40] ⠋ 100% concurrent@128 (complete) Req: 16.1 req/s, 7.43s Lat, 119.5 Conc, 964 Comp, 122 Inc, 0 Err │
│ Tok: 3897.0 gen/s, 12679.4 tot/s, 446.4ms TTFT, 28.9ms ITL, 547 Prompt, 243 Gen │
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
Generating... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ (8/8) [ 0:08:41 < 0:00:00 ]
Benchmarks Metadata:
Run id:5393e64f-d9f8-4548-95d8-da320bba1c24
Duration:530.1 seconds
Profile:type=concurrent, strategies=['concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent'], streams=[1, 2, 4, 8, 16, 32, 64, 128]
Args:max_number=None, max_duration=60.0, warmup_number=None, warmup_duration=3.0, cooldown_number=None, cooldown_duration=None
Worker:type_='generative_requests_worker' backend_type='openai_http' backend_target='http://llama-stack-benchmark-service:8323/v1/openai' backend_model='meta-llama/Llama-3.2-3B-Instruct'
backend_info={'max_output_tokens': 16384, 'timeout': 300, 'http2': True, 'follow_redirects': True, 'headers': {}, 'text_completions_path': '/v1/completions', 'chat_completions_path':
'/v1/chat/completions'}
Request Loader:type_='generative_request_loader' data='prompt_tokens=512,output_tokens=256' data_args=None processor='meta-llama/Llama-3.2-3B-Instruct' processor_args=None
Extras:None
Benchmarks Info:
===================================================================================================================================================
Metadata |||| Requests Made ||| Prompt Tok/Req ||| Output Tok/Req ||| Prompt Tok Total||| Output Tok Total||
Benchmark| Start Time| End Time| Duration (s)| Comp| Inc| Err| Comp| Inc| Err| Comp| Inc| Err| Comp| Inc| Err| Comp| Inc| Err
--------------|-----------|---------|-------------|------|-----|-----|------|------|----|------|------|----|-------|------|----|-------|------|----
concurrent@1| 17:56:04| 17:57:04| 60.0| 18| 1| 0| 546.4| 512.0| 0.0| 246.4| 256.0| 0.0| 9836| 512| 0| 4436| 256| 0
concurrent@2| 17:57:09| 17:58:09| 60.0| 35| 2| 0| 546.4| 512.0| 0.0| 233.9| 132.0| 0.0| 19124| 1024| 0| 8188| 264| 0
concurrent@4| 17:58:14| 17:59:14| 60.0| 69| 4| 0| 546.6| 512.0| 0.0| 239.9| 60.5| 0.0| 37715| 2048| 0| 16553| 242| 0
concurrent@8| 17:59:19| 18:00:19| 60.0| 134| 8| 0| 546.6| 512.0| 0.0| 239.8| 126.6| 0.0| 73243| 4096| 0| 32135| 1013| 0
concurrent@16| 18:00:24| 18:01:24| 60.0| 259| 16| 0| 546.6| 512.0| 0.0| 239.0| 115.7| 0.0| 141561| 8192| 0| 61889| 1851| 0
concurrent@32| 18:01:30| 18:02:30| 60.0| 505| 32| 0| 546.5| 512.0| 0.0| 240.5| 113.2| 0.0| 275988| 16384| 0| 121466| 3623| 0
concurrent@64| 18:02:37| 18:03:37| 60.0| 818| 64| 0| 546.6| 512.0| 0.0| 244.5| 132.4| 0.0| 447087| 32768| 0| 199988| 8475| 0
concurrent@128| 18:03:45| 18:04:45| 60.0| 964| 122| 0| 546.5| 512.0| 0.0| 242.5| 133.1| 0.0| 526866| 62464| 0| 233789| 16241| 0
===================================================================================================================================================
Benchmarks Stats:
=======================================================================================================================================================
Metadata | Request Stats || Out Tok/sec| Tot Tok/sec| Req Latency (sec) ||| TTFT (ms) ||| ITL (ms) ||| TPOT (ms) ||
Benchmark| Per Second| Concurrency| mean| mean| mean| median| p99| mean| median| p99| mean| median| p99| mean| median| p99
--------------|-----------|------------|------------|------------|------|--------|------|------|-------|-------|-----|-------|-----|-----|-------|-----
concurrent@1| 0.30| 1.00| 74.0| 238.0| 3.33| 3.44| 3.63| 49.6| 47.2| 66.1| 13.4| 13.3| 14.0| 13.3| 13.3| 14.0
concurrent@2| 0.59| 1.95| 137.1| 457.5| 3.32| 3.61| 3.67| 50.6| 48.6| 80.4| 14.0| 14.0| 14.2| 13.9| 13.9| 14.1
concurrent@4| 1.15| 3.95| 276.7| 907.2| 3.42| 3.61| 3.77| 52.7| 49.7| 106.9| 14.1| 14.0| 14.6| 14.0| 13.9| 14.5
concurrent@8| 2.26| 7.83| 541.4| 1775.4| 3.47| 3.70| 3.79| 57.3| 50.9| 171.3| 14.3| 14.3| 14.4| 14.2| 14.2| 14.4
concurrent@16| 4.33| 15.57| 1034.8| 3401.7| 3.60| 3.81| 4.22| 72.3| 52.0| 292.9| 14.8| 14.7| 16.3| 14.7| 14.7| 16.3
concurrent@32| 8.44| 31.12| 2029.7| 6641.5| 3.69| 3.89| 4.24| 91.6| 62.6| 504.6| 15.0| 15.0| 15.4| 14.9| 14.9| 15.4
concurrent@64| 13.64| 61.40| 3333.9| 10787.0| 4.50| 4.61| 5.67| 171.3| 101.2| 1165.6| 17.8| 17.7| 19.2| 17.7| 17.6| 19.1
concurrent@128| 16.07| 119.45| 3897.0| 12679.4| 7.43| 7.63| 9.74| 446.4| 195.8| 2533.1| 28.9| 28.9| 31.0| 28.8| 28.8| 30.9
=======================================================================================================================================================
Saving benchmarks report...
Benchmarks report saved to /benchmarks.json
Benchmarking complete.
View file
@ -0,0 +1,170 @@
Collecting uv
Downloading uv-0.8.19-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading uv-0.8.19-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (20.9 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 20.9/20.9 MB 126.9 MB/s eta 0:00:00
Installing collected packages: uv
Successfully installed uv-0.8.19
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: pip install --upgrade pip
Using Python 3.11.13 environment at: /usr/local
Resolved 61 packages in 561ms
Downloading hf-xet (3.0MiB)
Downloading pillow (6.3MiB)
Downloading transformers (11.1MiB)
Downloading pyarrow (40.8MiB)
Downloading numpy (16.2MiB)
Downloading pandas (11.8MiB)
Downloading tokenizers (3.1MiB)
Downloading pydantic-core (1.9MiB)
Downloading pygments (1.2MiB)
Downloading aiohttp (1.7MiB)
Downloading pydantic-core
Downloading aiohttp
Downloading tokenizers
Downloading hf-xet
Downloading pygments
Downloading pillow
Downloading numpy
Downloading pandas
Downloading transformers
Downloading pyarrow
Prepared 61 packages in 1.25s
Installed 61 packages in 114ms
+ aiohappyeyeballs==2.6.1
+ aiohttp==3.12.15
+ aiosignal==1.4.0
+ annotated-types==0.7.0
+ anyio==4.10.0
+ attrs==25.3.0
+ certifi==2025.8.3
+ charset-normalizer==3.4.3
+ click==8.1.8
+ datasets==4.1.1
+ dill==0.4.0
+ filelock==3.19.1
+ frozenlist==1.7.0
+ fsspec==2025.9.0
+ ftfy==6.3.1
+ guidellm==0.3.0
+ h11==0.16.0
+ h2==4.3.0
+ hf-xet==1.1.10
+ hpack==4.1.0
+ httpcore==1.0.9
+ httpx==0.28.1
+ huggingface-hub==0.35.0
+ hyperframe==6.1.0
+ idna==3.10
+ loguru==0.7.3
+ markdown-it-py==4.0.0
+ mdurl==0.1.2
+ multidict==6.6.4
+ multiprocess==0.70.16
+ numpy==2.3.3
+ packaging==25.0
+ pandas==2.3.2
+ pillow==11.3.0
+ propcache==0.3.2
+ protobuf==6.32.1
+ pyarrow==21.0.0
+ pydantic==2.11.9
+ pydantic-core==2.33.2
+ pydantic-settings==2.10.1
+ pygments==2.19.2
+ python-dateutil==2.9.0.post0
+ python-dotenv==1.1.1
+ pytz==2025.2
+ pyyaml==6.0.2
+ regex==2025.9.18
+ requests==2.32.5
+ rich==14.1.0
+ safetensors==0.6.2
+ six==1.17.0
+ sniffio==1.3.1
+ tokenizers==0.22.1
+ tqdm==4.67.1
+ transformers==4.56.2
+ typing-extensions==4.15.0
+ typing-inspection==0.4.1
+ tzdata==2025.2
+ urllib3==2.5.0
+ wcwidth==0.2.14
+ xxhash==3.5.0
+ yarl==1.20.1
Using Python 3.11.13 environment at: /usr/local
Audited 1 package in 3ms
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
Creating backend...
Backend openai_http connected to http://vllm-server:8000 for model meta-llama/Llama-3.2-3B-Instruct.
Creating request loader...
Created loader with 1000 unique requests from prompt_tokens=512,output_tokens=256.
╭─ Benchmarks ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ [18:11:47] ⠋ 100% concurrent@1 (complete) Req: 0.3 req/s, 3.35s Lat, 1.0 Conc, 17 Comp, 1 Inc, 0 Err │
│ Tok: 76.4 gen/s, 239.4 tot/s, 29.6ms TTFT, 13.0ms ITL, 547 Prompt, 256 Gen │
│ [18:12:52] ⠋ 100% concurrent@2 (complete) Req: 0.6 req/s, 3.53s Lat, 2.0 Conc, 32 Comp, 2 Inc, 0 Err │
│ Tok: 145.0 gen/s, 454.5 tot/s, 36.9ms TTFT, 13.7ms ITL, 546 Prompt, 256 Gen │
│ [18:13:57] ⠋ 100% concurrent@4 (complete) Req: 1.1 req/s, 3.59s Lat, 4.0 Conc, 64 Comp, 4 Inc, 0 Err │
│ Tok: 284.8 gen/s, 892.7 tot/s, 59.0ms TTFT, 13.9ms ITL, 546 Prompt, 256 Gen │
│ [18:15:02] ⠋ 100% concurrent@8 (complete) Req: 2.2 req/s, 3.70s Lat, 8.0 Conc, 128 Comp, 7 Inc, 0 Err │
│ Tok: 553.5 gen/s, 1735.2 tot/s, 79.8ms TTFT, 14.2ms ITL, 547 Prompt, 256 Gen │
│ [18:16:08] ⠋ 100% concurrent@16 (complete) Req: 4.2 req/s, 3.83s Lat, 16.0 Conc, 240 Comp, 16 Inc, 0 Err │
│ Tok: 1066.9 gen/s, 3344.6 tot/s, 97.5ms TTFT, 14.6ms ITL, 547 Prompt, 256 Gen │
│ [18:17:13] ⠋ 100% concurrent@32 (complete) Req: 8.1 req/s, 3.94s Lat, 31.8 Conc, 480 Comp, 31 Inc, 0 Err │
│ Tok: 2069.7 gen/s, 6488.4 tot/s, 120.8ms TTFT, 15.0ms ITL, 547 Prompt, 256 Gen │
│ [18:18:20] ⠋ 100% concurrent@64 (complete) Req: 13.6 req/s, 4.60s Lat, 62.3 Conc, 813 Comp, 57 Inc, 0 Err │
│ Tok: 3472.1 gen/s, 10884.9 tot/s, 190.9ms TTFT, 17.3ms ITL, 547 Prompt, 256 Gen │
│ [18:19:28] ⠋ 100% concurrent@128 (complete) Req: 16.8 req/s, 7.37s Lat, 123.5 Conc, 1005 Comp, 126 Inc, 0 Err │
│ Tok: 4289.1 gen/s, 13445.8 tot/s, 356.4ms TTFT, 27.5ms ITL, 547 Prompt, 256 Gen │
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
Generating... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ (8/8) [ 0:08:43 < 0:00:00 ]
Benchmarks Metadata:
Run id:8ccb6da1-83f4-4624-8d84-07c723b0b2a5
Duration:530.4 seconds
Profile:type=concurrent, strategies=['concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent'], streams=[1, 2, 4, 8, 16, 32, 64, 128]
Args:max_number=None, max_duration=60.0, warmup_number=None, warmup_duration=3.0, cooldown_number=None, cooldown_duration=None
Worker:type_='generative_requests_worker' backend_type='openai_http' backend_target='http://vllm-server:8000' backend_model='meta-llama/Llama-3.2-3B-Instruct' backend_info={'max_output_tokens':
16384, 'timeout': 300, 'http2': True, 'follow_redirects': True, 'headers': {}, 'text_completions_path': '/v1/completions', 'chat_completions_path': '/v1/chat/completions'}
Request Loader:type_='generative_request_loader' data='prompt_tokens=512,output_tokens=256' data_args=None processor='meta-llama/Llama-3.2-3B-Instruct' processor_args=None
Extras:None
Benchmarks Info:
=====================================================================================================================================================
Metadata |||| Requests Made ||| Prompt Tok/Req ||| Output Tok/Req ||| Prompt Tok Total||| Output Tok Total ||
Benchmark| Start Time| End Time| Duration (s)| Comp| Inc| Err| Comp| Inc| Err| Comp| Inc| Err| Comp| Inc| Err| Comp| Inc| Err
--------------|-----------|---------|-------------|------|-----|-----|------|------|----|------|------|----|-------|------|----|--------|------|-----
concurrent@1| 18:11:52| 18:12:52| 60.0| 17| 1| 0| 546.5| 512.0| 0.0| 256.0| 231.0| 0.0| 9291| 512| 0| 4352| 231| 0
concurrent@2| 18:12:57| 18:13:57| 60.0| 32| 2| 0| 546.5| 512.0| 0.0| 256.0| 251.0| 0.0| 17488| 1024| 0| 8192| 502| 0
concurrent@4| 18:14:02| 18:15:02| 60.0| 64| 4| 0| 546.4| 512.0| 0.0| 256.0| 175.2| 0.0| 34972| 2048| 0| 16384| 701| 0
concurrent@8| 18:15:07| 18:16:07| 60.0| 128| 7| 0| 546.6| 512.0| 0.0| 256.0| 50.7| 0.0| 69966| 3584| 0| 32768| 355| 0
concurrent@16| 18:16:13| 18:17:13| 60.0| 240| 16| 0| 546.5| 512.0| 0.0| 256.0| 166.0| 0.0| 131170| 8192| 0| 61440| 2656| 0
concurrent@32| 18:17:18| 18:18:18| 60.0| 480| 31| 0| 546.5| 512.0| 0.0| 256.0| 47.4| 0.0| 262339| 15872| 0| 122880| 1468| 0
concurrent@64| 18:18:25| 18:19:25| 60.0| 813| 57| 0| 546.5| 512.0| 0.0| 256.0| 110.7| 0.0| 444341| 29184| 0| 208128| 6311| 0
concurrent@128| 18:19:33| 18:20:33| 60.0| 1005| 126| 0| 546.5| 512.0| 0.0| 256.0| 65.8| 0.0| 549264| 64512| 0| 257280| 8296| 0
=====================================================================================================================================================
Benchmarks Stats:
=======================================================================================================================================================
Metadata | Request Stats || Out Tok/sec| Tot Tok/sec| Req Latency (sec) ||| TTFT (ms) ||| ITL (ms) ||| TPOT (ms) ||
Benchmark| Per Second| Concurrency| mean| mean| mean| median| p99| mean| median| p99| mean| median| p99| mean| median| p99
--------------|-----------|------------|------------|------------|------|--------|------|------|-------|-------|-----|-------|-----|-----|-------|-----
concurrent@1| 0.30| 1.00| 76.4| 239.4| 3.35| 3.35| 3.38| 29.6| 29.0| 38.9| 13.0| 13.0| 13.1| 13.0| 13.0| 13.0
concurrent@2| 0.57| 2.00| 145.0| 454.5| 3.53| 3.53| 3.55| 36.9| 39.0| 59.6| 13.7| 13.7| 13.8| 13.6| 13.7| 13.7
concurrent@4| 1.11| 4.00| 284.8| 892.7| 3.59| 3.59| 3.65| 59.0| 65.7| 88.2| 13.9| 13.8| 14.1| 13.8| 13.8| 14.0
concurrent@8| 2.16| 7.99| 553.5| 1735.2| 3.70| 3.69| 3.76| 79.8| 80.7| 152.6| 14.2| 14.2| 14.5| 14.1| 14.1| 14.4
concurrent@16| 4.17| 15.97| 1066.9| 3344.6| 3.83| 3.82| 3.99| 97.5| 96.3| 283.9| 14.6| 14.6| 14.9| 14.6| 14.6| 14.8
concurrent@32| 8.08| 31.84| 2069.7| 6488.4| 3.94| 3.90| 4.31| 120.8| 101.7| 564.3| 15.0| 14.9| 15.9| 14.9| 14.8| 15.9
concurrent@64| 13.56| 62.34| 3472.1| 10884.9| 4.60| 4.54| 5.43| 190.9| 133.9| 1113.2| 17.3| 17.2| 18.2| 17.2| 17.2| 18.2
concurrent@128| 16.75| 123.45| 4289.1| 13445.8| 7.37| 7.21| 9.21| 356.4| 161.9| 2319.9| 27.5| 27.5| 28.8| 27.4| 27.4| 28.7
=======================================================================================================================================================
Saving benchmarks report...
Benchmarks report saved to /benchmarks.json
Benchmarking complete.

(Binary image file added, 562 KiB; preview not shown.)


@ -1,148 +0,0 @@
#!/usr/bin/env bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
set -euo pipefail
# Default values
TARGET="stack"
DURATION=60
CONCURRENT=10
# Parse command line arguments
usage() {
echo "Usage: $0 [options]"
echo "Options:"
echo " -t, --target <stack|vllm> Target to benchmark (default: stack)"
echo " -d, --duration <seconds> Duration in seconds (default: 60)"
echo " -c, --concurrent <users> Number of concurrent users (default: 10)"
echo " -h, --help Show this help message"
echo ""
echo "Examples:"
echo " $0 --target vllm # Benchmark vLLM direct"
echo " $0 --target stack # Benchmark Llama Stack (default)"
echo " $0 -t vllm -d 120 -c 20 # vLLM with 120s duration, 20 users"
}
while [[ $# -gt 0 ]]; do
case $1 in
-t|--target)
TARGET="$2"
shift 2
;;
-d|--duration)
DURATION="$2"
shift 2
;;
-c|--concurrent)
CONCURRENT="$2"
shift 2
;;
-h|--help)
usage
exit 0
;;
*)
echo "Unknown option: $1"
usage
exit 1
;;
esac
done
# Validate target
if [[ "$TARGET" != "stack" && "$TARGET" != "vllm" ]]; then
echo "Error: Target must be 'stack' or 'vllm'"
usage
exit 1
fi
# Set configuration based on target
if [[ "$TARGET" == "vllm" ]]; then
BASE_URL="http://vllm-server:8000/v1"
JOB_NAME="vllm-benchmark-job"
echo "Benchmarking vLLM direct..."
else
BASE_URL="http://llama-stack-benchmark-service:8323/v1/openai/v1"
JOB_NAME="stack-benchmark-job"
echo "Benchmarking Llama Stack..."
fi
echo "Configuration:"
echo " Target: $TARGET"
echo " Base URL: $BASE_URL"
echo " Duration: ${DURATION}s"
echo " Concurrent users: $CONCURRENT"
echo ""
# Create temporary job yaml
TEMP_YAML="/tmp/benchmark-job-temp-$(date +%s).yaml"
cat > "$TEMP_YAML" << EOF
apiVersion: batch/v1
kind: Job
metadata:
name: $JOB_NAME
namespace: default
spec:
template:
spec:
containers:
- name: benchmark
image: python:3.11-slim
command: ["/bin/bash"]
args:
- "-c"
- |
pip install aiohttp &&
python3 /benchmark/benchmark.py \\
--base-url $BASE_URL \\
--model \${INFERENCE_MODEL} \\
--duration $DURATION \\
--concurrent $CONCURRENT
env:
- name: INFERENCE_MODEL
value: "meta-llama/Llama-3.2-3B-Instruct"
volumeMounts:
- name: benchmark-script
mountPath: /benchmark
resources:
requests:
memory: "256Mi"
cpu: "250m"
limits:
memory: "512Mi"
cpu: "500m"
volumes:
- name: benchmark-script
configMap:
name: benchmark-script
restartPolicy: Never
backoffLimit: 3
EOF
echo "Creating benchmark ConfigMap..."
kubectl create configmap benchmark-script \
--from-file=benchmark.py=benchmark.py \
--dry-run=client -o yaml | kubectl apply -f -
echo "Cleaning up any existing benchmark job..."
kubectl delete job $JOB_NAME 2>/dev/null || true
echo "Deploying benchmark Job..."
kubectl apply -f "$TEMP_YAML"
echo "Waiting for job to start..."
kubectl wait --for=condition=Ready pod -l job-name=$JOB_NAME --timeout=60s
echo "Following benchmark logs..."
kubectl logs -f job/$JOB_NAME
echo "Job completed. Checking final status..."
kubectl get job $JOB_NAME
# Clean up temporary file
rm -f "$TEMP_YAML"


@ -0,0 +1,294 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# /// script
# dependencies = [
# "matplotlib",
# ]
# ///
"""
Script to generate benchmark charts from guidellm text results.
Creates 2x2 grid charts with RPS, Request Latency, TTFT, and ITL metrics against concurrent@x values.
Outputs one chart file per vLLM replica group, with each line representing one benchmark run.
"""
import glob
import os
import re
import matplotlib.pyplot as plt
def extract_setup_name(filename: str) -> str:
"""Extract setup name from filename and format legend appropriately."""
basename = os.path.basename(filename)
# Try new pattern: guidellm-benchmark-stack-s{stack_replicas}-sw{workers}-v{vllm_replicas}-{timestamp}.txt
match = re.search(r"guidellm-benchmark-stack-s(\d+)-sw(\d+)-v(\d+)-(\d{8})-(\d{6})\.txt", basename)
if match:
stack_replicas = match.group(1)
workers = match.group(2)
vllm_replicas = match.group(3)
date = match.group(4)
time = match.group(5)
return f"stack-s{stack_replicas}-sw{workers}-v{vllm_replicas}"
# Try new vLLM pattern: guidellm-benchmark-vllm-v{vllm_replicas}-{timestamp}.txt
match = re.search(r"guidellm-benchmark-vllm-v(\d+)-(\d{8})-(\d{6})\.txt", basename)
if match:
vllm_replicas = match.group(1)
date = match.group(2)
time = match.group(3)
return f"vllm-v{vllm_replicas}"
# Fall back to old pattern: guidellm-benchmark-{target}-{stack_replicas}-w{workers}-{vllm_replicas}-{timestamp}.txt
match = re.search(r"guidellm-benchmark-([^-]+)-(\d+)-w(\d+)-(\d+)-(\d+)-(\d+)\.txt", basename)
if match:
target = match.group(1)
stack_replicas = match.group(2)
workers = match.group(3)
vllm_replicas = match.group(4)
date = match.group(5)
time = match.group(6)
if target == "vllm":
return f"vllm-{vllm_replicas}-w{workers}-{vllm_replicas}"
else:
return f"stack-replicas{stack_replicas}-w{workers}-vllm-replicas{vllm_replicas}-{date}-{time}"
# Fall back to older pattern: guidellm-benchmark-{target}-{stack_replicas}-{vllm_replicas}-{timestamp}.txt
match = re.search(r"guidellm-benchmark-([^-]+)-(\d+)-(\d+)-(\d+)-(\d+)\.txt", basename)
if match:
target = match.group(1)
stack_replicas = match.group(2)
vllm_replicas = match.group(3)
date = match.group(4)
time = match.group(5)
if target == "vllm":
return f"vllm-{vllm_replicas}-w1-{vllm_replicas}"
else:
return f"stack-replicas{stack_replicas}-vllm-replicas{vllm_replicas}-{date}-{time}"
return basename.replace("guidellm-benchmark-", "").replace(".txt", "")
def parse_txt_file(filepath: str) -> list[tuple[float, float, float, float, float, str]]:
"""
Parse a text benchmark file and extract concurrent@x, RPS, TTFT, ITL, and request latency data.
Returns list of (concurrency, rps_mean, ttft_mean, itl_mean, req_latency_mean, setup_name) tuples.
"""
setup_name = extract_setup_name(filepath)
data_points = []
try:
with open(filepath) as f:
content = f.read()
# Find the benchmark stats table
lines = content.split("\n")
in_stats_table = False
header_lines_seen = 0
for line in lines:
line_stripped = line.strip()
# Look for the start of the stats table
if "Benchmarks Stats:" in line:
in_stats_table = True
continue
if in_stats_table:
# Skip the first few separator/header lines
if line_stripped.startswith("=") or line_stripped.startswith("-"):
header_lines_seen += 1
if header_lines_seen >= 3: # After seeing multiple header lines, look for concurrent@ data
if line_stripped.startswith("=") and "concurrent@" not in line_stripped:
break
continue
# Parse concurrent@ lines in the stats table (may have leading spaces)
if in_stats_table and "concurrent@" in line:
parts = [part.strip() for part in line.split("|")]
if len(parts) >= 12: # Make sure we have enough columns for new format
try:
# Extract concurrency from benchmark name (e.g., concurrent@1 -> 1)
concurrent_match = re.search(r"concurrent@(\d+)", parts[0])
if not concurrent_match:
continue
concurrency = float(concurrent_match.group(1))
# Extract metrics from the new table format
# In the guidellm stats table, the columns are separated by | as follows:
# Benchmark | Per Second | Concurrency | Out Tok/sec | Tot Tok/sec | Req Latency (sec) | TTFT (ms) | ITL (ms) | TPOT (ms)
# Looking at the mean/median/p99 structure, need to find the mean columns
# The structure shows: mean | median | p99 for each metric
rps_mean = float(parts[1]) # Per Second (RPS)
req_latency_mean = float(parts[6]) * 1000 # Request latency mean (convert from sec to ms)
ttft_mean = float(parts[9]) # TTFT mean column
itl_mean = float(parts[12]) # ITL mean column
data_points.append((concurrency, rps_mean, ttft_mean, itl_mean, req_latency_mean, setup_name))
except (ValueError, IndexError) as e:
print(f"Warning: Could not parse line '{line}' in {filepath}: {e}")
continue
except (OSError, FileNotFoundError) as e:
print(f"Error reading {filepath}: {e}")
return data_points
def generate_charts(benchmark_dir: str = "results"):
"""Generate 2x2 grid charts (RPS, Request Latency, TTFT, ITL) from benchmark text files."""
# Find all text result files instead of JSON
txt_pattern = os.path.join(benchmark_dir, "guidellm-benchmark-*.txt")
txt_files = glob.glob(txt_pattern)
if not txt_files:
print(f"No text files found matching pattern: {txt_pattern}")
return
print(f"Found {len(txt_files)} text files")
# Parse all files and collect data
all_data = {} # setup_name -> [(concurrency, rps, ttft, itl, req_latency), ...]
for txt_file in txt_files:
print(f"Processing {txt_file}")
data_points = parse_txt_file(txt_file)
for concurrency, rps, ttft, itl, req_latency, setup_name in data_points:
if setup_name not in all_data:
all_data[setup_name] = []
all_data[setup_name].append((concurrency, rps, ttft, itl, req_latency))
if not all_data:
print("No data found to plot")
return
# Sort data points by concurrency for each setup
for setup_name in all_data:
all_data[setup_name].sort(key=lambda x: x[0]) # Sort by concurrency
# Group setups by vLLM replica number (original approach)
replica_groups = {} # vllm_replica_count -> {setup_name: points}
for setup_name, points in all_data.items():
# Extract vLLM replica number from setup name
# Expected formats:
# - New stack format: "stack-s{X}-sw{W}-v{Y}"
# - New vLLM format: "vllm-v{Y}"
# - Old formats: "stack-replicas{X}-w{W}-vllm-replicas{Y}" or "vllm-{Y}-w{W}-{Y}"
# Try new formats first
vllm_match = re.search(r"-v(\d+)$", setup_name) # Matches both "stack-s1-sw2-v3" and "vllm-v1"
if not vllm_match:
# Try old stack format
vllm_match = re.search(r"vllm-replicas(\d+)", setup_name)
if not vllm_match:
# Try old vLLM format: "vllm-{Y}-w{W}-{Y}"
vllm_match = re.search(r"vllm-(\d+)-w\d+-\d+", setup_name)
if vllm_match:
vllm_replica_num = int(vllm_match.group(1))
if vllm_replica_num not in replica_groups:
replica_groups[vllm_replica_num] = {}
replica_groups[vllm_replica_num][setup_name] = points
else:
print(f"Warning: Could not extract vLLM replica count from setup name: {setup_name}")
def create_charts(data_dict, prefix, title_prefix):
"""Create a 2x2 grid with RPS, Request Latency, TTFT, and ITL charts."""
if not data_dict:
print(f"No data found for {prefix}")
return
# Create 2x2 subplot grid
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle(f"{title_prefix} Benchmark Results", fontsize=16, fontweight="bold")
# Collect all unique concurrency values for tick setting
all_concurrency_values = set()
for points in data_dict.values():
all_concurrency_values.update([p[0] for p in points])
all_concurrency_values = sorted(all_concurrency_values)
# Plot data for each setup in alphabetical order
for setup_name in sorted(data_dict.keys()):
points = data_dict[setup_name]
if not points:
continue
concurrency_values = [p[0] for p in points]
rps_values = [p[1] for p in points]
ttft_values = [p[2] for p in points]
itl_values = [p[3] for p in points]
req_latency_values = [p[4] for p in points]
# RPS chart (top-left)
ax1.plot(concurrency_values, rps_values, marker="o", label=setup_name, linewidth=2, markersize=6)
# Request Latency chart (top-right)
ax2.plot(concurrency_values, req_latency_values, marker="o", label=setup_name, linewidth=2, markersize=6)
# TTFT chart (bottom-left)
ax3.plot(concurrency_values, ttft_values, marker="o", label=setup_name, linewidth=2, markersize=6)
# ITL chart (bottom-right)
ax4.plot(concurrency_values, itl_values, marker="o", label=setup_name, linewidth=2, markersize=6)
# Configure all charts after plotting data
axes = [ax1, ax2, ax3, ax4]
titles = ["RPS", "Request Latency", "TTFT", "ITL"]
ylabels = [
"Requests Per Second (RPS)",
"Request Latency (ms)",
"Time to First Token (ms)",
"Inter Token Latency (ms)",
]
for ax, title, ylabel in zip(axes, titles, ylabels, strict=False):
ax.set_xlabel("Concurrency", fontsize=12)
ax.set_ylabel(ylabel, fontsize=12)
ax.set_title(title, fontsize=14, fontweight="bold")
ax.set_xscale("log", base=2)
ax.set_xticks(all_concurrency_values)
ax.set_xticklabels([str(int(x)) for x in all_concurrency_values])
ax.grid(True, alpha=0.3)
# Add legend to the right-most subplot (top-right)
ax2.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
# Save the combined chart
combined_filename = os.path.join(benchmark_dir, f"{prefix}_benchmark_results.png")
plt.savefig(combined_filename, dpi=300, bbox_inches="tight")
plt.close()
print(f"Combined benchmark chart saved to {combined_filename}")
# Print grouping information
for replica_count, data_dict in replica_groups.items():
print(f"vLLM Replica {replica_count} setups: {list(data_dict.keys())}")
# Create separate charts for each replica group
for replica_count, data_dict in replica_groups.items():
prefix = f"vllm_replica{replica_count}"
title = f"vLLM Replicas={replica_count}"
create_charts(data_dict, prefix, title)
# Print summary
print("\nSummary:")
for setup_name, points in all_data.items():
print(f"{setup_name}: {len(points)} data points")
if __name__ == "__main__":
generate_charts()
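A hedged usage sketch for the chart generator above (the script's filename is not shown in this diff, so `generate_charts.py` is an assumed name):
```bash
# Run from the benchmarking directory so the default "results/" folder is found.
# The "# /// script" block at the top declares matplotlib, so uv can resolve it on the fly.
uv run generate_charts.py
```
The script writes one `vllm_replica<N>_benchmark_results.png` per vLLM replica group into the results directory.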


@ -0,0 +1,103 @@
#!/usr/bin/env bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Define benchmark configurations: (target, stack_replicas, vllm_replicas, stack_workers)
configs=(
"stack 1 1 1"
"stack 1 1 2"
"stack 1 1 4"
"vllm 1 1 -"
)
set -euo pipefail
# Get the directory where this script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
echo "Running comprehensive GuideLL benchmark suite..."
echo "Start time: $(date)"
# Default deployment names
STACK_DEPLOYMENT="llama-stack-benchmark-server"
VLLM_DEPLOYMENT="vllm-server"
# Scaling function
scale_deployments() {
local stack_replicas=$1
local vllm_replicas=$2
local workers=$3
echo "Scaling deployments..."
if [[ "$vllm_replicas" != "-" ]]; then
echo "Scaling $VLLM_DEPLOYMENT to $vllm_replicas replicas..."
kubectl scale deployment $VLLM_DEPLOYMENT --replicas=$vllm_replicas
kubectl rollout status deployment $VLLM_DEPLOYMENT --timeout=600s
fi
if [[ "$target" == "stack" ]]; then
if [[ "$stack_replicas" != "-" ]]; then
echo "Scaling $STACK_DEPLOYMENT to $stack_replicas replicas..."
kubectl scale deployment $STACK_DEPLOYMENT --replicas=$stack_replicas
kubectl rollout status deployment $STACK_DEPLOYMENT --timeout=600s
fi
if [[ "$workers" != "-" ]]; then
echo "Updating $STACK_DEPLOYMENT to use $workers workers..."
kubectl set env deployment/$STACK_DEPLOYMENT LLAMA_STACK_WORKERS=$workers
kubectl rollout status deployment $STACK_DEPLOYMENT --timeout=600s
fi
fi
echo "All scaling operations completed. Waiting additional 30s for services to stabilize..."
sleep 30
}
for config in "${configs[@]}"; do
read -r target stack_replicas vllm_replicas workers <<< "$config"
echo ""
echo "=========================================="
if [[ "$workers" != "-" ]]; then
echo "Running benchmark: $target (stack=$stack_replicas, vllm=$vllm_replicas, workers=$workers)"
else
echo "Running benchmark: $target (stack=$stack_replicas, vllm=$vllm_replicas)"
fi
echo "Start: $(date)"
echo "=========================================="
# Scale deployments before running benchmark
scale_deployments "$stack_replicas" "$vllm_replicas" "$workers"
# Generate output filename with setup info
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
if [[ "$target" == "stack" ]]; then
OUTPUT_FILE="results/guidellm-benchmark-${target}-s${stack_replicas}-sw${workers}-v${vllm_replicas}-${TIMESTAMP}.txt"
else
OUTPUT_FILE="results/guidellm-benchmark-${target}-v${vllm_replicas}-${TIMESTAMP}.txt"
fi
# Run the benchmark with the cluster as configured
"$SCRIPT_DIR/run-guidellm-benchmark.sh" \
--target "$target" \
--output-file "$OUTPUT_FILE"
echo "Completed: $(date)"
echo "Waiting 30 seconds before next benchmark..."
sleep 30
done
echo ""
echo "=========================================="
echo "All benchmarks completed!"
echo "End time: $(date)"
echo "=========================================="
echo ""
echo "Results files generated:"
ls -la results/guidellm-*.txt results/guidellm-*.json 2>/dev/null || echo "No result files found"
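For reference, a sketch of how the config tuples above map to the result filenames that the chart script later parses (timestamps are illustrative):
```bash
# "stack 1 1 2"  ->  results/guidellm-benchmark-stack-s1-sw2-v1-YYYYMMDD-HHMMSS.txt
# "vllm 1 1 -"   ->  results/guidellm-benchmark-vllm-v1-YYYYMMDD-HHMMSS.txt
```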


@ -0,0 +1,219 @@
#!/usr/bin/env bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
set -euo pipefail
# Default values
TARGET="stack"
MAX_SECONDS=60
PROMPT_TOKENS=512
OUTPUT_TOKENS=256
RATE_TYPE="concurrent"
RATE="1,2,4,8,16,32,64,128"
STACK_DEPLOYMENT="llama-stack-benchmark-server"
STACK_URL="http://llama-stack-benchmark-service:8323/v1/openai"
VLLM_DEPLOYMENT="vllm-server"
OUTPUT_FILE=""
# Parse command line arguments
usage() {
echo "Usage: $0 [options]"
echo "Options:"
echo " -t, --target <stack|vllm> Target to benchmark (default: stack)"
echo " -s, --max-seconds <seconds> Maximum duration in seconds (default: 60)"
echo " -p, --prompt-tokens <tokens> Number of prompt tokens (default: 512)"
echo " -o, --output-tokens <tokens> Number of output tokens (default: 256)"
echo " -r, --rate-type <type> Rate type (default: concurrent)"
echo " -c, --rate Rate (default: 1,2,4,8,16,32,64,128)"
echo " --output-file <path> Output file path (default: auto-generated)"
echo " --stack-deployment <name> Name of the stack deployment (default: llama-stack-benchmark-server)"
echo " --vllm-deployment <name> Name of the vllm deployment (default: vllm-server)"
echo " --stack-url <url> URL of the stack service (default: http://llama-stack-benchmark-service:8323/v1/openai)"
echo " -h, --help Show this help message"
echo ""
echo "Examples:"
echo " $0 --target vllm # Benchmark vLLM direct"
echo " $0 --target stack # Benchmark Llama Stack (default)"
echo " $0 -t vllm -s 60 -p 512 -o 256 # vLLM with custom parameters"
echo " $0 --output-file results/my-benchmark.txt # Specify custom output file"
echo " $0 --stack-deployment my-stack-server # Use custom stack deployment name"
}
while [[ $# -gt 0 ]]; do
case $1 in
-t|--target)
TARGET="$2"
shift 2
;;
-s|--max-seconds)
MAX_SECONDS="$2"
shift 2
;;
-p|--prompt-tokens)
PROMPT_TOKENS="$2"
shift 2
;;
-o|--output-tokens)
OUTPUT_TOKENS="$2"
shift 2
;;
-r|--rate-type)
RATE_TYPE="$2"
shift 2
;;
-c|--rate)
RATE="$2"
shift 2
;;
--output-file)
OUTPUT_FILE="$2"
shift 2
;;
--stack-deployment)
STACK_DEPLOYMENT="$2"
shift 2
;;
--vllm-deployment)
VLLM_DEPLOYMENT="$2"
shift 2
;;
--stack-url)
STACK_URL="$2"
shift 2
;;
-h|--help)
usage
exit 0
;;
*)
echo "Unknown option: $1"
usage
exit 1
;;
esac
done
# Validate target
if [[ "$TARGET" != "stack" && "$TARGET" != "vllm" ]]; then
echo "Error: Target must be 'stack' or 'vllm'"
usage
exit 1
fi
# Set configuration based on target
if [[ "$TARGET" == "vllm" ]]; then
BASE_URL="http://${VLLM_DEPLOYMENT}:8000"
JOB_NAME="guidellm-vllm-benchmark-job"
echo "Benchmarking vLLM direct with GuideLLM..."
else
BASE_URL="$STACK_URL"
JOB_NAME="guidellm-stack-benchmark-job"
echo "Benchmarking Llama Stack with GuideLLM..."
fi
echo "Configuration:"
echo " Target: $TARGET"
echo " Base URL: $BASE_URL"
echo " Max seconds: ${MAX_SECONDS}s"
echo " Prompt tokens: $PROMPT_TOKENS"
echo " Output tokens: $OUTPUT_TOKENS"
echo " Rate type: $RATE_TYPE"
if [[ "$TARGET" == "vllm" ]]; then
echo " vLLM deployment: $VLLM_DEPLOYMENT"
else
echo " Stack deployment: $STACK_DEPLOYMENT"
fi
echo ""
# Create temporary job yaml
TEMP_YAML="/tmp/guidellm-benchmark-job-temp-$(date +%s).yaml"
cat > "$TEMP_YAML" << EOF
apiVersion: batch/v1
kind: Job
metadata:
name: $JOB_NAME
namespace: default
spec:
template:
spec:
containers:
- name: guidellm-benchmark
image: python:3.11-slim
command: ["/bin/bash"]
args:
- "-c"
- |
# Install uv and guidellm
pip install uv &&
uv pip install --system guidellm &&
# Login to HuggingFace
uv pip install --system huggingface_hub &&
python -c "from huggingface_hub import login; login(token='\$HF_TOKEN')" &&
# Run GuideLLM benchmark and save output
export COLUMNS=200
GUIDELLM__PREFERRED_ROUTE="chat_completions" uv run guidellm benchmark run \\
--target "$BASE_URL" \\
--rate-type "$RATE_TYPE" \\
--max-seconds $MAX_SECONDS \\
--data "prompt_tokens=$PROMPT_TOKENS,output_tokens=$OUTPUT_TOKENS" \\
--model "$INFERENCE_MODEL" \\
--rate "$RATE" \\
--warmup-percent 0.05 \\
2>&1
env:
- name: INFERENCE_MODEL
value: "meta-llama/Llama-3.2-3B-Instruct"
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
resources:
requests:
memory: "4Gi"
cpu: "500m"
limits:
memory: "8Gi"
cpu: "2000m"
restartPolicy: Never
backoffLimit: 3
EOF
echo "Cleaning up any existing GuideLLM benchmark job..."
kubectl delete job $JOB_NAME 2>/dev/null || true
echo "Deploying GuideLLM benchmark Job..."
kubectl apply -f "$TEMP_YAML"
echo "Waiting for job to start..."
kubectl wait --for=condition=Ready pod -l job-name=$JOB_NAME --timeout=120s
# Prepare file names and create results directory
mkdir -p results
if [[ -z "$OUTPUT_FILE" ]]; then
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
OUTPUT_FILE="results/guidellm-benchmark-${TARGET}-${TIMESTAMP}.txt"
fi
echo "Following GuideLLM benchmark logs..."
kubectl logs -f job/$JOB_NAME
echo "Job completed. Checking final status..."
kubectl get job $JOB_NAME
# Save benchmark results using kubectl logs
echo "Saving benchmark results..."
kubectl logs job/$JOB_NAME > "$OUTPUT_FILE"
echo "Benchmark output saved to: $OUTPUT_FILE"
# Clean up temporary file
rm -f "$TEMP_YAML"


@ -98,25 +98,34 @@ data:
- provider_id: model-context-protocol - provider_id: model-context-protocol
provider_type: remote::model-context-protocol provider_type: remote::model-context-protocol
config: {} config: {}
metadata_store: storage:
type: postgres backends:
kv_default:
type: kv_postgres
host: ${env.POSTGRES_HOST:=localhost} host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432} port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack} db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack} user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack} password: ${env.POSTGRES_PASSWORD:=llamastack}
table_name: llamastack_kvstore table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
inference_store: sql_default:
type: postgres type: sql_postgres
host: ${env.POSTGRES_HOST:=localhost} host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432} port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack} db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack} user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack} password: ${env.POSTGRES_PASSWORD:=llamastack}
references:
metadata:
backend: kv_default
namespace: registry
inference:
backend: sql_default
table_name: inference_store
models: models:
- metadata: - metadata:
embedding_dimension: 384 embedding_dimension: 768
model_id: all-MiniLM-L6-v2 model_id: nomic-embed-text-v1.5
provider_id: sentence-transformers provider_id: sentence-transformers
model_type: embedding model_type: embedding
- model_id: ${env.INFERENCE_MODEL} - model_id: ${env.INFERENCE_MODEL}
@ -137,5 +146,4 @@ data:
port: 8323 port: 8323
kind: ConfigMap kind: ConfigMap
metadata: metadata:
creationTimestamp: null
name: llama-stack-config name: llama-stack-config


@ -58,14 +58,14 @@ spec:
value: "/etc/config/stack_run_config.yaml" value: "/etc/config/stack_run_config.yaml"
- name: LLAMA_STACK_WORKERS - name: LLAMA_STACK_WORKERS
value: "${LLAMA_STACK_WORKERS}" value: "${LLAMA_STACK_WORKERS}"
command: ["uvicorn", "llama_stack.core.server.server:create_app", "--host", "0.0.0.0", "--port", "8323", "--workers", "$LLAMA_STACK_WORKERS", "--factory"] command: ["uvicorn", "llama_stack.core.server.server:create_app", "--host", "0.0.0.0", "--port", "8323", "--workers", "$(LLAMA_STACK_WORKERS)", "--factory"]
ports: ports:
- containerPort: 8323 - containerPort: 8323
resources: resources:
requests: requests:
cpu: "${LLAMA_STACK_WORKERS}" cpu: "4"
limits: limits:
cpu: "${LLAMA_STACK_WORKERS}" cpu: "4"
volumeMounts: volumeMounts:
- name: llama-storage - name: llama-storage
mountPath: /root/.llama mountPath: /root/.llama


@ -6,7 +6,6 @@ apis:
- inference - inference
- files - files
- safety - safety
- telemetry
- tool_runtime - tool_runtime
- vector_io - vector_io
providers: providers:
@ -27,28 +26,24 @@ providers:
config: config:
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files} storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
metadata_store: metadata_store:
type: sqlite table_name: files_metadata
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db backend: sql_default
vector_io: vector_io:
- provider_id: ${env.ENABLE_CHROMADB:+chromadb} - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
provider_type: remote::chromadb provider_type: remote::chromadb
config: config:
url: ${env.CHROMADB_URL:=} url: ${env.CHROMADB_URL:=}
kvstore: persistence:
type: postgres namespace: vector_io::chroma_remote
host: ${env.POSTGRES_HOST:=localhost} backend: kv_default
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
files: files:
- provider_id: meta-reference-files - provider_id: meta-reference-files
provider_type: inline::localfs provider_type: inline::localfs
config: config:
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files} storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
metadata_store: metadata_store:
type: sqlite table_name: files_metadata
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db backend: sql_default
safety: safety:
- provider_id: llama-guard - provider_id: llama-guard
provider_type: inline::llama-guard provider_type: inline::llama-guard
@ -58,26 +53,15 @@ providers:
- provider_id: meta-reference - provider_id: meta-reference
provider_type: inline::meta-reference provider_type: inline::meta-reference
config: config:
persistence_store: persistence:
type: postgres agent_state:
host: ${env.POSTGRES_HOST:=localhost} namespace: agents
port: ${env.POSTGRES_PORT:=5432} backend: kv_default
db: ${env.POSTGRES_DB:=llamastack} responses:
user: ${env.POSTGRES_USER:=llamastack} table_name: responses
password: ${env.POSTGRES_PASSWORD:=llamastack} backend: sql_default
responses_store: max_write_queue_size: 10000
type: postgres num_writers: 4
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
sinks: ${env.TELEMETRY_SINKS:=console}
tool_runtime: tool_runtime:
- provider_id: brave-search - provider_id: brave-search
provider_type: remote::brave-search provider_type: remote::brave-search
@ -95,25 +79,40 @@ providers:
- provider_id: model-context-protocol - provider_id: model-context-protocol
provider_type: remote::model-context-protocol provider_type: remote::model-context-protocol
config: {} config: {}
metadata_store: storage:
type: postgres backends:
kv_default:
type: kv_postgres
host: ${env.POSTGRES_HOST:=localhost} host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432} port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack} db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack} user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack} password: ${env.POSTGRES_PASSWORD:=llamastack}
table_name: llamastack_kvstore table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
inference_store: sql_default:
type: postgres type: sql_postgres
host: ${env.POSTGRES_HOST:=localhost} host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432} port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack} db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack} user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack} password: ${env.POSTGRES_PASSWORD:=llamastack}
stores:
metadata:
namespace: registry
backend: kv_default
inference:
table_name: inference_store
backend: sql_default
max_write_queue_size: 10000
num_writers: 4
conversations:
table_name: openai_conversations
backend: sql_default
registered_resources:
models: models:
- metadata: - metadata:
embedding_dimension: 384 embedding_dimension: 768
model_id: all-MiniLM-L6-v2 model_id: nomic-embed-text-v1.5
provider_id: sentence-transformers provider_id: sentence-transformers
model_type: embedding model_type: embedding
- model_id: ${env.INFERENCE_MODEL} - model_id: ${env.INFERENCE_MODEL}
@ -132,3 +131,10 @@ tool_groups:
provider_id: rag-runtime provider_id: rag-runtime
server: server:
port: 8323 port: 8323
telemetry:
enabled: true
vector_stores:
default_provider_id: chromadb
default_embedding_model:
provider_id: sentence-transformers
model_id: nomic-ai/nomic-embed-text-v1.5


@ -0,0 +1,8 @@
These are the source-of-truth configuration files used to generate the Llama Stack client SDKs via Stainless.
- `openapi.yml`: this is the OpenAPI specification for the Llama Stack API.
- `openapi.stainless.yml`: this is the Stainless _configuration_ which instructs Stainless how to generate the client SDKs.
A small side note: the files use the `.yml` suffix, which is the suffix Stainless typically uses for its configuration files.
These files go hand-in-hand. As of now, only the `openapi.yml` file is automatically generated using the `run_openapi_generator.sh` script.
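A minimal sketch of the intended workflow (the generator script's location and arguments are assumptions; check the repository for the actual invocation):
```bash
# Regenerate the OpenAPI spec that Stainless consumes (script path assumed).
./run_openapi_generator.sh

# openapi.stainless.yml is maintained by hand; review both files together before
# handing them to Stainless for SDK generation.
```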


@ -0,0 +1,610 @@
# yaml-language-server: $schema=https://app.stainlessapi.com/config-internal.schema.json
organization:
# Name of your organization or company, used to determine the name of the client
# and headings.
name: llama-stack-client
docs: https://llama-stack.readthedocs.io/en/latest/
contact: llamastack@meta.com
security:
- {}
- BearerAuth: []
security_schemes:
BearerAuth:
type: http
scheme: bearer
# `targets` define the output targets and their customization options, such as
# whether to emit the Node SDK and what its package name should be.
targets:
node:
package_name: llama-stack-client
production_repo: llamastack/llama-stack-client-typescript
publish:
npm: false
python:
package_name: llama_stack_client
production_repo: llamastack/llama-stack-client-python
options:
use_uv: true
publish:
pypi: true
project_name: llama_stack_client
kotlin:
reverse_domain: com.llama_stack_client.api
production_repo: null
publish:
maven: false
go:
package_name: llama-stack-client
production_repo: llamastack/llama-stack-client-go
options:
enable_v2: true
back_compat_use_shared_package: false
# `client_settings` define settings for the API client, such as extra constructor
# arguments (used for authentication), retry behavior, idempotency, etc.
client_settings:
default_env_prefix: LLAMA_STACK_CLIENT
opts:
api_key:
type: string
read_env: LLAMA_STACK_CLIENT_API_KEY
auth: { security_scheme: BearerAuth }
nullable: true
# `environments` are a map of the name of the environment (e.g. "sandbox",
# "production") to the corresponding url to use.
environments:
production: http://any-hosted-llama-stack.com
# `pagination` defines [pagination schemes] which provide a template to match
# endpoints and generate next-page and auto-pagination helpers in the SDKs.
pagination:
- name: datasets_iterrows
type: offset
request:
dataset_id:
type: string
start_index:
type: integer
x-stainless-pagination-property:
purpose: offset_count_param
limit:
type: integer
response:
data:
type: array
items:
type: object
next_index:
type: integer
x-stainless-pagination-property:
purpose: offset_count_start_field
- name: openai_cursor_page
type: cursor
request:
limit:
type: integer
after:
type: string
x-stainless-pagination-property:
purpose: next_cursor_param
response:
data:
type: array
items: {}
has_more:
type: boolean
last_id:
type: string
x-stainless-pagination-property:
purpose: next_cursor_field
# `resources` define the structure and organization for your API, such as how
# methods and models are grouped together and accessed. See the [configuration
# guide] for more information.
#
# [configuration guide]:
# https://app.stainlessapi.com/docs/guides/configure#resources
resources:
$shared:
models:
agent_config: AgentConfig
interleaved_content_item: InterleavedContentItem
interleaved_content: InterleavedContent
param_type: ParamType
safety_violation: SafetyViolation
sampling_params: SamplingParams
scoring_result: ScoringResult
message: Message
user_message: UserMessage
completion_message: CompletionMessage
tool_response_message: ToolResponseMessage
system_message: SystemMessage
tool_call: ToolCall
query_result: RAGQueryResult
document: RAGDocument
query_config: RAGQueryConfig
response_format: ResponseFormat
toolgroups:
models:
tool_group: ToolGroup
list_tool_groups_response: ListToolGroupsResponse
methods:
register: post /v1/toolgroups
get: get /v1/toolgroups/{toolgroup_id}
list: get /v1/toolgroups
unregister: delete /v1/toolgroups/{toolgroup_id}
tools:
methods:
get: get /v1/tools/{tool_name}
list:
endpoint: get /v1/tools
paginated: false
tool_runtime:
models:
tool_def: ToolDef
tool_invocation_result: ToolInvocationResult
methods:
list_tools:
endpoint: get /v1/tool-runtime/list-tools
paginated: false
invoke_tool: post /v1/tool-runtime/invoke
subresources:
rag_tool:
methods:
insert: post /v1/tool-runtime/rag-tool/insert
query: post /v1/tool-runtime/rag-tool/query
responses:
models:
response_object_stream: OpenAIResponseObjectStream
response_object: OpenAIResponseObject
methods:
create:
type: http
endpoint: post /v1/responses
streaming:
stream_event_model: responses.response_object_stream
param_discriminator: stream
retrieve: get /v1/responses/{response_id}
list:
type: http
endpoint: get /v1/responses
delete:
type: http
endpoint: delete /v1/responses/{response_id}
subresources:
input_items:
methods:
list:
type: http
endpoint: get /v1/responses/{response_id}/input_items
conversations:
models:
conversation_object: Conversation
methods:
create:
type: http
endpoint: post /v1/conversations
retrieve: get /v1/conversations/{conversation_id}
update:
type: http
endpoint: post /v1/conversations/{conversation_id}
delete:
type: http
endpoint: delete /v1/conversations/{conversation_id}
subresources:
items:
methods:
get:
type: http
endpoint: get /v1/conversations/{conversation_id}/items/{item_id}
list:
type: http
endpoint: get /v1/conversations/{conversation_id}/items
create:
type: http
endpoint: post /v1/conversations/{conversation_id}/items
inspect:
models:
healthInfo: HealthInfo
providerInfo: ProviderInfo
routeInfo: RouteInfo
versionInfo: VersionInfo
methods:
health: get /v1/health
version: get /v1/version
embeddings:
models:
create_embeddings_response: OpenAIEmbeddingsResponse
methods:
create: post /v1/embeddings
chat:
models:
chat_completion_chunk: OpenAIChatCompletionChunk
subresources:
completions:
methods:
create:
type: http
endpoint: post /v1/chat/completions
streaming:
stream_event_model: chat.chat_completion_chunk
param_discriminator: stream
list:
type: http
endpoint: get /v1/chat/completions
retrieve:
type: http
endpoint: get /v1/chat/completions/{completion_id}
completions:
methods:
create:
type: http
endpoint: post /v1/completions
streaming:
param_discriminator: stream
vector_io:
models:
queryChunksResponse: QueryChunksResponse
methods:
insert: post /v1/vector-io/insert
query: post /v1/vector-io/query
vector_stores:
models:
vector_store: VectorStoreObject
list_vector_stores_response: VectorStoreListResponse
vector_store_delete_response: VectorStoreDeleteResponse
vector_store_search_response: VectorStoreSearchResponsePage
methods:
create: post /v1/vector_stores
list:
endpoint: get /v1/vector_stores
retrieve: get /v1/vector_stores/{vector_store_id}
update: post /v1/vector_stores/{vector_store_id}
delete: delete /v1/vector_stores/{vector_store_id}
search: post /v1/vector_stores/{vector_store_id}/search
subresources:
files:
models:
vector_store_file: VectorStoreFileObject
methods:
list: get /v1/vector_stores/{vector_store_id}/files
retrieve: get /v1/vector_stores/{vector_store_id}/files/{file_id}
update: post /v1/vector_stores/{vector_store_id}/files/{file_id}
delete: delete /v1/vector_stores/{vector_store_id}/files/{file_id}
create: post /v1/vector_stores/{vector_store_id}/files
content: get /v1/vector_stores/{vector_store_id}/files/{file_id}/content
file_batches:
models:
vector_store_file_batches: VectorStoreFileBatchObject
list_vector_store_files_in_batch_response: VectorStoreFilesListInBatchResponse
methods:
create: post /v1/vector_stores/{vector_store_id}/file_batches
retrieve: get /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}
list_files: get /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/files
cancel: post /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/cancel
models:
models:
model: Model
list_models_response: ListModelsResponse
methods:
retrieve: get /v1/models/{model_id}
list:
endpoint: get /v1/models
paginated: false
register: post /v1/models
unregister: delete /v1/models/{model_id}
subresources:
openai:
methods:
list:
endpoint: get /v1/models
paginated: false
providers:
models:
list_providers_response: ListProvidersResponse
methods:
list:
endpoint: get /v1/providers
paginated: false
retrieve: get /v1/providers/{provider_id}
routes:
models:
list_routes_response: ListRoutesResponse
methods:
list:
endpoint: get /v1/inspect/routes
paginated: false
moderations:
models:
create_response: ModerationObject
methods:
create: post /v1/moderations
safety:
models:
run_shield_response: RunShieldResponse
methods:
run_shield: post /v1/safety/run-shield
shields:
models:
shield: Shield
list_shields_response: ListShieldsResponse
methods:
retrieve: get /v1/shields/{identifier}
list:
endpoint: get /v1/shields
paginated: false
register: post /v1/shields
delete: delete /v1/shields/{identifier}
synthetic_data_generation:
models:
syntheticDataGenerationResponse: SyntheticDataGenerationResponse
methods:
generate: post /v1/synthetic-data-generation/generate
telemetry:
models:
span_with_status: SpanWithStatus
trace: Trace
query_spans_response: QuerySpansResponse
event: Event
query_condition: QueryCondition
methods:
query_traces:
endpoint: post /v1alpha/telemetry/traces
skip_test_reason: 'unsupported query params in java / kotlin'
get_span_tree: post /v1alpha/telemetry/spans/{span_id}/tree
query_spans:
endpoint: post /v1alpha/telemetry/spans
skip_test_reason: 'unsupported query params in java / kotlin'
query_metrics:
endpoint: post /v1alpha/telemetry/metrics/{metric_name}
skip_test_reason: 'unsupported query params in java / kotlin'
# log_event: post /v1alpha/telemetry/events
save_spans_to_dataset: post /v1alpha/telemetry/spans/export
get_span: get /v1alpha/telemetry/traces/{trace_id}/spans/{span_id}
get_trace: get /v1alpha/telemetry/traces/{trace_id}
scoring:
methods:
score: post /v1/scoring/score
score_batch: post /v1/scoring/score-batch
scoring_functions:
methods:
retrieve: get /v1/scoring-functions/{scoring_fn_id}
list:
endpoint: get /v1/scoring-functions
paginated: false
register: post /v1/scoring-functions
models:
scoring_fn: ScoringFn
scoring_fn_params: ScoringFnParams
list_scoring_functions_response: ListScoringFunctionsResponse
benchmarks:
methods:
retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}
list:
endpoint: get /v1alpha/eval/benchmarks
paginated: false
register: post /v1alpha/eval/benchmarks
models:
benchmark: Benchmark
list_benchmarks_response: ListBenchmarksResponse
files:
methods:
create: post /v1/files
list: get /v1/files
retrieve: get /v1/files/{file_id}
delete: delete /v1/files/{file_id}
content: get /v1/files/{file_id}/content
models:
file: OpenAIFileObject
list_files_response: ListOpenAIFileResponse
delete_file_response: OpenAIFileDeleteResponse
alpha:
subresources:
inference:
methods:
rerank: post /v1alpha/inference/rerank
post_training:
models:
algorithm_config: AlgorithmConfig
post_training_job: PostTrainingJob
list_post_training_jobs_response: ListPostTrainingJobsResponse
methods:
preference_optimize: post /v1alpha/post-training/preference-optimize
supervised_fine_tune: post /v1alpha/post-training/supervised-fine-tune
subresources:
job:
methods:
artifacts: get /v1alpha/post-training/job/artifacts
cancel: post /v1alpha/post-training/job/cancel
status: get /v1alpha/post-training/job/status
list:
endpoint: get /v1alpha/post-training/jobs
paginated: false
eval:
methods:
evaluate_rows: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations
run_eval: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs
evaluate_rows_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations
run_eval_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs
subresources:
jobs:
methods:
cancel: delete /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}
status: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}
retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result
models:
evaluate_response: EvaluateResponse
benchmark_config: BenchmarkConfig
job: Job
agents:
methods:
create: post /v1alpha/agents
list: get /v1alpha/agents
retrieve: get /v1alpha/agents/{agent_id}
delete: delete /v1alpha/agents/{agent_id}
models:
inference_step: InferenceStep
tool_execution_step: ToolExecutionStep
tool_response: ToolResponse
shield_call_step: ShieldCallStep
memory_retrieval_step: MemoryRetrievalStep
subresources:
session:
models:
session: Session
methods:
list: get /v1alpha/agents/{agent_id}/sessions
create: post /v1alpha/agents/{agent_id}/session
delete: delete /v1alpha/agents/{agent_id}/session/{session_id}
retrieve: get /v1alpha/agents/{agent_id}/session/{session_id}
steps:
methods:
retrieve: get /v1alpha/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}
turn:
models:
turn: Turn
turn_response_event: AgentTurnResponseEvent
agent_turn_response_stream_chunk: AgentTurnResponseStreamChunk
methods:
create:
type: http
endpoint: post /v1alpha/agents/{agent_id}/session/{session_id}/turn
streaming:
stream_event_model: alpha.agents.turn.agent_turn_response_stream_chunk
param_discriminator: stream
retrieve: get /v1alpha/agents/{agent_id}/session/{session_id}/turn/{turn_id}
resume:
type: http
endpoint: post /v1alpha/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume
streaming:
stream_event_model: alpha.agents.turn.agent_turn_response_stream_chunk
param_discriminator: stream
beta:
subresources:
datasets:
models:
list_datasets_response: ListDatasetsResponse
methods:
register: post /v1beta/datasets
retrieve: get /v1beta/datasets/{dataset_id}
list:
endpoint: get /v1beta/datasets
paginated: false
unregister: delete /v1beta/datasets/{dataset_id}
iterrows: get /v1beta/datasetio/iterrows/{dataset_id}
appendrows: post /v1beta/datasetio/append-rows/{dataset_id}
settings:
license: MIT
unwrap_response_fields: [ data ]
openapi:
transformations:
- command: renameValue
reason: pydantic reserved name
args:
filter:
only:
- '$.components.schemas.InferenceStep.properties.model_response'
rename:
python:
property_name: 'inference_model_response'
# - command: renameValue
# reason: pydantic reserved name
# args:
# filter:
# only:
# - '$.components.schemas.Model.properties.model_type'
# rename:
# python:
# property_name: 'type'
- command: mergeObject
reason: Better return_type using enum
args:
target:
- '$.components.schemas'
object:
ReturnType:
additionalProperties: false
properties:
type:
enum:
- string
- number
- boolean
- array
- object
- json
- union
- chat_completion_input
- completion_input
- agent_turn_input
required:
- type
type: object
- command: replaceProperties
reason: Replace return type properties with better model (see above)
args:
filter:
only:
- '$.components.schemas.ScoringFn.properties.return_type'
- '$.components.schemas.RegisterScoringFunctionRequest.properties.return_type'
value:
$ref: '#/components/schemas/ReturnType'
- command: oneOfToAnyOf
reason: Prism (mock server) doesn't like one of our requests as it technically matches multiple variants
- reason: For better names
command: extractToRefs
args:
ref:
target: '$.components.schemas.ToolCallDelta.properties.tool_call'
name: '#/components/schemas/ToolCallOrString'
# `readme` is used to configure the code snippets that will be rendered in the
# README.md of various SDKs. In particular, you can change the `headline`
# snippet's endpoint and the arguments to call it with.
readme:
example_requests:
default:
type: request
endpoint: post /v1/chat/completions
params: &ref_0 {}
headline:
type: request
endpoint: post /v1/models
params: *ref_0
pagination:
type: request
endpoint: post /v1/chat/completions
params: {}

137
containers/Containerfile Normal file

@ -0,0 +1,137 @@
# syntax=docker/dockerfile:1.6
#
# This Dockerfile is used to build the Llama Stack container image.
# Example:
# docker build \
# -f containers/Containerfile \
# --build-arg DISTRO_NAME=starter \
# --tag llama-stack:starter .
ARG BASE_IMAGE=python:3.12-slim
FROM ${BASE_IMAGE}
ARG INSTALL_MODE="pypi"
ARG LLAMA_STACK_DIR="/workspace"
ARG LLAMA_STACK_CLIENT_DIR=""
ARG PYPI_VERSION=""
ARG TEST_PYPI_VERSION=""
ARG KEEP_WORKSPACE=""
ARG DISTRO_NAME="starter"
ARG RUN_CONFIG_PATH=""
ARG UV_HTTP_TIMEOUT=500
ENV UV_HTTP_TIMEOUT=${UV_HTTP_TIMEOUT}
ENV PYTHONDONTWRITEBYTECODE=1
ENV PIP_DISABLE_PIP_VERSION_CHECK=1
WORKDIR /app
RUN set -eux; \
if command -v dnf >/dev/null 2>&1; then \
dnf -y update && \
dnf install -y iputils git net-tools wget \
vim-minimal python3.12 python3.12-pip python3.12-wheel \
python3.12-setuptools python3.12-devel gcc gcc-c++ make && \
ln -sf /usr/bin/pip3.12 /usr/local/bin/pip && \
ln -sf /usr/bin/python3.12 /usr/local/bin/python && \
dnf clean all; \
elif command -v apt-get >/dev/null 2>&1; then \
apt-get update && \
apt-get install -y --no-install-recommends \
iputils-ping net-tools iproute2 dnsutils telnet \
curl wget git procps psmisc lsof traceroute bubblewrap \
gcc g++ && \
rm -rf /var/lib/apt/lists/*; \
else \
echo "Unsupported base image: expected dnf or apt-get" >&2; \
exit 1; \
fi
RUN pip install --no-cache uv
ENV UV_SYSTEM_PYTHON=1
ENV INSTALL_MODE=${INSTALL_MODE}
ENV LLAMA_STACK_DIR=${LLAMA_STACK_DIR}
ENV LLAMA_STACK_CLIENT_DIR=${LLAMA_STACK_CLIENT_DIR}
ENV PYPI_VERSION=${PYPI_VERSION}
ENV TEST_PYPI_VERSION=${TEST_PYPI_VERSION}
ENV KEEP_WORKSPACE=${KEEP_WORKSPACE}
ENV DISTRO_NAME=${DISTRO_NAME}
ENV RUN_CONFIG_PATH=${RUN_CONFIG_PATH}
# Copy the repository so editable installs and run configurations are available.
COPY . /workspace
# Install the client package if it is provided
# NOTE: this is installed before llama-stack since llama-stack depends on llama-stack-client-python
RUN set -eux; \
if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then \
if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ]; then \
echo "LLAMA_STACK_CLIENT_DIR is set but $LLAMA_STACK_CLIENT_DIR does not exist" >&2; \
exit 1; \
fi; \
uv pip install --no-cache -e "$LLAMA_STACK_CLIENT_DIR"; \
fi;
# Install llama-stack
RUN set -eux; \
if [ "$INSTALL_MODE" = "editable" ]; then \
if [ ! -d "$LLAMA_STACK_DIR" ]; then \
echo "INSTALL_MODE=editable requires LLAMA_STACK_DIR to point to a directory inside the build context" >&2; \
exit 1; \
fi; \
uv pip install --no-cache -e "$LLAMA_STACK_DIR"; \
elif [ "$INSTALL_MODE" = "test-pypi" ]; then \
uv pip install --no-cache fastapi libcst; \
if [ -n "$TEST_PYPI_VERSION" ]; then \
uv pip install --no-cache --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match "llama-stack==$TEST_PYPI_VERSION"; \
else \
uv pip install --no-cache --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match llama-stack; \
fi; \
else \
if [ -n "$PYPI_VERSION" ]; then \
uv pip install --no-cache "llama-stack==$PYPI_VERSION"; \
else \
uv pip install --no-cache llama-stack; \
fi; \
fi;
# Install the dependencies for the distribution
RUN set -eux; \
if [ -z "$DISTRO_NAME" ]; then \
echo "DISTRO_NAME must be provided" >&2; \
exit 1; \
fi; \
deps="$(llama stack list-deps "$DISTRO_NAME")"; \
if [ -n "$deps" ]; then \
printf '%s\n' "$deps" | xargs -L1 uv pip install --no-cache; \
fi
# Cleanup
RUN set -eux; \
pip uninstall -y uv; \
should_remove=1; \
if [ -n "$KEEP_WORKSPACE" ]; then should_remove=0; fi; \
if [ "$INSTALL_MODE" = "editable" ]; then should_remove=0; fi; \
case "$RUN_CONFIG_PATH" in \
/workspace*) should_remove=0 ;; \
esac; \
if [ "$should_remove" -eq 1 ] && [ -d /workspace ]; then rm -rf /workspace; fi
RUN cat <<'EOF' >/usr/local/bin/llama-stack-entrypoint.sh
#!/bin/sh
set -e
if [ -n "$RUN_CONFIG_PATH" ] && [ -f "$RUN_CONFIG_PATH" ]; then
exec llama stack run "$RUN_CONFIG_PATH" "$@"
fi
if [ -n "$DISTRO_NAME" ]; then
exec llama stack run "$DISTRO_NAME" "$@"
fi
exec llama stack run "$@"
EOF
RUN chmod +x /usr/local/bin/llama-stack-entrypoint.sh
RUN mkdir -p /.llama /.cache && chmod -R g+rw /app /.llama /.cache
ENTRYPOINT ["/usr/local/bin/llama-stack-entrypoint.sh"]

View file

@ -1,20 +0,0 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

View file

@ -1,14 +1,53 @@
# Llama Stack Documentation
Here's a collection of comprehensive guides, examples, and resources for building AI applications with Llama Stack. For the complete documentation, visit our [Github page](https://llamastack.github.io/getting_started/quickstart).
## Render locally
From the llama-stack `docs/` directory, run the following commands to render the docs locally:
```bash
npm install
npm run gen-api-docs all
npm run build
npm run serve
```
You can open up the docs in your browser at http://localhost:3000
## File Import System
This documentation uses `remark-code-import` to import files directly from the repository, eliminating copy-paste maintenance. Files are automatically embedded during build time.
### Importing Code Files
To import Python code (or any code files) with syntax highlighting, use this syntax in `.mdx` files:
````markdown
```python file=./demo_script.py title="demo_script.py"
```
````
This automatically imports the file content and displays it as a formatted code block with Python syntax highlighting.
**Note:** Paths are relative to the current `.mdx` file location, not the repository root.
### Importing Markdown Files as Content
For importing and rendering markdown files (like CONTRIBUTING.md), use the raw-loader approach:
```jsx
import Contributing from '!!raw-loader!../../../CONTRIBUTING.md';
import ReactMarkdown from 'react-markdown';
<ReactMarkdown>{Contributing}</ReactMarkdown>
```
**Requirements:**
- Install dependencies: `npm install --save-dev raw-loader react-markdown`
**Path Resolution:**
- For `remark-code-import`: Paths are relative to the current `.mdx` file location
- For `raw-loader`: Paths are relative to the current `.mdx` file location
- Use `../` to navigate up directories as needed
## Content

View file

@ -1,136 +0,0 @@
@import url("theme.css");
/* Horizontal Navigation Bar */
.horizontal-nav {
background-color: #ffffff;
border-bottom: 1px solid #e5e5e5;
padding: 0;
position: fixed;
top: 0;
left: 0;
right: 0;
z-index: 1050;
height: 50px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
[data-theme="dark"] .horizontal-nav {
background-color: #1a1a1a;
border-bottom: 1px solid #333;
}
.horizontal-nav .nav-container {
max-width: 1200px;
margin: 0 auto;
display: flex;
align-items: center;
justify-content: space-between;
padding: 0 20px;
height: 100%;
}
.horizontal-nav .nav-brand {
font-size: 18px;
font-weight: 600;
color: #333;
text-decoration: none;
}
[data-theme="dark"] .horizontal-nav .nav-brand {
color: #fff;
}
.horizontal-nav .nav-links {
display: flex;
align-items: center;
gap: 30px;
list-style: none;
margin: 0;
padding: 0;
}
.horizontal-nav .nav-links a {
color: #666;
text-decoration: none;
font-size: 14px;
font-weight: 500;
padding: 8px 12px;
border-radius: 6px;
transition: all 0.2s ease;
}
.horizontal-nav .nav-links a:hover,
.horizontal-nav .nav-links a.active {
color: #333;
background-color: #f5f5f5;
}
.horizontal-nav .nav-links a.active {
font-weight: 600;
}
[data-theme="dark"] .horizontal-nav .nav-links a {
color: #ccc;
}
[data-theme="dark"] .horizontal-nav .nav-links a:hover,
[data-theme="dark"] .horizontal-nav .nav-links a.active {
color: #fff;
background-color: #333;
}
.horizontal-nav .nav-links .github-link {
display: flex;
align-items: center;
gap: 6px;
}
.horizontal-nav .nav-links .github-icon {
width: 16px;
height: 16px;
fill: currentColor;
}
/* Adjust main content to account for fixed nav */
.wy-nav-side {
top: 50px;
height: calc(100vh - 50px);
}
.wy-nav-content-wrap {
margin-top: 50px;
}
.wy-nav-content {
max-width: 90%;
}
.wy-nav-side {
/* background: linear-gradient(45deg, #2980B9, #16A085); */
background: linear-gradient(90deg, #332735, #1b263c);
}
.wy-side-nav-search {
background-color: transparent !important;
}
.hide-title h1 {
display: none;
}
h2, h3, h4 {
font-weight: normal;
}
html[data-theme="dark"] .rst-content div[class^="highlight"] {
background-color: #0b0b0b;
}
pre {
white-space: pre-wrap !important;
word-break: break-all;
}
[data-theme="dark"] .mermaid {
background-color: #f4f4f6 !important;
border-radius: 6px;
padding: 0.5em;
}

View file

@ -1,32 +0,0 @@
document.addEventListener("DOMContentLoaded", function () {
const prefersDark = window.matchMedia("(prefers-color-scheme: dark)").matches;
const htmlElement = document.documentElement;
// Check if theme is saved in localStorage
const savedTheme = localStorage.getItem("sphinx-rtd-theme");
if (savedTheme) {
// Use the saved theme preference
htmlElement.setAttribute("data-theme", savedTheme);
document.body.classList.toggle("dark", savedTheme === "dark");
} else {
// Fall back to system preference
const theme = prefersDark ? "dark" : "light";
htmlElement.setAttribute("data-theme", theme);
document.body.classList.toggle("dark", theme === "dark");
// Save initial preference
localStorage.setItem("sphinx-rtd-theme", theme);
}
// Listen for theme changes from the existing toggle
const observer = new MutationObserver(function(mutations) {
mutations.forEach(function(mutation) {
if (mutation.attributeName === "data-theme") {
const currentTheme = htmlElement.getAttribute("data-theme");
localStorage.setItem("sphinx-rtd-theme", currentTheme);
}
});
});
observer.observe(htmlElement, { attributes: true });
});

View file

@ -1,44 +0,0 @@
// Horizontal Navigation Bar for Llama Stack Documentation
document.addEventListener('DOMContentLoaded', function() {
// Create the horizontal navigation HTML
const navHTML = `
<nav class="horizontal-nav">
<div class="nav-container">
<a href="/" class="nav-brand">Llama Stack</a>
<ul class="nav-links">
<li><a href="/">Docs</a></li>
<li><a href="/references/api_reference/">API Reference</a></li>
<li><a href="https://github.com/meta-llama/llama-stack" target="_blank" class="github-link">
<svg class="github-icon" viewBox="0 0 16 16" aria-hidden="true">
<path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0016 8c0-4.42-3.58-8-8-8z"/>
</svg>
GitHub
</a></li>
</ul>
</div>
</nav>
`;
// Insert the navigation at the beginning of the body
document.body.insertAdjacentHTML('afterbegin', navHTML);
// Update navigation links based on current page
updateActiveNav();
});
function updateActiveNav() {
const currentPath = window.location.pathname;
const navLinks = document.querySelectorAll('.horizontal-nav .nav-links a');
navLinks.forEach(link => {
// Remove any existing active classes
link.classList.remove('active');
// Add active class based on current path
if (currentPath === '/' && link.getAttribute('href') === '/') {
link.classList.add('active');
} else if (currentPath.includes('/references/api_reference/') && link.getAttribute('href').includes('api_reference')) {
link.classList.add('active');
}
});
}

View file

@ -1,14 +0,0 @@
document.addEventListener('keydown', function(event) {
// command+K or ctrl+K
if ((event.metaKey || event.ctrlKey) && event.key === 'k') {
event.preventDefault();
document.querySelector('.search-input, .search-field, input[name="q"]').focus();
}
// forward slash
if (event.key === '/' &&
!event.target.matches('input, textarea, select')) {
event.preventDefault();
document.querySelector('.search-input, .search-field, input[name="q"]').focus();
}
});

Binary file not shown.

Before

Width:  |  Height:  |  Size: 70 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 196 KiB

View file

@ -1,24 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
import time
def pytest_collection_modifyitems(items):
for item in items:
item.name = item.name.replace(' ', '_')
def pytest_runtest_teardown(item):
interval_seconds = os.getenv("LLAMA_STACK_TEST_INTERVAL_SECONDS")
if interval_seconds:
time.sleep(float(interval_seconds))
def pytest_configure(config):
config.option.tbstyle = "short"
config.option.disable_warnings = True

View file

@ -0,0 +1,163 @@
# Evaluation
## Evaluation Concepts
The Llama Stack Evaluation flow allows you to run evaluations on your GenAI application datasets or pre-registered benchmarks.
We introduce a set of APIs in Llama Stack for supporting running evaluations of LLM applications:
- `/datasetio` + `/datasets` API
- `/scoring` + `/scoring_functions` API
- `/eval` + `/benchmarks` API
This guide goes over these sets of APIs and the developer experience flow of using Llama Stack to run evaluations for different use cases. Check out our Colab notebook with working evaluation examples [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing).
The Evaluation APIs are associated with a set of Resources. Please visit the Resources section in our [Core Concepts](../concepts/index.mdx) guide for a better high-level understanding; a minimal client sketch follows the list below.
- **DatasetIO**: defines interface with datasets and data loaders.
- Associated with `Dataset` resource.
- **Scoring**: evaluate outputs of the system.
- Associated with `ScoringFunction` resource. We provide a suite of out-of-the box scoring functions and also the ability for you to add custom evaluators. These scoring functions are the core part of defining an evaluation task to output evaluation metrics.
- **Eval**: generate outputs (via Inference or Agents) and perform scoring.
- Associated with `Benchmark` resource.
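To make the mapping concrete, here is a minimal client-side sketch that touches each API group. It reuses only the calls shown later in this guide; the dataset URI and identifiers are placeholders.
```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# /datasetio + /datasets: register a Dataset resource (placeholder URI)
client.datasets.register(
    purpose="evaluation",
    source={"type": "uri", "uri": "huggingface://datasets/llamastack/evaluation_dataset"},
    dataset_id="my_eval_dataset",
)

# /scoring + /scoring_functions: score pre-annotated rows with a built-in ScoringFunction
scores = client.scoring.score(
    input_rows=[{"input_query": "Capital of France?", "generated_answer": "Paris", "expected_answer": "Paris"}],
    scoring_functions={"basic::subset_of": None},
)

# /eval + /benchmarks: generate outputs and score them in one step (see the usage example below)
eval_result = client.eval.run_evaluation(
    dataset_id="my_eval_dataset",
    scoring_functions=["accuracy"],
    model_id="my_model",
)
```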
## Evaluation Providers
Llama Stack provides multiple evaluation providers:
- **Meta Reference** (`inline::meta-reference`) - Meta's reference implementation with multi-language support
- **NVIDIA** (`remote::nvidia`) - NVIDIA's evaluation platform integration
### Meta Reference
Meta's reference implementation of evaluation tasks with support for multiple languages and evaluation metrics.
#### Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `kvstore` | `RedisKVStoreConfig \| SqliteKVStoreConfig \| PostgresKVStoreConfig \| MongoDBKVStoreConfig` | No | sqlite | Key-value store configuration |
#### Sample Configuration
```yaml
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/meta_reference_eval.db
```
#### Features
- Multi-language evaluation support
- Comprehensive evaluation metrics
- Integration with various key-value stores (SQLite, Redis, PostgreSQL, MongoDB)
- Built-in support for popular benchmarks
### NVIDIA
NVIDIA's evaluation provider for running evaluation tasks on NVIDIA's platform.
#### Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `evaluator_url` | `str` | No | http://0.0.0.0:7331 | The url for accessing the evaluator service |
#### Sample Configuration
```yaml
evaluator_url: ${env.NVIDIA_EVALUATOR_URL:=http://localhost:7331}
```
#### Features
- Integration with NVIDIA's evaluation platform
- Remote evaluation capabilities
- Scalable evaluation processing
## Open-benchmark Eval
### List of open-benchmarks Llama Stack support
Llama Stack pre-registers several popular open-benchmarks so you can easily evaluate model performance via the CLI.
The list of open-benchmarks we currently support:
- [MMLU-COT](https://arxiv.org/abs/2009.03300) (Measuring Massive Multitask Language Understanding): Benchmark designed to comprehensively evaluate the breadth and depth of a model's academic and professional understanding
- [GPQA-COT](https://arxiv.org/abs/2311.12022) (A Graduate-Level Google-Proof Q&A Benchmark): A challenging benchmark of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry.
- [SimpleQA](https://openai.com/index/introducing-simpleqa/): Benchmark designed to assess a model's ability to answer short, fact-seeking questions.
- [MMMU](https://arxiv.org/abs/2311.16502) (A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI): Benchmark designed to evaluate multimodal models.
You can follow this [contributing guide](../references/evals_reference/index.mdx#open-benchmark-contributing-guide) to add more open-benchmarks to Llama Stack.
### Run evaluation on open-benchmarks via CLI
We have built-in functionality to run the supported open-benchmarks using the `llama-stack-client` CLI.
#### Spin up Llama Stack server
Spin up the Llama Stack server with the 'open-benchmark' template:
```bash
llama stack run llama_stack/distributions/open-benchmark/run.yaml
```
#### Run eval CLI
There are three required inputs to run a benchmark eval:
- `list of benchmark_ids`: The list of benchmark ids to run evaluation on
- `model-id`: The model id to evaluate on
- `output_dir`: Path to store the evaluation results
```bash
llama-stack-client eval run-benchmark <benchmark_id_1> <benchmark_id_2> ... \
--model_id <model id to evaluate on> \
--output_dir <directory to store the evaluation results>
```
You can run
```bash
llama-stack-client eval run-benchmark help
```
to see descriptions of all the flags that `eval run-benchmark` supports.
In the output log, you can find the file path that contains your evaluation results. Open that file to see your aggregate evaluation results.
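Beyond the pre-registered benchmarks, additional benchmarks can also be registered programmatically before running them with the CLI above. This is only a hedged sketch: the keyword names below are assumptions about the `benchmarks` resource API, so verify them against your installed client version.
```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Hypothetical benchmark registration; identifiers are placeholders.
client.benchmarks.register(
    benchmark_id="meta-reference-mmlu",
    dataset_id="mmlu",
    scoring_functions=["basic::regex_parser_multiple_choice_answer"],
)

# The registered id can then be passed to the CLI:
#   llama-stack-client eval run-benchmark meta-reference-mmlu --model_id <model> --output_dir <dir>
```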
## Usage Example
Here's a basic example of using the evaluation API:
```python
from llama_stack_client import LlamaStackClient
client = LlamaStackClient(base_url="http://localhost:8321")
# Register a dataset for evaluation
client.datasets.register(
purpose="evaluation",
source={
"type": "uri",
"uri": "huggingface://datasets/llamastack/evaluation_dataset"
},
dataset_id="my_eval_dataset"
)
# Run evaluation
eval_result = client.eval.run_evaluation(
dataset_id="my_eval_dataset",
scoring_functions=["accuracy", "bleu"],
model_id="my_model"
)
print(f"Evaluation completed: {eval_result}")
```
## Best Practices
- **Choose appropriate providers**: Use Meta Reference for comprehensive evaluation, NVIDIA for platform-specific needs
- **Configure storage properly**: Ensure your key-value store configuration matches your performance requirements
- **Monitor evaluation progress**: Large evaluations can take time - implement proper monitoring
- **Use appropriate scoring functions**: Select scoring metrics that align with your evaluation goals
## What's Next?
- Check out our Colab notebook on working examples with running benchmark evaluations [here](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb#scrollTo=mxLCsP4MvFqP).
- Check out our [Building Applications - Evaluation](../building_applications/evals.mdx) guide for more details on how to use the Evaluation APIs to evaluate your applications.
- Check out our [Evaluation Reference](../references/evals_reference/index.mdx) for more details on the APIs.
- Explore the [Scoring](./scoring.mdx) documentation for available scoring functions.

View file

@ -0,0 +1,305 @@
# Post-Training
Post-training in Llama Stack allows you to fine-tune models using various providers and frameworks. This section covers all available post-training providers and how to use them effectively.
## Overview
Llama Stack provides multiple post-training providers:
- **HuggingFace SFTTrainer** (`inline::huggingface`) - Fine-tuning using HuggingFace ecosystem
- **TorchTune** (`inline::torchtune`) - Fine-tuning using Meta's TorchTune framework
- **NVIDIA** (`remote::nvidia`) - Fine-tuning using NVIDIA's platform
## HuggingFace SFTTrainer
[HuggingFace SFTTrainer](https://huggingface.co/docs/trl/en/sft_trainer) is an inline post-training provider for Llama Stack. It allows you to run supervised fine-tuning on a variety of models using many datasets.
### Features
- Simple access through the post_training API
- Fully integrated with Llama Stack
- GPU support, CPU support, and MPS support (MacOS Metal Performance Shaders)
### Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `device` | `str` | No | cuda | |
| `distributed_backend` | `Literal['fsdp', 'deepspeed']` | No | | |
| `checkpoint_format` | `Literal['full_state', 'huggingface']` | No | huggingface | |
| `chat_template` | `str` | No | | |
| `model_specific_config` | `dict` | No | `{'trust_remote_code': True, 'attn_implementation': 'sdpa'}` | |
| `max_seq_length` | `int` | No | 2048 | |
| `gradient_checkpointing` | `bool` | No | False | |
| `save_total_limit` | `int` | No | 3 | |
| `logging_steps` | `int` | No | 10 | |
| `warmup_ratio` | `float` | No | 0.1 | |
| `weight_decay` | `float` | No | 0.01 | |
| `dataloader_num_workers` | `int` | No | 4 | |
| `dataloader_pin_memory` | `bool` | No | True | |
### Sample Configuration
```yaml
checkpoint_format: huggingface
distributed_backend: null
device: cpu
```
### Setup
You can access the HuggingFace trainer via the `starter` distribution:
```bash
llama stack list-deps starter | xargs -L1 uv pip install
llama stack run starter
```
### Usage Example
```python
import time
import uuid
from llama_stack_client.types import (
post_training_supervised_fine_tune_params,
algorithm_config_param,
)
def create_http_client():
from llama_stack_client import LlamaStackClient
return LlamaStackClient(base_url="http://localhost:8321")
client = create_http_client()
# Example Dataset
client.datasets.register(
purpose="post-training/messages",
source={
"type": "uri",
"uri": "huggingface://datasets/llamastack/simpleqa?split=train",
},
dataset_id="simpleqa",
)
training_config = post_training_supervised_fine_tune_params.TrainingConfig(
data_config=post_training_supervised_fine_tune_params.TrainingConfigDataConfig(
batch_size=32,
data_format="instruct",
dataset_id="simpleqa",
shuffle=True,
),
gradient_accumulation_steps=1,
max_steps_per_epoch=0,
max_validation_steps=1,
n_epochs=4,
)
algorithm_config = algorithm_config_param.LoraFinetuningConfig(
alpha=1,
apply_lora_to_mlp=True,
apply_lora_to_output=False,
lora_attn_modules=["q_proj"],
rank=1,
type="LoRA",
)
job_uuid = f"test-job{uuid.uuid4()}"
# Example Model
training_model = "ibm-granite/granite-3.3-8b-instruct"
start_time = time.time()
response = client.post_training.supervised_fine_tune(
job_uuid=job_uuid,
logger_config={},
model=training_model,
hyperparam_search_config={},
training_config=training_config,
algorithm_config=algorithm_config,
checkpoint_dir="output",
)
print("Job: ", job_uuid)
# Wait for the job to complete!
while True:
status = client.post_training.job.status(job_uuid=job_uuid)
if not status:
print("Job not found")
break
print(status)
if status.status == "completed":
break
print("Waiting for job to complete...")
time.sleep(5)
end_time = time.time()
print("Job completed in", end_time - start_time, "seconds!")
print("Artifacts:")
print(client.post_training.job.artifacts(job_uuid=job_uuid))
```
## TorchTune
[TorchTune](https://github.com/pytorch/torchtune) is an inline post-training provider for Llama Stack. It provides a simple and efficient way to fine-tune language models using PyTorch.
### Features
- Simple access through the post_training API
- Fully integrated with Llama Stack
- GPU support and single device capabilities
- Support for LoRA
### Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `torch_seed` | `int \| None` | No | | |
| `checkpoint_format` | `Literal['meta', 'huggingface']` | No | meta | |
### Sample Configuration
```yaml
checkpoint_format: meta
```
### Setup
You can access the TorchTune trainer by writing your own YAML pointing to the provider:
```yaml
post_training:
- provider_id: torchtune
provider_type: inline::torchtune
config: {}
```
You can then build and run your own stack with this provider.
### Usage Example
```python
import time
import uuid
from llama_stack_client.types import (
post_training_supervised_fine_tune_params,
algorithm_config_param,
)
def create_http_client():
from llama_stack_client import LlamaStackClient
return LlamaStackClient(base_url="http://localhost:8321")
client = create_http_client()
# Example Dataset
client.datasets.register(
purpose="post-training/messages",
source={
"type": "uri",
"uri": "huggingface://datasets/llamastack/simpleqa?split=train",
},
dataset_id="simpleqa",
)
training_config = post_training_supervised_fine_tune_params.TrainingConfig(
data_config=post_training_supervised_fine_tune_params.TrainingConfigDataConfig(
batch_size=32,
data_format="instruct",
dataset_id="simpleqa",
shuffle=True,
),
gradient_accumulation_steps=1,
max_steps_per_epoch=0,
max_validation_steps=1,
n_epochs=4,
)
algorithm_config = algorithm_config_param.LoraFinetuningConfig(
alpha=1,
apply_lora_to_mlp=True,
apply_lora_to_output=False,
lora_attn_modules=["q_proj"],
rank=1,
type="LoRA",
)
job_uuid = f"test-job{uuid.uuid4()}"
# Example Model
training_model = "meta-llama/Llama-2-7b-hf"
start_time = time.time()
response = client.post_training.supervised_fine_tune(
job_uuid=job_uuid,
logger_config={},
model=training_model,
hyperparam_search_config={},
training_config=training_config,
algorithm_config=algorithm_config,
checkpoint_dir="output",
)
print("Job: ", job_uuid)
# Wait for the job to complete!
while True:
status = client.post_training.job.status(job_uuid=job_uuid)
if not status:
print("Job not found")
break
print(status)
if status.status == "completed":
break
print("Waiting for job to complete...")
time.sleep(5)
end_time = time.time()
print("Job completed in", end_time - start_time, "seconds!")
print("Artifacts:")
print(client.post_training.job.artifacts(job_uuid=job_uuid))
```
## NVIDIA
NVIDIA's post-training provider for fine-tuning models on NVIDIA's platform.
### Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `api_key` | `str \| None` | No | | The NVIDIA API key. |
| `dataset_namespace` | `str \| None` | No | default | The NVIDIA dataset namespace. |
| `project_id` | `str \| None` | No | test-example-model@v1 | The NVIDIA project ID. |
| `customizer_url` | `str \| None` | No | | Base URL for the NeMo Customizer API |
| `timeout` | `int` | No | 300 | Timeout for the NVIDIA Post Training API |
| `max_retries` | `int` | No | 3 | Maximum number of retries for the NVIDIA Post Training API |
| `output_model_dir` | `str` | No | test-example-model@v1 | Directory to save the output model |
### Sample Configuration
```yaml
api_key: ${env.NVIDIA_API_KEY:=}
dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:=default}
project_id: ${env.NVIDIA_PROJECT_ID:=test-project}
customizer_url: ${env.NVIDIA_CUSTOMIZER_URL:=http://nemo.test}
```
## Best Practices
- **Choose the right provider**: Use HuggingFace for broader compatibility, TorchTune for Meta models, or NVIDIA for their ecosystem
- **Configure hardware appropriately**: Ensure your configuration matches your available hardware (CPU, GPU, MPS)
- **Monitor jobs**: Always monitor job status and handle completion appropriately
- **Use appropriate datasets**: Ensure your dataset format matches the expected input format for your chosen provider
## Next Steps
- Check out the [Building Applications - Fine-tuning](../building_applications/index.mdx) guide for application-level examples
- See the [Providers](../providers/post_training/index.mdx) section for detailed provider documentation
- Review the [API Reference](../advanced_apis/post_training.mdx) for complete API documentation

View file

@ -0,0 +1,193 @@
# Scoring
The Scoring API in Llama Stack allows you to evaluate outputs of your GenAI system using various scoring functions and metrics. This section covers all available scoring providers and their configuration.
## Overview
Llama Stack provides multiple scoring providers:
- **Basic** (`inline::basic`) - Simple evaluation metrics and scoring functions
- **Braintrust** (`inline::braintrust`) - Advanced evaluation using the Braintrust platform
- **LLM-as-Judge** (`inline::llm-as-judge`) - Uses language models to evaluate responses
The Scoring API is associated with `ScoringFunction` resources and provides a suite of out-of-the-box scoring functions. You can also add custom evaluators to meet specific evaluation needs.
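Before picking a provider, it can help to see which scoring functions are already registered in your deployment. A minimal sketch, assuming the standard resource-list endpoint exposed by the Python client:
```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# List every ScoringFunction currently registered with the stack.
# Identifiers such as "basic::subset_of" vary by deployment and configured providers.
for fn in client.scoring_functions.list():
    print(fn.identifier, "-", getattr(fn, "description", ""))
```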
## Basic Scoring
Basic scoring provider for simple evaluation metrics and scoring functions. This provider offers fundamental scoring capabilities without external dependencies.
### Configuration
No configuration required - this provider works out of the box.
```yaml
{}
```
### Features
- Simple evaluation metrics (accuracy, precision, recall, F1-score)
- String matching and similarity metrics
- Basic statistical scoring functions
- No external dependencies required
- Fast execution for standard metrics
### Use Cases
- Quick evaluation of basic accuracy metrics
- String similarity comparisons
- Statistical analysis of model outputs
- Development and testing scenarios
## Braintrust
Braintrust scoring provider for evaluation and scoring using the [Braintrust platform](https://braintrustdata.com/). Braintrust provides advanced evaluation capabilities and experiment tracking.
### Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `openai_api_key` | `str \| None` | No | | The OpenAI API Key for LLM-powered evaluations |
### Sample Configuration
```yaml
openai_api_key: ${env.OPENAI_API_KEY:=}
```
### Features
- Advanced evaluation metrics
- Experiment tracking and comparison
- LLM-powered evaluation functions
- Integration with Braintrust's evaluation suite
- Detailed scoring analytics and insights
### Use Cases
- Production evaluation pipelines
- A/B testing of model versions
- Advanced scoring with custom metrics
- Detailed evaluation reporting and analysis
## LLM-as-Judge
LLM-as-judge scoring provider that uses language models to evaluate and score responses. This approach leverages the reasoning capabilities of large language models to assess quality, relevance, and other subjective metrics.
### Configuration
No configuration required - this provider works out of the box.
```yaml
{}
```
### Features
- Subjective quality evaluation using LLMs
- Flexible evaluation criteria definition
- Natural language evaluation explanations
- Support for complex evaluation scenarios
- Contextual understanding of responses
### Use Cases
- Evaluating response quality and relevance
- Assessing creativity and coherence
- Subjective metric evaluation
- Human-like judgment for complex tasks
## Usage Examples
### Basic Scoring Example
```python
from llama_stack_client import LlamaStackClient
client = LlamaStackClient(base_url="http://localhost:8321")
# Register a basic accuracy scoring function
client.scoring_functions.register(
scoring_function_id="basic_accuracy",
provider_id="basic",
provider_scoring_function_id="accuracy"
)
# Use the scoring function
result = client.scoring.score(
input_rows=[
{"expected": "Paris", "actual": "Paris"},
{"expected": "London", "actual": "Paris"}
],
scoring_function_id="basic_accuracy"
)
print(f"Accuracy: {result.results[0].score}")
```
### LLM-as-Judge Example
```python
# Register an LLM-as-judge scoring function
client.scoring_functions.register(
scoring_function_id="quality_judge",
provider_id="llm_judge",
provider_scoring_function_id="response_quality",
params={
"criteria": "Evaluate response quality, relevance, and helpfulness",
"scale": "1-10"
}
)
# Score responses using LLM judgment
result = client.scoring.score(
input_rows=[{
"query": "What is machine learning?",
"response": "Machine learning is a subset of AI that enables computers to learn patterns from data..."
}],
scoring_function_id="quality_judge"
)
```
### Braintrust Integration Example
```python
# Register a Braintrust scoring function
client.scoring_functions.register(
scoring_function_id="braintrust_eval",
provider_id="braintrust",
provider_scoring_function_id="semantic_similarity"
)
# Run evaluation with Braintrust
result = client.scoring.score(
input_rows=[{
"reference": "The capital of France is Paris",
"candidate": "Paris is the capital city of France"
}],
scoring_function_id="braintrust_eval"
)
```
## Best Practices
- **Choose appropriate providers**: Use Basic for simple metrics, Braintrust for advanced analytics, LLM-as-Judge for subjective evaluation
- **Define clear criteria**: When using LLM-as-Judge, provide specific evaluation criteria and scales
- **Validate scoring functions**: Test your scoring functions with known examples before production use
- **Monitor performance**: Track scoring performance and adjust thresholds based on results
- **Combine multiple metrics**: Use different scoring providers together for comprehensive evaluation
## Integration with Evaluation
The Scoring API works closely with the [Evaluation](./evaluation.mdx) API to provide comprehensive evaluation workflows (a combined sketch follows this list):
1. **Datasets** are loaded via the DatasetIO API
2. **Evaluation** generates model outputs using the Eval API
3. **Scoring** evaluates the quality of outputs using various scoring functions
4. **Results** are aggregated and reported for analysis
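A minimal sketch of that workflow, reusing the client calls from the usage examples above; the dataset URI, row fields, and scoring function id are placeholders and assume the corresponding resources have been registered.
```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# 1. Load a dataset of pre-annotated rows via DatasetIO (placeholder URI).
client.datasets.register(
    purpose="evaluation",
    source={"type": "uri", "uri": "huggingface://datasets/llamastack/evaluation_dataset"},
    dataset_id="eval_rows",
)

# 2. Generate candidate outputs with your application (or the Eval API) and
#    collect them into rows like the ones below.
rows = [
    {"expected": "Paris", "actual": "Paris"},
    {"expected": "London", "actual": "Paris"},
]

# 3. Score the rows with a registered scoring function
#    (see the Basic Scoring Example above for registration).
result = client.scoring.score(
    input_rows=rows,
    scoring_function_id="basic_accuracy",
)

# 4. Aggregate and report.
for row_result in result.results:
    print(row_result.score)
```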
## Next Steps
- Check out the [Evaluation](./evaluation.mdx) guide for running complete evaluations
- See the [Building Applications - Evaluation](../building_applications/evals.mdx) guide for application examples
- Review the [Evaluation Reference](../references/evals_reference/) for comprehensive scoring function usage
- Explore the [Evaluation Concepts](../concepts/evaluation_concepts) for detailed conceptual information

49
docs/docs/api-overview.md Normal file
View file

@ -0,0 +1,49 @@
# API Reference Overview
The Llama Stack provides a comprehensive set of APIs organized by stability level to help you choose the right endpoints for your use case.
## 🟢 Stable APIs
**Production-ready APIs with backward compatibility guarantees.**
These APIs are fully tested, documented, and stable. They follow semantic versioning principles and maintain backward compatibility within major versions. Recommended for production applications.
[**Browse Stable APIs →**](./api/llama-stack-specification)
**Key Features:**
- ✅ Backward compatibility guaranteed
- ✅ Comprehensive testing and validation
- ✅ Production-ready reliability
- ✅ Long-term support
---
## 🟡 Experimental APIs
**Preview APIs that may change before becoming stable.**
These APIs include v1alpha and v1beta endpoints that are feature-complete but may undergo changes based on feedback. Great for exploring new capabilities and providing feedback.
[**Browse Experimental APIs →**](./api-experimental/llama-stack-specification-experimental-apis)
**Key Features:**
- 🧪 Latest features and capabilities
- 🧪 May change based on user feedback
- 🧪 Active development and iteration
- 🧪 Opportunity to influence final design
---
## 🔴 Deprecated APIs
**Legacy APIs for migration reference.**
These APIs are deprecated and will be removed in future versions. They are provided for migration purposes and to help transition to newer, stable alternatives.
[**Browse Deprecated APIs →**](./api-deprecated/llama-stack-specification-deprecated-apis)
**Key Features:**
- ⚠️ Will be removed in future versions
- ⚠️ Migration guidance provided
- ⚠️ Use for compatibility during transition
- ⚠️ Not recommended for new projects

View file

@ -1,9 +1,18 @@
---
title: Agents
description: Build powerful AI applications with the Llama Stack agent framework
sidebar_label: Agents
sidebar_position: 3
---
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Agents
An Agent in Llama Stack is a powerful abstraction that allows you to build complex AI applications.
The Llama Stack agent framework is built on a modular architecture that allows for flexible and powerful AI applications. This document explains the key components and how they work together.
## Core Concepts
@ -19,7 +28,6 @@ Agents are configured using the `AgentConfig` class, which includes:
```python
from llama_stack_client import Agent
# Create the agent
agent = Agent(
    llama_stack_client,
@ -46,6 +54,9 @@ Each interaction with an agent is called a "turn" and consists of:
- **Steps**: The agent's internal processing (inference, tool execution, etc.)
- **Output Message**: The agent's response
<Tabs>
<TabItem value="streaming" label="Streaming Response">
```python
from llama_stack_client import AgentEventLogger
@ -57,9 +68,9 @@ turn_response = agent.create_turn(
for log in AgentEventLogger().log(turn_response):
    log.print()
```
</TabItem>
<TabItem value="non-streaming" label="Non-Streaming Response">
```python
from rich.pretty import pprint
@ -78,6 +89,9 @@ print("Steps:")
pprint(response.steps)
```
</TabItem>
</Tabs>
### 4. Steps
Each turn consists of multiple steps that represent the agent's thought process:
@ -88,5 +102,11 @@ Each turn consists of multiple steps that represent the agent's thought process:
## Agent Execution Loop
Refer to the [Agent Execution Loop](./agent_execution_loop) for more details on what happens within an agent turn.
## Related Resources
- **[Agent Execution Loop](./agent_execution_loop)** - Understanding the internal processing flow
- **[RAG (Retrieval Augmented Generation)](./rag)** - Building knowledge-enhanced agents
- **[Tools Integration](./tools)** - Extending agent capabilities with external tools
- **[Safety Guardrails](./safety)** - Implementing responsible AI practices

View file

@ -1,10 +1,18 @@
---
title: Agent Execution Loop
description: Understanding the internal processing flow of Llama Stack agents
sidebar_label: Agent Execution Loop
sidebar_position: 4
---
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Agent Execution Loop
Agents are the heart of Llama Stack applications. They combine inference, memory, safety, and tool usage into coherent workflows. At its core, an agent follows a sophisticated execution loop that enables multi-step reasoning, tool usage, and safety checks.
## Steps in the Agent Workflow
Each agent turn follows these key steps:
@ -17,7 +25,7 @@ Each agent turn follows these key steps:
3. **Inference Loop**: The agent enters its main execution loop:
   - The LLM receives a user prompt (with previous tool outputs)
   - The LLM generates a response, potentially with [tool calls](./tools)
   - If tool calls are present:
     - Tool inputs are safety-checked
     - Tools are executed (e.g., web search, code execution)
@ -29,7 +37,9 @@ Each agent turn follows these key steps:
4. **Final Safety Check**: The agent's final response is screened through safety shields
## Execution Flow Diagram
```mermaid
sequenceDiagram
    participant U as User
    participant E as Executor
@ -70,12 +80,15 @@ sequenceDiagram
Each step in this process can be monitored and controlled through configurations.
## Agent Execution Example
Here's an example that demonstrates monitoring the agent's execution:
<Tabs>
<TabItem value="streaming" label="Streaming Execution">
```python
from llama_stack_client import LlamaStackClient, Agent, AgentEventLogger
# Replace host and port
client = LlamaStackClient(base_url=f"http://{HOST}:{PORT}")
@ -120,6 +133,13 @@ response = agent.create_turn(
# Monitor each step of execution
for log in AgentEventLogger().log(response):
    log.print()
```
</TabItem>
<TabItem value="non-streaming" label="Non-Streaming Execution">
```python
from rich.pretty import pprint
# Using non-streaming API, the response contains input, steps, and output.
response = agent.create_turn(
@ -131,9 +151,35 @@ response = agent.create_turn(
        }
    ],
    session_id=session_id,
    stream=False,
)
pprint(f"Input: {response.input_messages}")
pprint(f"Output: {response.output_message.content}")
pprint(f"Steps: {response.steps}")
```
</TabItem>
</Tabs>
## Key Configuration Options
### Loop Control
- **max_infer_iters**: Maximum number of inference iterations (default: 5)
- **max_tokens**: Token limit for responses
- **temperature**: Controls response randomness
### Safety Configuration
- **input_shields**: Safety checks for user input
- **output_shields**: Safety checks for agent responses
### Tool Integration
- **tools**: List of available tools for the agent
- **tool_choice**: Control over when tools are used
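Putting these options together, here is a hedged configuration sketch. The keyword names are assumptions based on `AgentConfig` and the examples above; they may differ slightly between client versions.
```python
from llama_stack_client import Agent, LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Hypothetical configuration combining loop control, safety, and tool settings.
agent = Agent(
    client,
    model="meta-llama/Llama-3.3-70B-Instruct",
    instructions="You are a helpful assistant.",
    tools=["builtin::websearch"],      # tools available to the agent
    input_shields=["llama_guard"],     # safety checks on user input (assumed shield id)
    output_shields=["llama_guard"],    # safety checks on agent responses
    max_infer_iters=5,                 # cap on inference iterations per turn
    sampling_params={                  # response length and randomness
        "max_tokens": 512,
        "strategy": {"type": "top_p", "temperature": 0.7, "top_p": 0.9},
    },
    # tool_choice is typically set through tool_config, if your client version supports it.
)
```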
## Related Resources
- **[Agents](./agent)** - Understanding agent fundamentals
- **[Tools Integration](./tools)** - Adding capabilities to agents
- **[Safety Guardrails](./safety)** - Implementing safety measures
- **[RAG (Retrieval Augmented Generation)](./rag)** - Building knowledge-enhanced workflows

View file

@ -0,0 +1,256 @@
---
title: Evaluations
description: Evaluate LLM applications with Llama Stack's comprehensive evaluation framework
sidebar_label: Evaluations
sidebar_position: 7
---
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
This guide walks you through the process of evaluating an LLM application built using Llama Stack. For detailed API reference, check out the [Evaluation Reference](../references/evals_reference/) guide that covers the complete set of APIs and developer experience flow.
:::tip[Interactive Examples]
Check out our [Colab notebook](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing) for working examples with evaluations, or try the [Getting Started notebook](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb).
:::
## Application Evaluation Example
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)
Llama Stack offers a library of scoring functions and the `/scoring` API, allowing you to run evaluations on your pre-annotated AI application datasets.
In this example, we will show you how to:
1. **Build an Agent** with Llama Stack
2. **Query the agent's sessions, turns, and steps** to analyze execution
3. **Evaluate the results** using scoring functions
## Step-by-Step Evaluation Process
### 1. Building a Search Agent
First, let's create an agent that can search the web to answer questions:
```python
from llama_stack_client import LlamaStackClient, Agent, AgentEventLogger
client = LlamaStackClient(base_url=f"http://{HOST}:{PORT}")
agent = Agent(
client,
model="meta-llama/Llama-3.3-70B-Instruct",
instructions="You are a helpful assistant. Use search tool to answer the questions.",
tools=["builtin::websearch"],
)
# Test prompts for evaluation
user_prompts = [
"Which teams played in the NBA Western Conference Finals of 2024. Search the web for the answer.",
"In which episode and season of South Park does Bill Cosby (BSM-471) first appear? Give me the number and title. Search the web for the answer.",
"What is the British-American kickboxer Andrew Tate's kickboxing name? Search the web for the answer.",
]
session_id = agent.create_session("test-session")
# Execute all prompts in the session
for prompt in user_prompts:
response = agent.create_turn(
messages=[
{
"role": "user",
"content": prompt,
}
],
session_id=session_id,
)
for log in AgentEventLogger().log(response):
log.print()
```
### 2. Query Agent Execution Steps
Now, let's analyze the agent's execution steps to understand its performance:
<Tabs>
<TabItem value="session-analysis" label="Session Analysis">
```python
from rich.pretty import pprint
# Query the agent's session to get detailed execution data
session_response = client.agents.session.retrieve(
session_id=session_id,
agent_id=agent.agent_id,
)
pprint(session_response)
```
</TabItem>
<TabItem value="tool-validation" label="Tool Usage Validation">
```python
# Sanity check: Verify that all user prompts are followed by tool calls
num_tool_call = 0
for turn in session_response.turns:
for step in turn.steps:
if (
step.step_type == "tool_execution"
and step.tool_calls[0].tool_name == "brave_search"
):
num_tool_call += 1
print(
f"{num_tool_call}/{len(session_response.turns)} user prompts are followed by a tool call to `brave_search`"
)
```
</TabItem>
</Tabs>
### 3. Evaluate Agent Responses
Now we'll evaluate the agent's responses using Llama Stack's scoring API:
<Tabs>
<TabItem value="data-preparation" label="Data Preparation">
```python
# Process agent execution history into evaluation rows
eval_rows = []
# Define expected answers for our test prompts
expected_answers = [
"Dallas Mavericks and the Minnesota Timberwolves",
"Season 4, Episode 12",
"King Cobra",
]
# Create evaluation dataset from agent responses
for i, turn in enumerate(session_response.turns):
eval_rows.append(
{
"input_query": turn.input_messages[0].content,
"generated_answer": turn.output_message.content,
"expected_answer": expected_answers[i],
}
)
pprint(eval_rows)
```
</TabItem>
<TabItem value="scoring" label="Scoring & Evaluation">
```python
# Configure scoring parameters
scoring_params = {
"basic::subset_of": None, # Check if generated answer contains expected answer
}
# Run evaluation using Llama Stack's scoring API
scoring_response = client.scoring.score(
input_rows=eval_rows,
scoring_functions=scoring_params
)
pprint(scoring_response)
# Analyze results
for i, result in enumerate(scoring_response.results):
print(f"Query {i+1}: {result.score}")
print(f" Generated: {eval_rows[i]['generated_answer'][:100]}...")
print(f" Expected: {expected_answers[i]}")
print(f" Score: {result.score}")
print()
```
</TabItem>
</Tabs>
## Available Scoring Functions
Llama Stack provides several built-in scoring functions:
### Basic Scoring Functions
- **`basic::subset_of`**: Checks if the expected answer is contained in the generated response
- **`basic::exact_match`**: Performs exact string matching between expected and generated answers
- **`basic::regex_match`**: Uses regular expressions to match patterns in responses
### Advanced Scoring Functions
- **`llm_as_judge::accuracy`**: Uses an LLM to judge response accuracy
- **`llm_as_judge::helpfulness`**: Evaluates how helpful the response is
- **`llm_as_judge::safety`**: Assesses response safety and appropriateness
### Custom Scoring Functions
You can also create custom scoring functions for domain-specific evaluation needs.
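As a hedged sketch of what a custom registration can look like (mirroring the `scoring_functions.register` call from the Scoring guide and continuing the `client` and `eval_rows` from the example above; the identifiers and params are placeholders):
```python
# Register a custom LLM-as-judge style scoring function, then use it like a built-in one.
client.scoring_functions.register(
    scoring_function_id="my_app::answer_quality",
    provider_id="llm-as-judge",
    provider_scoring_function_id="response_quality",
    params={
        "criteria": "Score 1-10 for factual accuracy and helpfulness",
        "scale": "1-10",
    },
)

custom_results = client.scoring.score(
    input_rows=eval_rows,
    scoring_functions={"my_app::answer_quality": None},
)
```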
## Evaluation Workflow Best Practices
### 🎯 **Dataset Preparation**
- Use diverse test cases that cover edge cases and common scenarios
- Include clear expected answers or success criteria
- Balance your dataset across different difficulty levels
### 📊 **Metrics Selection**
- Choose appropriate scoring functions for your use case
- Combine multiple metrics for comprehensive evaluation
- Consider both automated and human evaluation metrics
### 🔄 **Iterative Improvement**
- Run evaluations regularly during development
- Use evaluation results to identify areas for improvement
- Track performance changes over time
### 📈 **Analysis & Reporting**
- Analyze failures to understand model limitations
- Generate comprehensive evaluation reports
- Share results with stakeholders for informed decision-making
## Advanced Evaluation Scenarios
### Batch Evaluation
For evaluating large datasets efficiently:
```python
# Prepare large evaluation dataset
large_eval_dataset = [
{"input_query": query, "expected_answer": answer}
for query, answer in zip(queries, expected_answers)
]
# Run batch evaluation
batch_results = client.scoring.score(
input_rows=large_eval_dataset,
scoring_functions={
"basic::subset_of": None,
"llm_as_judge::accuracy": {"judge_model": "meta-llama/Llama-3.3-70B-Instruct"},
}
)
```
### Multi-Metric Evaluation
Combining different scoring approaches:
```python
comprehensive_scoring = {
"exact_match": "basic::exact_match",
"subset_match": "basic::subset_of",
"llm_judge": "llm_as_judge::accuracy",
"safety_check": "llm_as_judge::safety",
}
results = client.scoring.score(
input_rows=eval_rows,
scoring_functions=comprehensive_scoring
)
```
## Related Resources
- **[Agents](./agent)** - Building agents for evaluation
- **[Tools Integration](./tools)** - Using tools in evaluated agents
- **[Evaluation Reference](../references/evals_reference/)** - Complete API reference for evaluations
- **[Getting Started Notebook](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)** - Interactive examples
- **[Evaluation Examples](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing)** - Additional evaluation scenarios

View file

@ -0,0 +1,83 @@
---
title: Building Applications
description: Comprehensive guides for building AI applications with Llama Stack
sidebar_label: Overview
sidebar_position: 5
---
# AI Application Examples
Llama Stack provides all the building blocks needed to create sophisticated AI applications.
## Getting Started
The best way to get started is to look at this comprehensive notebook which walks through the various APIs (from basic inference, to RAG agents) and how to use them.
**📓 [Building AI Applications Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)**
## Core Topics
Here are the key topics that will help you build effective AI applications:
### 🤖 **Agent Development**
- **[Agent Framework](./agent.mdx)** - Understand the components and design patterns of the Llama Stack agent framework
- **[Agent Execution Loop](./agent_execution_loop.mdx)** - How agents process information, make decisions, and execute actions
- **[Agents vs Responses API](./responses_vs_agents.mdx)** - Learn when to use each API for different use cases
### 📚 **Knowledge Integration**
- **[RAG (Retrieval-Augmented Generation)](./rag.mdx)** - Enhance your agents with external knowledge through retrieval mechanisms
### 🛠️ **Capabilities & Extensions**
- **[Tools](./tools.mdx)** - Extend your agents' capabilities by integrating with external tools and APIs
### 📊 **Quality & Monitoring**
- **[Evaluations](./evals.mdx)** - Evaluate your agents' effectiveness and identify areas for improvement
- **[Telemetry](./telemetry.mdx)** - Monitor and analyze your agents' performance and behavior
- **[Safety](./safety.mdx)** - Implement guardrails and safety measures to ensure responsible AI behavior
### 🎮 **Interactive Development**
- **[Playground](./playground.mdx)** - Interactive environment for testing and developing applications
## Application Patterns
### 🤖 **Conversational Agents**
Build intelligent chatbots and assistants that can:
- Maintain context across conversations
- Access external knowledge bases
- Execute actions through tool integrations
- Apply safety filters and guardrails
### 📖 **RAG Applications**
Create knowledge-augmented applications that:
- Retrieve relevant information from documents
- Generate contextually accurate responses
- Handle large knowledge bases efficiently
- Provide source attribution
### 🔧 **Tool-Enhanced Systems**
Develop applications that can:
- Search the web for real-time information
- Interact with databases and APIs
- Perform calculations and analysis
- Execute complex multi-step workflows
### 🛡️ **Enterprise Applications**
Build production-ready systems with:
- Comprehensive safety measures
- Performance monitoring and analytics
- Scalable deployment configurations
- Evaluation and quality assurance
## Next Steps
1. **📖 Start with the Notebook** - Work through the complete tutorial
2. **🎯 Choose Your Pattern** - Pick the application type that matches your needs
3. **🏗️ Build Your Foundation** - Set up your [providers](/docs/providers/) and [distributions](/docs/distributions/)
4. **🚀 Deploy & Monitor** - Use our [deployment guides](/docs/deploying/) for production
## Related Resources
- **[Getting Started](/docs/getting_started/quickstart)** - Basic setup and concepts
- **[Providers](/docs/providers/)** - Available AI service providers
- **[Distributions](/docs/distributions/)** - Pre-configured deployment packages
- **[API Reference](/docs/api/llama-stack-specification)** - Complete API documentation

View file

@ -0,0 +1,298 @@
---
title: Llama Stack Playground
description: Interactive interface to explore and experiment with Llama Stack capabilities
sidebar_label: Playground
sidebar_position: 10
---
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Llama Stack Playground
:::note[Experimental Feature]
The Llama Stack Playground is currently experimental and subject to change. We welcome feedback and contributions to help improve it.
:::
The Llama Stack Playground is a simple interface that aims to:
- **Showcase capabilities and concepts** of Llama Stack in an interactive environment
- **Demo end-to-end application code** to help users get started building their own applications
- **Provide a UI** to help users inspect and understand Llama Stack API providers and resources
## Key Features
### Interactive Playground Pages
The playground provides interactive pages for users to explore Llama Stack API capabilities:
#### Chatbot Interface
<video
controls
autoPlay
playsInline
muted
loop
style={{width: '100%'}}
>
<source src="https://github.com/user-attachments/assets/8d2ef802-5812-4a28-96e1-316038c84cbf" type="video/mp4" />
Your browser does not support the video tag.
</video>
<Tabs>
<TabItem value="chat" label="Chat">
**Simple Chat Interface**
- Chat directly with Llama models through an intuitive interface
- Uses the `/chat/completions` streaming API under the hood
- Real-time message streaming for responsive interactions
- Perfect for testing model capabilities and prompt engineering
</TabItem>
<TabItem value="rag" label="RAG Chat">
**Document-Aware Conversations**
- Upload documents to create memory banks
- Chat with a RAG-enabled agent that can query your documents
- Uses Llama Stack's `/agents` API to create and manage RAG sessions
- Ideal for exploring knowledge-enhanced AI applications
</TabItem>
</Tabs>
#### Evaluation Interface
<video
controls
autoPlay
playsInline
muted
loop
style={{width: '100%'}}
>
<source src="https://github.com/user-attachments/assets/6cc1659f-eba4-49ca-a0a5-7c243557b4f5" type="video/mp4" />
Your browser does not support the video tag.
</video>
<Tabs>
<TabItem value="scoring" label="Scoring Evaluations">
**Custom Dataset Evaluation**
- Upload your own evaluation datasets
- Run evaluations using available scoring functions
- Uses Llama Stack's `/scoring` API for flexible evaluation workflows
- Great for testing application performance on custom metrics
</TabItem>
<TabItem value="benchmarks" label="Benchmark Evaluations">
<video
controls
autoPlay
playsInline
muted
loop
style={{width: '100%', marginBottom: '1rem'}}
>
<source src="https://github.com/user-attachments/assets/345845c7-2a2b-4095-960a-9ae40f6a93cf" type="video/mp4" />
Your browser does not support the video tag.
</video>
**Pre-registered Evaluation Tasks**
- Evaluate models or agents on pre-defined tasks
- Uses Llama Stack's `/eval` API for comprehensive evaluation
- Combines datasets and scoring functions for standardized testing
**Setup Requirements:**
Register evaluation datasets and benchmarks first:
```bash
# Register evaluation dataset
llama-stack-client datasets register \
--dataset-id "mmlu" \
--provider-id "huggingface" \
--url "https://huggingface.co/datasets/llamastack/evals" \
--metadata '{"path": "llamastack/evals", "name": "evals__mmlu__details", "split": "train"}' \
--schema '{"input_query": {"type": "string"}, "expected_answer": {"type": "string"}, "chat_completion_input": {"type": "string"}}'
# Register benchmark task
llama-stack-client benchmarks register \
--eval-task-id meta-reference-mmlu \
--provider-id meta-reference \
--dataset-id mmlu \
--scoring-functions basic::regex_parser_multiple_choice_answer
```
</TabItem>
</Tabs>
#### Inspection Interface
<video
controls
autoPlay
playsInline
muted
loop
style={{width: '100%'}}
>
<source src="https://github.com/user-attachments/assets/01d52b2d-92af-4e3a-b623-a9b8ba22ba99" type="video/mp4" />
Your browser does not support the video tag.
</video>
<Tabs>
<TabItem value="providers" label="API Providers">
**Provider Management**
- Inspect available Llama Stack API providers
- View provider configurations and capabilities
- Uses the `/providers` API for real-time provider information
- Essential for understanding your deployment's capabilities
</TabItem>
<TabItem value="resources" label="API Resources">
**Resource Exploration**
- Inspect Llama Stack API resources including:
- **Models**: Available language models
- **Datasets**: Registered evaluation datasets
- **Memory Banks**: Vector databases and knowledge stores
- **Benchmarks**: Evaluation tasks and scoring functions
- **Shields**: Safety and content moderation tools
- Uses `/<resources>/list` APIs for comprehensive resource visibility
- For detailed information about resources, see [Core Concepts](/docs/concepts)
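The same information is available from the Python client, which is handy when scripting checks against a deployment (server address assumed to be the local default):
```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Each list call mirrors what the inspection UI displays
print(client.providers.list())   # configured API providers
print(client.models.list())      # available models
print(client.datasets.list())    # registered datasets
print(client.benchmarks.list())  # registered benchmarks
print(client.shields.list())     # registered safety shields
```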
</TabItem>
</Tabs>
## Getting Started
### Quick Start Guide
<Tabs>
<TabItem value="setup" label="Setup">
**1. Start the Llama Stack API Server**
```bash
llama stack list-deps together | xargs -L1 uv pip install
llama stack run together
```
**2. Start the Streamlit UI**
```bash
# Launch the playground interface
uv run --with ".[ui]" streamlit run llama_stack/core/ui/app.py
```
</TabItem>
<TabItem value="usage" label="Usage Tips">
**Making the Most of the Playground:**
- **Start with Chat**: Test basic model interactions and prompt engineering
- **Explore RAG**: Upload sample documents to see knowledge-enhanced responses
- **Try Evaluations**: Use the scoring interface to understand evaluation metrics
- **Inspect Resources**: Check what providers and resources are available
- **Experiment with Settings**: Adjust parameters to see how they affect results
</TabItem>
</Tabs>
### Available Distributions
The playground works with any Llama Stack distribution. Popular options include:
<Tabs>
<TabItem value="together" label="Together AI">
```bash
llama stack list-deps together | xargs -L1 uv pip install
llama stack run together
```
**Features:**
- Cloud-hosted models
- Fast inference
- Multiple model options
</TabItem>
<TabItem value="ollama" label="Ollama (Local)">
```bash
llama stack list-deps ollama | xargs -L1 uv pip install
llama stack run ollama
```
**Features:**
- Local model execution
- Privacy-focused
- No internet required
</TabItem>
<TabItem value="meta-reference" label="Meta Reference">
```bash
llama stack list-deps meta-reference | xargs -L1 uv pip install
llama stack run meta-reference
```
**Features:**
- Reference implementation
- All API features available
- Best for development
</TabItem>
</Tabs>
## Use Cases & Examples
### Educational Use Cases
- **Learning Llama Stack**: Hands-on exploration of API capabilities
- **Prompt Engineering**: Interactive testing of different prompting strategies
- **RAG Experimentation**: Understanding how document retrieval affects responses
- **Evaluation Understanding**: See how different metrics evaluate model performance
### Development Use Cases
- **Prototype Testing**: Quick validation of application concepts
- **API Exploration**: Understanding available endpoints and parameters
- **Integration Planning**: Seeing how different components work together
- **Demo Creation**: Showcasing Llama Stack capabilities to stakeholders
### Research Use Cases
- **Model Comparison**: Side-by-side testing of different models
- **Evaluation Design**: Understanding how scoring functions work
- **Safety Testing**: Exploring shield effectiveness with different inputs
- **Performance Analysis**: Measuring model behavior across different scenarios
## Best Practices
### 🚀 **Getting Started**
- Begin with simple chat interactions to understand basic functionality
- Gradually explore more advanced features like RAG and evaluations
- Use the inspection tools to understand your deployment's capabilities
### 🔧 **Development Workflow**
- Use the playground to prototype before writing application code
- Test different parameter settings interactively
- Validate evaluation approaches before implementing them programmatically
### 📊 **Evaluation & Testing**
- Start with simple scoring functions before trying complex evaluations
- Use the playground to understand evaluation results before automation
- Test safety features with various input types
### 🎯 **Production Preparation**
- Use playground insights to inform your production API usage
- Test edge cases and error conditions interactively
- Validate resource configurations before deployment
## Related Resources
- **[Getting Started Guide](../getting_started/quickstart)** - Complete setup and introduction
- **[Core Concepts](/docs/concepts)** - Understanding Llama Stack fundamentals
- **[Agents](./agent)** - Building intelligent agents
- **[RAG (Retrieval Augmented Generation)](./rag)** - Knowledge-enhanced applications
- **[Evaluations](./evals)** - Comprehensive evaluation framework
- **[API Reference](/docs/api/llama-stack-specification)** - Complete API documentation

View file

@ -0,0 +1,123 @@
---
title: Retrieval Augmented Generation (RAG)
description: Build knowledge-enhanced AI applications with external document retrieval
sidebar_label: RAG (Retrieval Augmented Generation)
sidebar_position: 2
---
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Retrieval Augmented Generation (RAG)
RAG enables your applications to reference and recall information from external documents. Llama Stack makes agentic RAG available through its OpenAI-compatible Responses API.
## Quick Start
### 1. Start the Server
In one terminal, start the Llama Stack server:
```bash
llama stack list-deps starter | xargs -L1 uv pip install
llama stack run starter
```
### 2. Connect with OpenAI Client
In another terminal, use the standard OpenAI client with the Responses API:
```python
import io, requests
from openai import OpenAI
url = "https://www.paulgraham.com/greatwork.html"
client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")
# Create vector store - auto-detects default embedding model
vs = client.vector_stores.create()
response = requests.get(url)
pseudo_file = io.BytesIO(response.content)
file_id = client.files.create(file=(url, pseudo_file, "text/html"), purpose="assistants").id
client.vector_stores.files.create(vector_store_id=vs.id, file_id=file_id)
resp = client.responses.create(
model="gpt-4o",
input="How do you do great work? Use the existing knowledge_search tool.",
tools=[{"type": "file_search", "vector_store_ids": [vs.id]}],
include=["file_search_call.results"],
)
print(resp.output[-1].content[-1].text)
```
Which should give output like:
```
Doing great work is about more than just hard work and ambition; it involves combining several elements:
1. **Pursue What Excites You**: Engage in projects that are both ambitious and exciting to you. It's important to work on something you have a natural aptitude for and a deep interest in.
2. **Explore and Discover**: Great work often feels like a blend of discovery and creation. Focus on seeing possibilities and let ideas take their natural shape, rather than just executing a plan.
3. **Be Bold Yet Flexible**: Take bold steps in your work without over-planning. An adaptable approach that evolves with new ideas can often lead to breakthroughs.
4. **Work on Your Own Projects**: Develop a habit of working on projects of your own choosing, as these often lead to great achievements. These should be projects you find exciting and that challenge you intellectually.
5. **Be Earnest and Authentic**: Approach your work with earnestness and authenticity. Trying to impress others with affectation can be counterproductive, as genuine effort and intellectual honesty lead to better work outcomes.
6. **Build a Supportive Environment**: Work alongside great colleagues who inspire you and enhance your work. Surrounding yourself with motivating individuals creates a fertile environment for great work.
7. **Maintain High Morale**: High morale significantly impacts your ability to do great work. Stay optimistic and protect your mental well-being to maintain progress and momentum.
8. **Balance**: While hard work is essential, overworking can lead to diminishing returns. Balance periods of intensive work with rest to sustain productivity over time.
This approach shows that great work is less about following a strict formula and more about aligning your interests, ambition, and environment to foster creativity and innovation.
```
## Architecture Overview
Llama Stack provides OpenAI-compatible RAG capabilities through:
- **Vector Stores API**: OpenAI-compatible vector storage with automatic embedding model detection
- **Files API**: Document upload and processing using OpenAI's file format
- **Responses API**: Enhanced chat completions with agentic tool calling via file search
## Configuring Default Embedding Models
To enable automatic vector store creation without specifying embedding models, configure a default embedding model in your run.yaml like so:
```yaml
vector_stores:
default_provider_id: faiss
default_embedding_model:
provider_id: sentence-transformers
model_id: nomic-ai/nomic-embed-text-v1.5
```
With this configuration:
- `client.vector_stores.create()` works without requiring embedding model or provider parameters
- The system automatically uses the default vector store provider (`faiss`) when multiple providers are available
- The system automatically uses the default embedding model (`sentence-transformers/nomic-ai/nomic-embed-text-v1.5`) for any newly created vector store
- The `default_provider_id` specifies which vector storage backend to use
- The `default_embedding_model` specifies both the inference provider and model for embeddings
## Vector Store Operations
### Creating Vector Stores
You can create vector stores with automatic or explicit embedding model selection:
```python
# Automatic - uses default configured embedding model and vector store provider
vs = client.vector_stores.create()
# Explicit - specify embedding model and/or provider when you need specific ones
vs = client.vector_stores.create(
extra_body={
"provider_id": "faiss", # Optional: specify vector store provider
"embedding_model": "sentence-transformers/nomic-ai/nomic-embed-text-v1.5",
"embedding_dimension": 768 # Optional: will be auto-detected if not provided
}
)
```
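A populated store can also be queried directly through the OpenAI-compatible search endpoint, rather than going through the Responses API. A brief sketch, assuming `client` is the OpenAI client from the Quick Start and `vs` is a store that already has files attached (exact result fields may vary by provider):
```python
# Query the vector store directly; returns ranked chunks with relevance scores
results = client.vector_stores.search(
    vector_store_id=vs.id,
    query="How should I choose what to work on?",
    max_num_results=3,
)
for result in results.data:
    print(result.filename, result.score)
```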

View file

@ -1,10 +1,20 @@
---
title: Agents vs OpenAI Responses API
description: Compare the Agents API and OpenAI Responses API for building AI applications with tool calling capabilities
sidebar_label: Agents vs Responses API
sidebar_position: 5
---
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Agents vs OpenAI Responses API # Agents vs OpenAI Responses API
Llama Stack (LLS) provides two different APIs for building AI applications with tool calling capabilities: the **Agents API** and the **OpenAI Responses API**. While both enable AI systems to use tools, and maintain full conversation history, they serve different use cases and have distinct characteristics. Llama Stack (LLS) provides two different APIs for building AI applications with tool calling capabilities: the **Agents API** and the **OpenAI Responses API**. While both enable AI systems to use tools, and maintain full conversation history, they serve different use cases and have distinct characteristics.
```{note} :::note
**Note:** For simple and basic inferencing, you may want to use the [Chat Completions API](../providers/openai.md#chat-completions) directly, before progressing to Agents or Responses API. **Note:** For simple and basic inferencing, you may want to use the [Chat Completions API](../providers/openai#chat-completions) directly, before progressing to Agents or Responses API.
``` :::
## Overview ## Overview
@ -21,6 +31,8 @@ Additionally, Agents let you specify input/output shields whereas Responses do n
Today the Agents and Responses APIs can be used independently depending on the use case. But, it is also productive to treat the APIs as complementary. It is not currently supported, but it is planned for the LLS Agents API to alternatively use the Responses API as its backend instead of the default Chat Completions API, i.e., enabling a combination of the safety features of Agents with the dynamic configuration and branching capabilities of Responses. Today the Agents and Responses APIs can be used independently depending on the use case. But, it is also productive to treat the APIs as complementary. It is not currently supported, but it is planned for the LLS Agents API to alternatively use the Responses API as its backend instead of the default Chat Completions API, i.e., enabling a combination of the safety features of Agents with the dynamic configuration and branching capabilities of Responses.
## Feature Comparison
| Feature | LLS Agents API | OpenAI Responses API | | Feature | LLS Agents API | OpenAI Responses API |
|---------|------------|---------------------| |---------|------------|---------------------|
| **Conversation Management** | Linear persistent sessions | Can branch from any previous response ID | | **Conversation Management** | Linear persistent sessions | Can branch from any previous response ID |
@ -34,7 +46,10 @@ Let's compare how both APIs handle a research task where we need to:
2. Access different information sources dynamically 2. Access different information sources dynamically
3. Continue the conversation based on search results 3. Continue the conversation based on search results
### Agents API: Session-based configuration with safety shields <Tabs>
<TabItem value="agents" label="Agents API">
### Session-based Configuration with Safety Shields
```python ```python
# Create agent with static session configuration # Create agent with static session configuration
@ -85,7 +100,10 @@ print(f"First result: {response1.output_message.content}")
print(f"Optimization: {response2.output_message.content}") print(f"Optimization: {response2.output_message.content}")
``` ```
### Responses API: Dynamic per-call configuration with branching </TabItem>
<TabItem value="responses" label="Responses API">
### Dynamic Per-call Configuration with Branching
```python ```python
# First response: Use web search for latest algorithms # First response: Use web search for latest algorithms
@ -130,50 +148,74 @@ print(f"File search results: {response2.output_message.content}")
print(f"Alternative web search: {response3.output_message.content}") print(f"Alternative web search: {response3.output_message.content}")
``` ```
</TabItem>
</Tabs>
Both APIs demonstrate distinct strengths that make them valuable on their own for different scenarios. The Agents API excels in providing structured, safety-conscious workflows with persistent session management, while the Responses API offers flexibility through dynamic configuration and OpenAI compatible tool patterns. Both APIs demonstrate distinct strengths that make them valuable on their own for different scenarios. The Agents API excels in providing structured, safety-conscious workflows with persistent session management, while the Responses API offers flexibility through dynamic configuration and OpenAI compatible tool patterns.
## Use Case Examples ## Use Case Examples
### 1. **Research and Analysis with Safety Controls** ### 1. Research and Analysis with Safety Controls
**Best Choice: Agents API** **Best Choice: Agents API**
**Scenario:** You're building a research assistant for a financial institution that needs to analyze market data, execute code to process financial models, and search through internal compliance documents. The system must ensure all interactions are logged for regulatory compliance and protected by safety shields to prevent malicious code execution or data leaks. **Scenario:** You're building a research assistant for a financial institution that needs to analyze market data, execute code to process financial models, and search through internal compliance documents. The system must ensure all interactions are logged for regulatory compliance and protected by safety shields to prevent malicious code execution or data leaks.
**Why Agents API?** The Agents API provides persistent session management for iterative research workflows, built-in safety shields to protect against malicious code in financial models, and structured execution logs (session/turn/step) required for regulatory compliance. The static tool configuration ensures consistent access to your knowledge base and code interpreter throughout the entire research session. **Why Agents API?** The Agents API provides persistent session management for iterative research workflows, built-in safety shields to protect against malicious code in financial models, and structured execution logs (session/turn/step) required for regulatory compliance. The static tool configuration ensures consistent access to your knowledge base and code interpreter throughout the entire research session.
### 2. **Dynamic Information Gathering with Branching Exploration** ### 2. Dynamic Information Gathering with Branching Exploration
**Best Choice: Responses API** **Best Choice: Responses API**
**Scenario:** You're building a competitive intelligence tool that helps businesses research market trends. Users need to dynamically switch between web search for current market data and file search through uploaded industry reports. They also want to branch conversations to explore different market segments simultaneously and experiment with different models for various analysis types. **Scenario:** You're building a competitive intelligence tool that helps businesses research market trends. Users need to dynamically switch between web search for current market data and file search through uploaded industry reports. They also want to branch conversations to explore different market segments simultaneously and experiment with different models for various analysis types.
**Why Responses API?** The Responses API's branching capability lets users explore multiple market segments from any research point. Dynamic per-call configuration allows switching between web search and file search as needed, while experimenting with different models (faster models for quick searches, more powerful models for deep analysis). The OpenAI-compatible tool patterns make integration straightforward. **Why Responses API?** The Responses API's branching capability lets users explore multiple market segments from any research point. Dynamic per-call configuration allows switching between web search and file search as needed, while experimenting with different models (faster models for quick searches, more powerful models for deep analysis). The OpenAI-compatible tool patterns make integration straightforward.
### 3. **OpenAI Migration with Advanced Tool Capabilities** ### 3. OpenAI Migration with Advanced Tool Capabilities
**Best Choice: Responses API** **Best Choice: Responses API**
**Scenario:** You have an existing application built with OpenAI's Assistants API that uses file search and web search capabilities. You want to migrate to Llama Stack for better performance and cost control while maintaining the same tool calling patterns and adding new capabilities like dynamic vector store selection. **Scenario:** You have an existing application built with OpenAI's Assistants API that uses file search and web search capabilities. You want to migrate to Llama Stack for better performance and cost control while maintaining the same tool calling patterns and adding new capabilities like dynamic vector store selection.
**Why Responses API?** The Responses API provides full OpenAI tool compatibility (`web_search`, `file_search`) with identical syntax, making migration seamless. The dynamic per-call configuration enables advanced features like switching vector stores per query or changing models based on query complexity - capabilities that extend beyond basic OpenAI functionality while maintaining compatibility. **Why Responses API?** The Responses API provides full OpenAI tool compatibility (`web_search`, `file_search`) with identical syntax, making migration seamless. The dynamic per-call configuration enables advanced features like switching vector stores per query or changing models based on query complexity - capabilities that extend beyond basic OpenAI functionality while maintaining compatibility.
### 4. **Educational Programming Tutor** ### 4. Educational Programming Tutor
**Best Choice: Agents API** **Best Choice: Agents API**
**Scenario:** You're building a programming tutor that maintains student context across multiple sessions, safely executes code exercises, and tracks learning progress with audit trails for educators. **Scenario:** You're building a programming tutor that maintains student context across multiple sessions, safely executes code exercises, and tracks learning progress with audit trails for educators.
**Why Agents API?** Persistent sessions remember student progress across multiple interactions, safety shields prevent malicious code execution while allowing legitimate programming exercises, and structured execution logs help educators track learning patterns. **Why Agents API?** Persistent sessions remember student progress across multiple interactions, safety shields prevent malicious code execution while allowing legitimate programming exercises, and structured execution logs help educators track learning patterns.
### 5. **Advanced Software Debugging Assistant** ### 5. Advanced Software Debugging Assistant
**Best Choice: Agents API with Responses Backend** **Best Choice: Agents API with Responses Backend**
**Scenario:** You're building a debugging assistant that helps developers troubleshoot complex issues. It needs to maintain context throughout a debugging session, safely execute diagnostic code, switch between different analysis tools dynamically, and branch conversations to explore multiple potential causes simultaneously. **Scenario:** You're building a debugging assistant that helps developers troubleshoot complex issues. It needs to maintain context throughout a debugging session, safely execute diagnostic code, switch between different analysis tools dynamically, and branch conversations to explore multiple potential causes simultaneously.
**Why Agents + Responses?** The Agent provides safety shields for code execution and session management for the overall debugging workflow. The underlying Responses API enables dynamic model selection and flexible tool configuration per query, while branching lets you explore different theories (memory leak vs. concurrency issue) from the same debugging point and compare results. **Why Agents + Responses?** The Agent provides safety shields for code execution and session management for the overall debugging workflow. The underlying Responses API enables dynamic model selection and flexible tool configuration per query, while branching lets you explore different theories (memory leak vs. concurrency issue) from the same debugging point and compare results.
> **Note:** The ability to use Responses API as the backend for Agents is not yet implemented but is planned for a future release. Currently, Agents use Chat Completions API as their backend by default. :::info[Future Enhancement]
The ability to use Responses API as the backend for Agents is not yet implemented but is planned for a future release. Currently, Agents use Chat Completions API as their backend by default.
:::
## For More Information ## Decision Framework
- **LLS Agents API**: For detailed information on creating and managing agents, see the [Agents documentation](agent.md) Use this framework to choose the right API for your use case:
- **OpenAI Responses API**: For information on using the OpenAI-compatible responses API, see the [OpenAI API documentation](https://platform.openai.com/docs/api-reference/responses)
- **Chat Completions API**: For the default backend API used by Agents, see the [Chat Completions providers documentation](../providers/openai.md#chat-completions) ### Choose Agents API when:
- **Agent Execution Loop**: For understanding how agents process turns and steps in their execution, see the [Agent Execution Loop documentation](agent_execution_loop.md) - ✅ You need **safety shields** for input/output validation
- ✅ Your application requires **linear conversation flow** with persistent context
- ✅ You need **audit trails** and structured execution logs
- ✅ Your tool configuration is **static** throughout the session
- ✅ You're building **educational, financial, or enterprise** applications with compliance requirements
### Choose Responses API when:
- ✅ You need **conversation branching** to explore multiple paths
- ✅ You want **dynamic per-call configuration** (models, tools, vector stores)
- ✅ You're **migrating from OpenAI** and want familiar tool patterns
- ✅ You need **OpenAI compatibility** for existing workflows
- ✅ Your application benefits from **flexible, experimental** interactions
## Related Resources
- **[Agents](./agent)** - Understanding the Agents API fundamentals
- **[Agent Execution Loop](./agent_execution_loop)** - How agents process turns and steps
- **[Tools Integration](./tools)** - Adding capabilities to both APIs
- **[OpenAI Compatibility](../providers/openai)** - Using OpenAI-compatible endpoints
- **[Safety Guardrails](./safety)** - Implementing safety measures in agents

View file

@ -0,0 +1,394 @@
---
title: Safety Guardrails
description: Implement safety measures and content moderation in Llama Stack applications
sidebar_label: Safety
sidebar_position: 9
---
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Safety Guardrails
Safety is a critical component of any AI application. Llama Stack provides a comprehensive Shield system that can be applied at multiple touchpoints to ensure responsible AI behavior and content moderation.
## Shield System Overview
The Shield system in Llama Stack provides:
- **Content filtering** for both input and output messages
- **Multi-touchpoint protection** across your application flow
- **Configurable safety policies** tailored to your use case
- **Integration with agents** for automated safety enforcement
## Basic Shield Usage
### Registering a Safety Shield
<Tabs>
<TabItem value="registration" label="Shield Registration">
```python
# Register a safety shield
shield_id = "content_safety"
client.shields.register(
shield_id=shield_id,
provider_shield_id="llama-guard-basic"
)
```
</TabItem>
<TabItem value="manual-check" label="Manual Safety Check">
```python
# Run content through shield manually
response = client.safety.run_shield(
shield_id=shield_id,
messages=[{"role": "user", "content": "User message here"}]
)
if response.violation:
print(f"Safety violation detected: {response.violation.user_message}")
# Handle violation appropriately
else:
print("Content passed safety checks")
```
</TabItem>
</Tabs>
## Agent Integration
Shields can be automatically applied to agent interactions for seamless safety enforcement:
<Tabs>
<TabItem value="input-shields" label="Input Shields">
```python
from llama_stack_client import Agent
# Create agent with input safety shields
agent = Agent(
client,
model="meta-llama/Llama-3.2-3B-Instruct",
instructions="You are a helpful assistant",
input_shields=["content_safety"], # Shield user inputs
tools=["builtin::websearch"],
)
session_id = agent.create_session("safe_session")
# All user inputs will be automatically screened
response = agent.create_turn(
messages=[{"role": "user", "content": "Tell me about AI safety"}],
session_id=session_id,
)
```
</TabItem>
<TabItem value="output-shields" label="Output Shields">
```python
# Create agent with output safety shields
agent = Agent(
client,
model="meta-llama/Llama-3.2-3B-Instruct",
instructions="You are a helpful assistant",
output_shields=["content_safety"], # Shield agent outputs
tools=["builtin::websearch"],
)
session_id = agent.create_session("safe_session")
# All agent responses will be automatically screened
response = agent.create_turn(
messages=[{"role": "user", "content": "Help me with my research"}],
session_id=session_id,
)
```
</TabItem>
<TabItem value="both-shields" label="Input & Output Shields">
```python
# Create agent with comprehensive safety coverage
agent = Agent(
client,
model="meta-llama/Llama-3.2-3B-Instruct",
instructions="You are a helpful assistant",
input_shields=["content_safety"], # Screen user inputs
output_shields=["content_safety"], # Screen agent outputs
tools=["builtin::websearch"],
)
session_id = agent.create_session("fully_protected_session")
# Both input and output are automatically protected
response = agent.create_turn(
messages=[{"role": "user", "content": "Research question here"}],
session_id=session_id,
)
```
</TabItem>
</Tabs>
## Available Shield Types
### Llama Guard Shields
Llama Guard provides state-of-the-art content safety classification:
<Tabs>
<TabItem value="basic" label="Basic Llama Guard">
```python
# Basic Llama Guard for general content safety
client.shields.register(
shield_id="llama_guard_basic",
provider_shield_id="llama-guard-basic"
)
```
**Use Cases:**
- General content moderation
- Harmful content detection
- Basic safety compliance
</TabItem>
<TabItem value="advanced" label="Advanced Llama Guard">
```python
# Advanced Llama Guard with custom categories
client.shields.register(
shield_id="llama_guard_advanced",
provider_shield_id="llama-guard-advanced",
config={
"categories": [
"violence", "hate_speech", "sexual_content",
"self_harm", "illegal_activity"
],
"threshold": 0.8
}
)
```
**Use Cases:**
- Fine-tuned safety policies
- Domain-specific content filtering
- Enterprise compliance requirements
</TabItem>
</Tabs>
### Custom Safety Shields
Create domain-specific safety shields for specialized use cases:
```python
# Register custom safety shield
client.shields.register(
shield_id="financial_compliance",
provider_shield_id="custom-financial-shield",
config={
"detect_pii": True,
"financial_advice_warning": True,
"regulatory_compliance": "FINRA"
}
)
```
## Safety Response Handling
When safety violations are detected, handle them appropriately:
<Tabs>
<TabItem value="basic-handling" label="Basic Handling">
```python
response = client.safety.run_shield(
shield_id="content_safety",
messages=[{"role": "user", "content": "Potentially harmful content"}]
)
if response.violation:
violation = response.violation
print(f"Violation Type: {violation.violation_type}")
print(f"User Message: {violation.user_message}")
print(f"Metadata: {violation.metadata}")
# Log the violation for audit purposes
logger.warning(f"Safety violation detected: {violation.violation_type}")
# Provide appropriate user feedback
return "I can't help with that request. Please try asking something else."
```
</TabItem>
<TabItem value="advanced-handling" label="Advanced Handling">
```python
import logging
from datetime import datetime

logger = logging.getLogger(__name__)


def handle_safety_response(safety_response, user_message):
"""Advanced safety response handling with logging and user feedback"""
if not safety_response.violation:
return {"safe": True, "message": "Content passed safety checks"}
violation = safety_response.violation
# Log violation details
audit_log = {
"timestamp": datetime.now().isoformat(),
"violation_type": violation.violation_type,
"original_message": user_message,
"shield_response": violation.user_message,
"metadata": violation.metadata
}
logger.warning(f"Safety violation: {audit_log}")
# Determine appropriate response based on violation type
if violation.violation_type == "hate_speech":
user_feedback = "I can't engage with content that contains hate speech. Let's keep our conversation respectful."
elif violation.violation_type == "violence":
user_feedback = "I can't provide information that could promote violence. How else can I help you today?"
else:
user_feedback = "I can't help with that request. Please try asking something else."
return {
"safe": False,
"user_feedback": user_feedback,
"violation_details": audit_log
}
# Usage
safety_result = handle_safety_response(response, user_input)
if not safety_result["safe"]:
return safety_result["user_feedback"]
```
</TabItem>
</Tabs>
## Safety Configuration Best Practices
### 🛡️ **Multi-Layer Protection**
- Use both input and output shields for comprehensive coverage
- Combine multiple shield types for different threat categories
- Implement fallback mechanisms when shields fail
### 📊 **Monitoring & Auditing**
- Log all safety violations for compliance and analysis
- Monitor false positive rates to tune shield sensitivity
- Track safety metrics across different use cases
### ⚙️ **Configuration Management**
- Use environment-specific safety configurations
- Implement A/B testing for shield effectiveness
- Regularly update shield models and policies
### 🔧 **Integration Patterns**
- Integrate shields early in the development process
- Test safety measures with adversarial inputs
- Provide clear user feedback for violations
## Advanced Safety Scenarios
### Context-Aware Safety
```python
# Safety shields that consider conversation context
agent = Agent(
client,
model="meta-llama/Llama-3.2-3B-Instruct",
instructions="You are a healthcare assistant",
input_shields=["medical_safety"],
output_shields=["medical_safety"],
# Context helps shields make better decisions
safety_context={
"domain": "healthcare",
"user_type": "patient",
"compliance_level": "HIPAA"
}
)
```
### Dynamic Shield Selection
```python
def select_shield_for_user(user_profile):
"""Select appropriate safety shield based on user context"""
if user_profile.age < 18:
return "child_safety_shield"
elif user_profile.context == "enterprise":
return "enterprise_compliance_shield"
else:
return "general_safety_shield"
# Use dynamic shield selection
shield_id = select_shield_for_user(current_user)
response = client.safety.run_shield(
shield_id=shield_id,
messages=messages
)
```
## Compliance and Regulations
### Industry-Specific Safety
<Tabs>
<TabItem value="healthcare" label="Healthcare (HIPAA)">
```python
# Healthcare-specific safety configuration
client.shields.register(
shield_id="hipaa_compliance",
provider_shield_id="healthcare-safety-shield",
config={
"detect_phi": True, # Protected Health Information
"medical_advice_warning": True,
"regulatory_framework": "HIPAA"
}
)
```
</TabItem>
<TabItem value="financial" label="Financial (FINRA)">
```python
# Financial services safety configuration
client.shields.register(
shield_id="finra_compliance",
provider_shield_id="financial-safety-shield",
config={
"detect_financial_advice": True,
"investment_disclaimers": True,
"regulatory_framework": "FINRA"
}
)
```
</TabItem>
<TabItem value="education" label="Education (COPPA)">
```python
# Educational platform safety for minors
client.shields.register(
shield_id="coppa_compliance",
provider_shield_id="educational-safety-shield",
config={
"child_protection": True,
"educational_content_only": True,
"regulatory_framework": "COPPA"
}
)
```
</TabItem>
</Tabs>
## Related Resources
- **[Agents](./agent)** - Integrating safety shields with intelligent agents
- **[Agent Execution Loop](./agent_execution_loop)** - Understanding safety in the execution flow
- **[Evaluations](./evals)** - Evaluating safety shield effectiveness
- **[Llama Guard Documentation](https://github.com/meta-llama/PurpleLlama/tree/main/Llama-Guard3)** - Advanced safety model details

View file

@ -0,0 +1,212 @@
---
title: Telemetry
description: Monitor and observe Llama Stack applications with comprehensive telemetry capabilities
sidebar_label: Telemetry
sidebar_position: 8
---
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Telemetry
Llama Stack uses OpenTelemetry to provide comprehensive tracing, metrics, and logging capabilities.
## Automatic Metrics Generation
Llama Stack automatically generates metrics during inference operations. These metrics are aggregated at the **inference request level** and provide insights into token usage and model performance.
### Available Metrics
The following metrics are automatically generated for each inference request:
| Metric Name | Type | Unit | Description | Labels |
|-------------|------|------|-------------|--------|
| `llama_stack_prompt_tokens_total` | Counter | `tokens` | Number of tokens in the input prompt | `model_id`, `provider_id` |
| `llama_stack_completion_tokens_total` | Counter | `tokens` | Number of tokens in the generated response | `model_id`, `provider_id` |
| `llama_stack_tokens_total` | Counter | `tokens` | Total tokens used (prompt + completion) | `model_id`, `provider_id` |
### Metric Generation Flow
1. **Token Counting**: During inference operations (chat completion, completion, etc.), the system counts tokens in both input prompts and generated responses
2. **Metric Construction**: For each request, `MetricEvent` objects are created with the token counts
3. **Telemetry Logging**: Metrics are sent to the configured telemetry sinks
4. **OpenTelemetry Export**: When OpenTelemetry is enabled, metrics are exposed as standard OpenTelemetry counters
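The per-request token counts that feed these metrics correspond to the `usage` field returned on each OpenAI-compatible response, so you can sanity-check them directly (model id illustrative):
```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")

response = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",  # illustrative model id
    messages=[{"role": "user", "content": "Summarize OpenTelemetry in one line."}],
)
# prompt_tokens / completion_tokens / total_tokens mirror the counters above
print(response.usage)
```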
### Metric Aggregation Level
All metrics are generated and aggregated at the **inference request level**. This means:
- Each individual inference request generates its own set of metrics
- Metrics are not pre-aggregated across multiple requests
- Aggregation (sums, averages, etc.) can be performed by your observability tools (Prometheus, Grafana, etc.)
- Each metric includes labels for `model_id` and `provider_id` to enable filtering and grouping
### Example Metric Event
```python
MetricEvent(
trace_id="1234567890abcdef",
span_id="abcdef1234567890",
metric="total_tokens",
value=150,
timestamp=1703123456.789,
unit="tokens",
attributes={
"model_id": "meta-llama/Llama-3.2-3B-Instruct",
"provider_id": "tgi"
},
)
```
## Telemetry Sinks
Choose from multiple sink types based on your observability needs:
<Tabs>
<TabItem value="opentelemetry" label="OpenTelemetry">
Send events to an OpenTelemetry Collector for integration with observability platforms:
**Use Cases:**
- Visualizing traces in tools like Jaeger
- Collecting metrics for Prometheus
- Integration with enterprise observability stacks
**Features:**
- Standard OpenTelemetry format
- Compatible with all OpenTelemetry collectors
- Supports both traces and metrics
</TabItem>
<TabItem value="console" label="Console">
Print events to the console for immediate debugging:
**Use Cases:**
- Development and testing
- Quick debugging sessions
- Simple logging without external tools
**Features:**
- Immediate output visibility
- No setup required
- Human-readable format
</TabItem>
</Tabs>
## Configuration
### Meta-Reference Provider
Currently, only the meta-reference provider is implemented. It can be configured to send events to multiple sink types:
```yaml
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
service_name: "llama-stack-service"
sinks: ['console', 'otel_trace', 'otel_metric']
otel_exporter_otlp_endpoint: "http://localhost:4318"
```
### Environment Variables
Configure telemetry behavior using environment variables:
- **`OTEL_EXPORTER_OTLP_ENDPOINT`**: OpenTelemetry Collector endpoint (default: `http://localhost:4318`)
- **`OTEL_SERVICE_NAME`**: Service name for telemetry (default: empty string)
- **`TELEMETRY_SINKS`**: Comma-separated list of sinks (default: `[]`)
### Quick Setup: Complete Telemetry Stack
Use the automated setup script to launch the complete telemetry stack (Jaeger, OpenTelemetry Collector, Prometheus, and Grafana):
```bash
./scripts/telemetry/setup_telemetry.sh
```
This sets up:
- **Jaeger UI**: http://localhost:16686 (traces visualization)
- **Prometheus**: http://localhost:9090 (metrics)
- **Grafana**: http://localhost:3000 (dashboards with auto-configured data sources)
- **OTEL Collector**: http://localhost:4318 (OTLP endpoint)
Once running, you can visualize traces by navigating to [Grafana](http://localhost:3000/) and logging in with username `admin` and password `admin`.
## Querying Metrics
When using the OpenTelemetry sink, metrics are exposed in standard format and can be queried through various tools:
<Tabs>
<TabItem value="prometheus" label="Prometheus Queries">
Example Prometheus queries for analyzing token usage:
```promql
# Total tokens used across all models
sum(llama_stack_tokens_total)
# Tokens per model
sum by (model_id) (llama_stack_tokens_total)
# Token consumption rate (tokens per second), averaged over the last 5 minutes
rate(llama_stack_tokens_total[5m])
# Token usage by provider
sum by (provider_id) (llama_stack_tokens_total)
```
</TabItem>
<TabItem value="grafana" label="Grafana Dashboards">
Create dashboards using Prometheus as a data source:
- **Token Usage Over Time**: Line charts showing token consumption trends
- **Model Performance**: Comparison of different models by token efficiency
- **Provider Analysis**: Breakdown of usage across different providers
- **Request Patterns**: Understanding peak usage times and patterns
</TabItem>
<TabItem value="otlp" label="OpenTelemetry Collector">
Forward metrics to other observability systems:
- Export to multiple backends simultaneously
- Apply transformations and filtering
- Integrate with existing monitoring infrastructure
</TabItem>
</Tabs>
## Best Practices
### 🔍 **Monitoring Strategy**
- Use OpenTelemetry for production environments
- Set up alerts on key metrics like token usage and error rates
### 📊 **Metrics Analysis**
- Track token usage trends to optimize costs
- Monitor response times across different models
- Analyze usage patterns to improve resource allocation
### 🚨 **Alerting & Debugging**
- Set up alerts for unusual token consumption spikes
- Use trace data to debug performance issues
- Monitor error rates and failure patterns
### 🔧 **Configuration Management**
- Use environment variables for flexible deployment
- Ensure proper network access to OpenTelemetry collectors
## Related Resources
- **[Agents](./agent)** - Monitoring agent execution with telemetry
- **[Evaluations](./evals)** - Using telemetry data for performance evaluation
- **[Getting Started Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)** - Telemetry examples and queries
- **[OpenTelemetry Documentation](https://opentelemetry.io/)** - Comprehensive observability framework
- **[Jaeger Documentation](https://www.jaegertracing.io/)** - Distributed tracing visualization

View file

@ -1,6 +1,17 @@
---
title: Tools
description: Extend agent capabilities with external tools and function calling
sidebar_label: Tools
sidebar_position: 6
---
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Tools # Tools
Tools are functions that can be invoked by an agent to perform tasks. They are organized into tool groups and registered with specific providers. Each tool group represents a collection of related tools from a single provider. They are organized into groups so that state can be externalized: the collection operates on the same state typically. Tools are functions that can be invoked by an agent to perform tasks. They are organized into tool groups and registered with specific providers. Each tool group represents a collection of related tools from a single provider. They are organized into groups so that state can be externalized: the collection operates on the same state typically.
An example of this would be a "db_access" tool group that contains tools for interacting with a database. "list_tables", "query_table", "insert_row" could be examples of tools in this group. An example of this would be a "db_access" tool group that contains tools for interacting with a database. "list_tables", "query_table", "insert_row" could be examples of tools in this group.
Tools are treated as any other resource in llama stack like models. You can register them, have providers for them etc. Tools are treated as any other resource in llama stack like models. You can register them, have providers for them etc.
@ -9,18 +20,15 @@ When instantiating an agent, you can provide it a list of tool groups that it ha
Refer to the [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb) notebook for more examples on how to use tools. Refer to the [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb) notebook for more examples on how to use tools.
## Server-side vs. client-side tool execution ## Server-side vs. Client-side Tool Execution
Llama Stack allows you to use both server-side and client-side tools. With server-side tools, `agent.create_turn` can perform execution of the tool calls emitted by the model Llama Stack allows you to use both server-side and client-side tools. With server-side tools, `agent.create_turn` can perform execution of the tool calls emitted by the model transparently giving the user the final answer desired. If client-side tools are provided, the tool call is sent back to the user for execution and optional continuation using the `agent.resume_turn` method.
transparently giving the user the final answer desired. If client-side tools are provided, the tool call is sent back to the user for execution
and optional continuation using the `agent.resume_turn` method.
## Server-side Tools
### Server-side tools
Llama Stack provides built-in providers for some common tools. These include web search, math, and RAG capabilities. Llama Stack provides built-in providers for some common tools. These include web search, math, and RAG capabilities.
#### Web Search ### Web Search
You have three providers to execute the web search tool calls generated by a model: Brave Search, Bing Search, and Tavily Search. You have three providers to execute the web search tool calls generated by a model: Brave Search, Bing Search, and Tavily Search.
@ -39,25 +47,26 @@ The tool requires an API key which can be provided either in the configuration o
{"<provider_name>_api_key": <your api key>} {"<provider_name>_api_key": <your api key>}
``` ```
### Math
#### Math
The WolframAlpha tool provides access to computational knowledge through the WolframAlpha API. The WolframAlpha tool provides access to computational knowledge through the WolframAlpha API.
```python ```python
client.toolgroups.register( client.toolgroups.register(
toolgroup_id="builtin::wolfram_alpha", provider_id="wolfram-alpha" toolgroup_id="builtin::wolfram_alpha",
provider_id="wolfram-alpha"
) )
``` ```
Example usage: Example usage:
```python ```python
result = client.tool_runtime.invoke_tool( result = client.tool_runtime.invoke_tool(
tool_name="wolfram_alpha", args={"query": "solve x^2 + 2x + 1 = 0"} tool_name="wolfram_alpha",
args={"query": "solve x^2 + 2x + 1 = 0"}
) )
``` ```
#### RAG ### RAG
The RAG tool enables retrieval of context from various types of memory banks (vector, key-value, keyword, and graph). The RAG tool enables retrieval of context from various types of memory banks (vector, key-value, keyword, and graph).
@ -75,16 +84,13 @@ Features:
- Configurable query generation - Configurable query generation
- Context retrieval with token limits - Context retrieval with token limits
:::note[Default Configuration]
```{note}
By default, llama stack run.yaml defines toolgroups for web search, wolfram alpha and rag, that are provided by tavily-search, wolfram-alpha and rag providers. By default, llama stack run.yaml defines toolgroups for web search, wolfram alpha and rag, that are provided by tavily-search, wolfram-alpha and rag providers.
``` :::
## Model Context Protocol (MCP) ## Model Context Protocol (MCP)
[MCP](https://github.com/modelcontextprotocol) is an upcoming, popular standard for tool discovery and execution. It is a protocol that allows tools to be dynamically discovered [MCP](https://github.com/modelcontextprotocol) is an upcoming, popular standard for tool discovery and execution. It is a protocol that allows tools to be dynamically discovered from an MCP endpoint and can be used to extend the agent's capabilities.
from an MCP endpoint and can be used to extend the agent's capabilities.
### Using Remote MCP Servers ### Using Remote MCP Servers
@ -98,8 +104,7 @@ client.toolgroups.register(
) )
``` ```
Note that most of the more useful MCP servers need you to authenticate with them. Many of them use OAuth2.0 for authentication. You can provide authorization headers to send to the MCP server Note that most of the more useful MCP servers need you to authenticate with them. Many of them use OAuth2.0 for authentication. You can provide authorization headers to send to the MCP server using the "Provider Data" abstraction provided by Llama Stack. When making an agent call,
using the "Provider Data" abstraction provided by Llama Stack. When making an agent call,
```python ```python
agent = Agent( agent = Agent(
@ -120,20 +125,26 @@ agent = Agent(
agent.create_turn(...) agent.create_turn(...)
``` ```
### Running your own MCP server ### Running Your Own MCP Server
Here's an example of how to run a simple MCP server that exposes a File System as a set of tools to the Llama Stack agent. Here's an example of how to run a simple MCP server that exposes a File System as a set of tools to the Llama Stack agent.
<Tabs>
<TabItem value="setup" label="Server Setup">
```shell ```shell
# start your MCP server # Start your MCP server
mkdir /tmp/content mkdir /tmp/content
touch /tmp/content/foo touch /tmp/content/foo
touch /tmp/content/bar touch /tmp/content/bar
npx -y supergateway --port 8000 --stdio 'npx -y @modelcontextprotocol/server-filesystem /tmp/content' npx -y supergateway --port 8000 --stdio 'npx -y @modelcontextprotocol/server-filesystem /tmp/content'
``` ```
Then register the MCP server as a tool group, </TabItem>
<TabItem value="register" label="Registration">
```python ```python
# Register the MCP server as a tool group
client.toolgroups.register( client.toolgroups.register(
toolgroup_id="mcp::filesystem", toolgroup_id="mcp::filesystem",
provider_id="model-context-protocol", provider_id="model-context-protocol",
@ -141,12 +152,12 @@ client.toolgroups.register(
) )
``` ```
</TabItem>
</Tabs>
## Adding Custom (Client-side) Tools ## Adding Custom (Client-side) Tools
When you want to use tools other than the built-in tools, you just need to implement a python function with a docstring. The content of the docstring will be used to describe the tool and the parameters and passed When you want to use tools other than the built-in tools, you just need to implement a python function with a docstring. The content of the docstring will be used to describe the tool and the parameters and passed along to the generative model.
along to the generative model.
```python ```python
# Example tool definition # Example tool definition
@ -158,16 +169,19 @@ def my_tool(input: int) -> int:
""" """
return input * 2 return input * 2
``` ```
> **NOTE:** We employ python docstrings to describe the tool and the parameters. It is important to document the tool and the parameters so that the model can use the tool correctly. It is recommended to experiment with different docstrings to see how they affect the model's behavior.
:::tip[Documentation Best Practices]
We employ python docstrings to describe the tool and the parameters. It is important to document the tool and the parameters so that the model can use the tool correctly. It is recommended to experiment with different docstrings to see how they affect the model's behavior.
:::
Once defined, simply pass the tool to the agent config. `Agent` will take care of the rest (calling the model with the tool definition, executing the tool, and returning the result to the model for the next iteration). Once defined, simply pass the tool to the agent config. `Agent` will take care of the rest (calling the model with the tool definition, executing the tool, and returning the result to the model for the next iteration).
```python ```python
# Example agent config with client provided tools # Example agent config with client provided tools
agent = Agent(client, ..., tools=[my_tool]) agent = Agent(client, ..., tools=[my_tool])
``` ```
Refer to [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/blob/main/examples/agents/e2e_loop_with_client_tools.py) for an example of how to use client provided tools. Refer to [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/) for an example of how to use client provided tools.
## Tool Invocation ## Tool Invocation
@ -175,7 +189,8 @@ Tools can be invoked using the `invoke_tool` method:
```python ```python
result = client.tool_runtime.invoke_tool( result = client.tool_runtime.invoke_tool(
tool_name="web_search", kwargs={"query": "What is the capital of France?"} tool_name="web_search",
kwargs={"query": "What is the capital of France?"}
) )
``` ```
@ -196,16 +211,22 @@ all_tools = client.tools.list_tools()
group_tools = client.tools.list_tools(toolgroup_id="search_tools") group_tools = client.tools.list_tools(toolgroup_id="search_tools")
``` ```
## Simple Example 2: Using an Agent with the Web Search Tool ## Complete Examples
### Web Search Agent
<Tabs>
<TabItem value="setup" label="Setup & Configuration">
1. Start by registering a Tavily API key at [Tavily](https://tavily.com/). 1. Start by registering a Tavily API key at [Tavily](https://tavily.com/).
2. [Optional] Provide the API key directly to the Llama Stack server 2. [Optional] Set the API key in your environment before starting the Llama Stack server
```bash ```bash
export TAVILY_SEARCH_API_KEY="your key" export TAVILY_SEARCH_API_KEY="your key"
``` ```
```bash
--env TAVILY_SEARCH_API_KEY=${TAVILY_SEARCH_API_KEY} </TabItem>
``` <TabItem value="implementation" label="Implementation">
3. Run the following script.
```python ```python
from llama_stack_client.lib.agents.agent import Agent from llama_stack_client.lib.agents.agent import Agent
from llama_stack_client.types.agent_create_params import AgentConfig from llama_stack_client.types.agent_create_params import AgentConfig
@ -240,11 +261,18 @@ for log in EventLogger().log(response):
log.print() log.print()
``` ```
## Simple Example3: Using an Agent with the WolframAlpha Tool </TabItem>
</Tabs>
### WolframAlpha Math Agent
<Tabs>
<TabItem value="setup" label="Setup & Configuration">
1. Start by registering for a WolframAlpha API key at [WolframAlpha Developer Portal](https://developer.wolframalpha.com/access). 1. Start by registering for a WolframAlpha API key at [WolframAlpha Developer Portal](https://developer.wolframalpha.com/access).
2. Provide the API key either when starting the Llama Stack server: 2. Provide the API key either by setting it in your environment before starting the Llama Stack server:
```bash ```bash
--env WOLFRAM_ALPHA_API_KEY=${WOLFRAM_ALPHA_API_KEY} export WOLFRAM_ALPHA_API_KEY="your key"
``` ```
or from the client side: or from the client side:
```python ```python
@ -253,12 +281,57 @@ for log in EventLogger().log(response):
provider_data={"wolfram_alpha_api_key": wolfram_api_key}, provider_data={"wolfram_alpha_api_key": wolfram_api_key},
) )
``` ```
3. Configure the tools in the Agent by setting `tools=["builtin::wolfram_alpha"]`.
4. Example user query: </TabItem>
<TabItem value="implementation" label="Implementation">
```python ```python
# Configure the tools in the Agent by setting tools=["builtin::wolfram_alpha"]
agent = Agent(
client,
model="meta-llama/Llama-3.2-3B-Instruct",
instructions="You are a mathematical assistant that can solve complex equations.",
tools=["builtin::wolfram_alpha"],
)
session_id = agent.create_session("math-session")
# Example user query
response = agent.create_turn( response = agent.create_turn(
messages=[{"role": "user", "content": "Solve x^2 + 2x + 1 = 0 using WolframAlpha"}], messages=[{"role": "user", "content": "Solve x^2 + 2x + 1 = 0 using WolframAlpha"}],
session_id=session_id, session_id=session_id,
) )
``` ```
</TabItem>
</Tabs>
## Best Practices
### 🛠️ **Tool Selection**
- Use **server-side tools** for production applications requiring reliability and security
- Use **client-side tools** for development, prototyping, or specialized integrations
- Combine multiple tool types for comprehensive functionality
### 📝 **Documentation**
- Write clear, detailed docstrings for custom tools
- Include parameter descriptions and expected return types
- Test tool descriptions with the model to ensure proper usage
### 🔐 **Security**
- Store API keys securely using environment variables or secure configuration
- Use the `X-LlamaStack-Provider-Data` header for dynamic authentication
- Validate tool inputs and outputs for security
### 🔄 **Error Handling**
- Implement proper error handling in custom tools
- Use structured error responses with meaningful messages
- Monitor tool performance and reliability
## Related Resources
- **[Agents](./agent)** - Building intelligent agents with tools
- **[RAG (Retrieval Augmented Generation)](./rag)** - Using knowledge retrieval tools
- **[Agent Execution Loop](./agent_execution_loop)** - Understanding tool execution flow
- **[Building AI Applications Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)** - Comprehensive examples
- **[Llama Stack Apps Examples](https://github.com/meta-llama/llama-stack-apps)** - Real-world tool implementations

View file

@ -1,3 +1,10 @@
---
title: API Stability Leveling
description: Understanding API stability levels and versioning in Llama Stack
sidebar_label: API Stability
sidebar_position: 4
---
# Llama Stack API Stability Leveling # Llama Stack API Stability Leveling
In order to provide a stable experience in Llama Stack, the various APIs need different stability levels indicating the level of support, backwards compatibility, and overall production readiness. In order to provide a stable experience in Llama Stack, the various APIs need different stability levels indicating the level of support, backwards compatibility, and overall production readiness.
@ -55,6 +62,10 @@ The new `/v2` API must be introduced alongside the existing `/v1` API and run in
When a `/v2` API is introduced, a clear and generous deprecation policy for the `/v1` API must be published simultaneously. This policy must outline the timeline for the eventual removal of the `/v1` API, giving users ample time to migrate. When a `/v2` API is introduced, a clear and generous deprecation policy for the `/v1` API must be published simultaneously. This policy must outline the timeline for the eventual removal of the `/v1` API, giving users ample time to migrate.
### Deprecated APIs
Deprecated APIs are those that are no longer actively maintained or supported. Deprecated APIs are marked with the flag `deprecated = True` in the OpenAPI spec. These APIs will be removed in a future release.
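For illustration, a deprecated operation in the generated OpenAPI document carries the standard `deprecated: true` flag (a sketch; the path and summary below are placeholders, not an actual Llama Stack endpoint):
```yaml
paths:
  /v1/example/legacy-endpoint:   # placeholder path
    get:
      summary: Legacy operation retained only for migration
      deprecated: true
      responses:
        "200":
          description: OK
```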
### API Stability vs. Provider Stability ### API Stability vs. Provider Stability
The leveling introduced in this document relates to the stability of the API and not specifically the providers within the API. The leveling introduced in this document relates to the stability of the API and not specifically the providers within the API.
View file
@ -1,4 +1,11 @@
## API Providers ---
title: API Providers
description: Understanding remote vs inline provider implementations
sidebar_label: API Providers
sidebar_position: 2
---
# API Providers
The goal of Llama Stack is to build an ecosystem where users can easily swap out different implementations for the same API. Examples for these include: The goal of Llama Stack is to build an ecosystem where users can easily swap out different implementations for the same API. Examples for these include:
- LLM inference providers (e.g., Fireworks, Together, AWS Bedrock, Groq, Cerebras, SambaNova, vLLM, etc.), - LLM inference providers (e.g., Fireworks, Together, AWS Bedrock, Groq, Cerebras, SambaNova, vLLM, etc.),
View file
@ -1,3 +1,9 @@
---
title: External APIs
description: Understanding external APIs in Llama Stack
sidebar_label: External APIs
sidebar_position: 3
---
# External APIs # External APIs
Llama Stack supports external APIs that live outside of the main codebase. This allows you to: Llama Stack supports external APIs that live outside of the main codebase. This allows you to:
@ -146,7 +152,6 @@ __all__ = ["WeatherAPI", "available_providers"]
from typing import Protocol from typing import Protocol
from llama_stack.providers.datatypes import ( from llama_stack.providers.datatypes import (
AdapterSpec,
Api, Api,
ProviderSpec, ProviderSpec,
RemoteProviderSpec, RemoteProviderSpec,
@ -160,13 +165,11 @@ def available_providers() -> list[ProviderSpec]:
api=Api.weather, api=Api.weather,
provider_type="remote::kaze", provider_type="remote::kaze",
config_class="llama_stack_provider_kaze.KazeProviderConfig", config_class="llama_stack_provider_kaze.KazeProviderConfig",
adapter=AdapterSpec(
adapter_type="kaze", adapter_type="kaze",
module="llama_stack_provider_kaze", module="llama_stack_provider_kaze",
pip_packages=["llama_stack_provider_kaze"], pip_packages=["llama_stack_provider_kaze"],
config_class="llama_stack_provider_kaze.KazeProviderConfig", config_class="llama_stack_provider_kaze.KazeProviderConfig",
), ),
),
] ]
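Putting the hunk together, the registry entry after this change reads roughly as follows (a sketch reconstructed from the diff above; the adapter fields now sit directly on `RemoteProviderSpec`):
```python
from llama_stack.providers.datatypes import (
    Api,
    ProviderSpec,
    RemoteProviderSpec,
)


def available_providers() -> list[ProviderSpec]:
    # The nested AdapterSpec is gone; its fields are passed directly.
    return [
        RemoteProviderSpec(
            api=Api.weather,
            provider_type="remote::kaze",
            config_class="llama_stack_provider_kaze.KazeProviderConfig",
            adapter_type="kaze",
            module="llama_stack_provider_kaze",
            pip_packages=["llama_stack_provider_kaze"],
        ),
    ]
```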
@ -319,7 +322,6 @@ class WeatherKazeAdapter(WeatherProvider):
```yaml ```yaml
# ~/.llama/providers.d/remote/weather/kaze.yaml # ~/.llama/providers.d/remote/weather/kaze.yaml
adapter:
adapter_type: kaze adapter_type: kaze
pip_packages: ["llama_stack_provider_kaze"] pip_packages: ["llama_stack_provider_kaze"]
config_class: llama_stack_provider_kaze.config.KazeProviderConfig config_class: llama_stack_provider_kaze.config.KazeProviderConfig
@ -355,7 +357,7 @@ server:
8. Run the server: 8. Run the server:
```bash ```bash
python -m llama_stack.core.server.server --yaml-config ~/.llama/run-byoa.yaml llama stack run ~/.llama/run-byoa.yaml
``` ```
9. Test the API: 9. Test the API:
View file
@ -1,4 +1,11 @@
## APIs ---
title: APIs
description: Available REST APIs and planned capabilities in Llama Stack
sidebar_label: APIs
sidebar_position: 1
---
# APIs
A Llama Stack API is described as a collection of REST endpoints. We currently support the following APIs: A Llama Stack API is described as a collection of REST endpoints. We currently support the following APIs:
@ -9,7 +16,6 @@ A Llama Stack API is described as a collection of REST endpoints. We currently s
- **Scoring**: evaluate outputs of the system - **Scoring**: evaluate outputs of the system
- **Eval**: generate outputs (via Inference or Agents) and perform scoring - **Eval**: generate outputs (via Inference or Agents) and perform scoring
- **VectorIO**: perform operations on vector stores, such as adding documents, searching, and deleting documents - **VectorIO**: perform operations on vector stores, such as adding documents, searching, and deleting documents
- **Telemetry**: collect telemetry data from the system
- **Post Training**: fine-tune a model - **Post Training**: fine-tune a model
- **Tool Runtime**: interact with various tools and protocols - **Tool Runtime**: interact with various tools and protocols
- **Responses**: generate responses from an LLM using this OpenAI compatible API. - **Responses**: generate responses from an LLM using this OpenAI compatible API.
View file
@ -1,15 +1,19 @@
## Llama Stack architecture ---
title: Llama Stack Architecture
description: Understanding Llama Stack's service-oriented design and benefits
sidebar_label: Architecture
sidebar_position: 2
---
# Llama Stack architecture
Llama Stack allows you to build different layers of distributions for your AI workloads using various SDKs and API providers. Llama Stack allows you to build different layers of distributions for your AI workloads using various SDKs and API providers.
```{image} ../../_static/llama-stack.png <img src="/img/llama-stack.png" alt="Llama Stack" width="400" />
:alt: Llama Stack
:width: 400px
```
### Benefits of Llama stack ## Benefits of Llama stack
#### Current challenges in custom AI applications ### Current challenges in custom AI applications
Building production AI applications today requires solving multiple challenges: Building production AI applications today requires solving multiple challenges:
@ -32,7 +36,7 @@ Building production AI applications today requires solving multiple challenges:
- Different providers have different APIs and abstractions. - Different providers have different APIs and abstractions.
- Changing providers requires significant code changes. - Changing providers requires significant code changes.
#### Our Solution: A Universal Stack ### Our Solution: A Universal Stack
Llama Stack addresses these challenges through a service-oriented, API-first approach: Llama Stack addresses these challenges through a service-oriented, API-first approach:
@ -59,7 +63,7 @@ Llama Stack addresses these challenges through a service-oriented, API-first app
- Ecosystem offers tailored infrastructure, software, and services for deploying a variety of models. - Ecosystem offers tailored infrastructure, software, and services for deploying a variety of models.
### Our Philosophy ## Our Philosophy
- **Service-Oriented**: REST APIs enforce clean interfaces and enable seamless transitions across different environments. - **Service-Oriented**: REST APIs enforce clean interfaces and enable seamless transitions across different environments.
- **Composability**: Every component is independent but works together seamlessly - **Composability**: Every component is independent but works together seamlessly
View file
@ -1,4 +1,11 @@
## Distributions ---
title: Distributions
description: Pre-packaged provider configurations for different deployment scenarios
sidebar_label: Distributions
sidebar_position: 3
---
# Distributions
While there is a lot of flexibility to mix-and-match providers, often users will work with a specific set of providers (hardware support, contractual obligations, etc.) We therefore need to provide a _convenient shorthand_ for such collections. We call this shorthand a **Llama Stack Distribution** or a **Distro**. One can think of it as specific pre-packaged versions of the Llama Stack. Here are some examples: While there is a lot of flexibility to mix-and-match providers, often users will work with a specific set of providers (hardware support, contractual obligations, etc.) We therefore need to provide a _convenient shorthand_ for such collections. We call this shorthand a **Llama Stack Distribution** or a **Distro**. One can think of it as specific pre-packaged versions of the Llama Stack. Here are some examples:
@ -6,4 +13,4 @@ While there is a lot of flexibility to mix-and-match providers, often users will
**Locally Hosted Distro**: You may want to run Llama Stack on your own hardware. Typically though, you still need to use Inference via an external service. You can use providers like HuggingFace TGI, Fireworks, Together, etc. for this purpose. Or you may have access to GPUs and can run a [vLLM](https://github.com/vllm-project/vllm) or [NVIDIA NIM](https://build.nvidia.com/nim?filters=nimType%3Anim_type_run_anywhere&q=llama) instance. If you "just" have a regular desktop machine, you can use [Ollama](https://ollama.com/) for inference. To provide convenient quick access to these options, we provide a number of such pre-configured locally-hosted Distros. **Locally Hosted Distro**: You may want to run Llama Stack on your own hardware. Typically though, you still need to use Inference via an external service. You can use providers like HuggingFace TGI, Fireworks, Together, etc. for this purpose. Or you may have access to GPUs and can run a [vLLM](https://github.com/vllm-project/vllm) or [NVIDIA NIM](https://build.nvidia.com/nim?filters=nimType%3Anim_type_run_anywhere&q=llama) instance. If you "just" have a regular desktop machine, you can use [Ollama](https://ollama.com/) for inference. To provide convenient quick access to these options, we provide a number of such pre-configured locally-hosted Distros.
**On-device Distro**: To run Llama Stack directly on an edge device (mobile phone or a tablet), we provide Distros for [iOS](../distributions/ondevice_distro/ios_sdk.md) and [Android](../distributions/ondevice_distro/android_sdk.md) **On-device Distro**: To run Llama Stack directly on an edge device (mobile phone or a tablet), we provide Distros for [iOS](/docs/distributions/ondevice_distro/ios_sdk) and [Android](/docs/distributions/ondevice_distro/android_sdk)
View file
@ -1,16 +1,22 @@
## Evaluation Concepts ---
title: Evaluation Concepts
description: Running evaluations on Llama Stack
sidebar_label: Evaluation Concepts
sidebar_position: 5
---
# Evaluation Concepts
The Llama Stack Evaluation flow allows you to run evaluations on your GenAI application datasets or pre-registered benchmarks. The Llama Stack Evaluation flow allows you to run evaluations on your GenAI application datasets or pre-registered benchmarks.
We introduce a set of APIs in Llama Stack to support running evaluations of LLM applications. We introduce a set of APIs in Llama Stack to support running evaluations of LLM applications:
- `/datasetio` + `/datasets` API - `/datasetio` + `/datasets` API
- `/scoring` + `/scoring_functions` API - `/scoring` + `/scoring_functions` API
- `/eval` + `/benchmarks` API - `/eval` + `/benchmarks` API
This guide goes over the sets of APIs and developer experience flow of using Llama Stack to run evaluations for different use cases. Check out our Colab notebook on working examples with evaluations [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing). This guide goes over the sets of APIs and developer experience flow of using Llama Stack to run evaluations for different use cases. Check out our Colab notebook on working examples with evaluations [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing).
The Evaluation APIs are associated with a set of Resources. Please visit the Resources section in our [Core Concepts](./index.mdx) guide for better high-level understanding.
The Evaluation APIs are associated with a set of Resources. Please visit the Resources section in our [Core Concepts](../concepts/index.md) guide for better high-level understanding.
- **DatasetIO**: defines interface with datasets and data loaders. - **DatasetIO**: defines interface with datasets and data loaders.
- Associated with `Dataset` resource. - Associated with `Dataset` resource.
@ -19,10 +25,9 @@ The Evaluation APIs are associated with a set of Resources. Please visit the Res
- **Eval**: generate outputs (via Inference or Agents) and perform scoring. - **Eval**: generate outputs (via Inference or Agents) and perform scoring.
- Associated with `Benchmark` resource. - Associated with `Benchmark` resource.
## Open-benchmark Eval
### Open-benchmark Eval ### List of open-benchmarks Llama Stack supports
#### List of open-benchmarks Llama Stack supports
Llama Stack pre-registers several popular open-benchmarks to easily evaluate model performance via the CLI. Llama Stack pre-registers several popular open-benchmarks to easily evaluate model performance via the CLI.
@ -32,19 +37,17 @@ The list of open-benchmarks we currently support:
- [SimpleQA](https://openai.com/index/introducing-simpleqa/): Benchmark designed to assess models' ability to answer short, fact-seeking questions. - [SimpleQA](https://openai.com/index/introducing-simpleqa/): Benchmark designed to assess models' ability to answer short, fact-seeking questions.
- [MMMU](https://arxiv.org/abs/2311.16502) (A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI): Benchmark designed to evaluate multimodal models. - [MMMU](https://arxiv.org/abs/2311.16502) (A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI): Benchmark designed to evaluate multimodal models.
You can follow this [contributing guide](../references/evals_reference/#open-benchmark-contributing-guide) to add more open-benchmarks to Llama Stack
You can follow this [contributing guide](../references/evals_reference/index.md#open-benchmark-contributing-guide) to add more open-benchmarks to Llama Stack ### Run evaluation on open-benchmarks via CLI
#### Run evaluation on open-benchmarks via CLI
We have built-in functionality to run the supported open-benchmarks using the llama-stack-client CLI. We have built-in functionality to run the supported open-benchmarks using the llama-stack-client CLI.
#### Spin up Llama Stack server #### Spin up Llama Stack server
Spin up the Llama Stack server with the 'open-benchmark' template Spin up the Llama Stack server with the 'open-benchmark' template
``` ```bash
llama stack run llama_stack/distributions/open-benchmark/run.yaml llama stack run llama_stack/distributions/open-benchmark/run.yaml
``` ```
#### Run eval CLI #### Run eval CLI
@ -52,26 +55,24 @@ There are 3 necessary inputs to run a benchmark eval
- `list of benchmark_ids`: The list of benchmark ids to run evaluation on - `list of benchmark_ids`: The list of benchmark ids to run evaluation on
- `model-id`: The model id to evaluate on - `model-id`: The model id to evaluate on
- `output_dir`: Path to store the evaluation results - `output_dir`: Path to store the evaluation results
```
```bash
llama-stack-client eval run-benchmark <benchmark_id_1> <benchmark_id_2> ... \ llama-stack-client eval run-benchmark <benchmark_id_1> <benchmark_id_2> ... \
--model_id <model id to evaluate on> \ --model_id <model id to evaluate on> \
--output_dir <directory to store the evaluation results> \ --output_dir <directory to store the evaluation results>
``` ```
You can run You can run
``` ```bash
llama-stack-client eval run-benchmark help llama-stack-client eval run-benchmark help
``` ```
to see the description of all the flags that `eval run-benchmark` supports to see the description of all the flags that `eval run-benchmark` supports
In the output log, you can find the file path that has your evaluation results. Open that file and you can see your aggregate In the output log, you can find the file path that has your evaluation results. Open that file and you can see your aggregate
evaluation results there. evaluation results there.
## What's Next?
#### What's Next?
- Check out our Colab notebook on working examples with running benchmark evaluations [here](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb#scrollTo=mxLCsP4MvFqP). - Check out our Colab notebook on working examples with running benchmark evaluations [here](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb#scrollTo=mxLCsP4MvFqP).
- Check out our [Building Applications - Evaluation](../building_applications/evals.md) guide for more details on how to use the Evaluation APIs to evaluate your applications. - Check out our [Building Applications - Evaluation](../building_applications/evals.mdx) guide for more details on how to use the Evaluation APIs to evaluate your applications.
- Check out our [Evaluation Reference](../references/evals_reference/index.md) for more details on the APIs. - Check out our [Evaluation Reference](../references/evals_reference/) for more details on the APIs.
View file
@ -0,0 +1,31 @@
---
title: Core Concepts
description: Understanding Llama Stack's service-oriented philosophy and key concepts
sidebar_label: Overview
sidebar_position: 1
---
Given Llama Stack's service-oriented philosophy, a few concepts and workflows arise which may not feel completely natural in the LLM landscape, especially if you are coming with a background in other frameworks.
## Documentation Structure
This section covers the fundamental concepts of Llama Stack:
- **[Architecture](architecture.mdx)** - Learn about Llama Stack's architectural design and principles
- **[APIs](/docs/concepts/apis/)** - Understanding the core APIs and their stability levels
- [API Overview](apis/index.mdx) - Core APIs available in Llama Stack
- [API Providers](apis/api_providers.mdx) - How providers implement APIs
- [External APIs](apis/external.mdx) - External APIs available in Llama Stack
- [API Stability Leveling](apis/api_leveling.mdx) - API stability and versioning
- **[Distributions](distributions.mdx)** - Pre-configured deployment packages
- **[Resources](resources.mdx)** - Understanding Llama Stack resources and their lifecycle
## Getting Started
If you're new to Llama Stack, we recommend starting with:
1. **[Architecture](architecture.mdx)** - Understand the overall system design
2. **[APIs](apis/index.mdx)** - Learn about the available APIs and their purpose
3. **[Distributions](distributions.mdx)** - Choose a pre-configured setup for your use case
Each concept builds upon the previous ones to give you a comprehensive understanding of how Llama Stack works and how to use it effectively.
View file
@ -1,4 +1,11 @@
## Resources ---
title: Resources
description: Resource federation and registration in Llama Stack
sidebar_label: Resources
sidebar_position: 4
---
# Resources
Some of these APIs are associated with a set of **Resources**. Here is the mapping of APIs to resources: Some of these APIs are associated with a set of **Resources**. Here is the mapping of APIs to resources:
@ -12,8 +19,8 @@ Some of these APIs are associated with a set of **Resources**. Here is the mappi
Furthermore, we allow these resources to be **federated** across multiple providers. For example, you may have some Llama models served by Fireworks while others are served by AWS Bedrock. Regardless, they will all work seamlessly with the same uniform Inference API provided by Llama Stack. Furthermore, we allow these resources to be **federated** across multiple providers. For example, you may have some Llama models served by Fireworks while others are served by AWS Bedrock. Regardless, they will all work seamlessly with the same uniform Inference API provided by Llama Stack.
```{admonition} Registering Resources :::tip Registering Resources
:class: tip
Given this architecture, it is necessary for the Stack to know which provider to use for a given resource. This means you need to explicitly _register_ resources (including models) before you can use them with the associated APIs. Given this architecture, it is necessary for the Stack to know which provider to use for a given resource. This means you need to explicitly _register_ resources (including models) before you can use them with the associated APIs.
```
:::
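For example, registering a model explicitly from the client side might look like this (a minimal sketch assuming a running server and the `llama_stack_client` package; the identifiers are illustrative):
```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Tell the Stack which provider serves this model before calling the
# Inference API with it.
client.models.register(
    model_id="meta-llama/Llama-3.2-3B-Instruct",
    provider_id="ollama",
)
```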
View file
@ -0,0 +1,13 @@
---
title: Contributing
description: Contributing to Llama Stack
sidebar_label: Contributing to Llama Stack
sidebar_position: 3
hide_title: true
---
import Contributing from '!!raw-loader!../../../CONTRIBUTING.md';
import ReactMarkdown from 'react-markdown';
<ReactMarkdown>{Contributing}</ReactMarkdown>
View file
@ -1,12 +1,20 @@
# Adding a New API Provider ---
title: Adding a New API Provider
description: Guide for adding new API providers to Llama Stack
sidebar_label: New API Provider
sidebar_position: 2
---
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
This guide will walk you through the process of adding a new API provider to Llama Stack. This guide will walk you through the process of adding a new API provider to Llama Stack.
- Begin by reviewing the [core concepts](../concepts/index.md) of Llama Stack and choose the API your provider belongs to (Inference, Safety, VectorIO, etc.) - Begin by reviewing the [core concepts](../concepts/) of Llama Stack and choose the API your provider belongs to (Inference, Safety, VectorIO, etc.)
- Determine the provider type ({repopath}`Remote::llama_stack/providers/remote` or {repopath}`Inline::llama_stack/providers/inline`). Remote providers make requests to external services, while inline providers execute implementation locally. - Determine the provider type ([Remote](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/remote) or [Inline](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/inline)). Remote providers make requests to external services, while inline providers execute implementation locally.
- Add your provider to the appropriate {repopath}`Registry::llama_stack/providers/registry/`. Specify pip dependencies necessary. - Add your provider to the appropriate [Registry](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/registry/). Specify pip dependencies necessary.
- Update any distribution {repopath}`Templates::llama_stack/distributions/` `build.yaml` and `run.yaml` files if they should include your provider by default. Run {repopath}`./scripts/distro_codegen.py` if necessary. Note that `distro_codegen.py` will fail if the new provider causes any distribution template to attempt to import provider-specific dependencies. This usually means the distribution's `get_distribution_template()` code path should only import any necessary Config or model alias definitions from each provider and not the provider's actual implementation. - Update any distribution [Templates](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/distributions/) `build.yaml` and `run.yaml` files if they should include your provider by default. Run [./scripts/distro_codegen.py](https://github.com/meta-llama/llama-stack/blob/main/scripts/distro_codegen.py) if necessary. Note that `distro_codegen.py` will fail if the new provider causes any distribution template to attempt to import provider-specific dependencies. This usually means the distribution's `get_distribution_template()` code path should only import any necessary Config or model alias definitions from each provider and not the provider's actual implementation.
Here are some example PRs to help you get started: Here are some example PRs to help you get started:
@ -59,23 +67,23 @@ def get_base_url(self) -> str:
## Testing the Provider ## Testing the Provider
Before running tests, you must have required dependencies installed. This depends on the providers or distributions you are testing. For example, if you are testing the `together` distribution, you should install dependencies via `llama stack build --distro together`. Before running tests, you must have required dependencies installed. This depends on the providers or distributions you are testing. For example, if you are testing the `together` distribution, install its dependencies with `llama stack list-deps together | xargs -L1 uv pip install`.
### 1. Integration Testing ### 1. Integration Testing
Integration tests are located in {repopath}`tests/integration`. These tests use the python client-SDK APIs (from the `llama_stack_client` package) to test functionality. Since these tests use client APIs, they can be run either by pointing to an instance of the Llama Stack server or "inline" by using `LlamaStackAsLibraryClient`. Integration tests are located in [tests/integration](https://github.com/meta-llama/llama-stack/tree/main/tests/integration). These tests use the python client-SDK APIs (from the `llama_stack_client` package) to test functionality. Since these tests use client APIs, they can be run either by pointing to an instance of the Llama Stack server or "inline" by using `LlamaStackAsLibraryClient`.
Consult {repopath}`tests/integration/README.md` for more details on how to run the tests. Consult [tests/integration/README.md](https://github.com/meta-llama/llama-stack/blob/main/tests/integration/README.md) for more details on how to run the tests.
Note that each provider's `sample_run_config()` method (in the configuration class for that provider) Note that each provider's `sample_run_config()` method (in the configuration class for that provider)
typically references some environment variables for specifying API keys and the like. You can set these in the environment or pass these via the `--env` flag to the test command. typically references some environment variables for specifying API keys and the like. You can set these in the environment before running the test command.
### 2. Unit Testing ### 2. Unit Testing
Unit tests are located in {repopath}`tests/unit`. Provider-specific unit tests are located in {repopath}`tests/unit/providers`. These tests are all run automatically as part of the CI process. Unit tests are located in [tests/unit](https://github.com/meta-llama/llama-stack/tree/main/tests/unit). Provider-specific unit tests are located in [tests/unit/providers](https://github.com/meta-llama/llama-stack/tree/main/tests/unit/providers). These tests are all run automatically as part of the CI process.
Consult {repopath}`tests/unit/README.md` for more details on how to run the tests manually. Consult [tests/unit/README.md](https://github.com/meta-llama/llama-stack/blob/main/tests/unit/README.md) for more details on how to run the tests manually.
### 3. Additional end-to-end testing ### 3. Additional end-to-end testing
View file
@ -1,4 +1,12 @@
# Adding a New Vector Database ---
title: Adding a New Vector Database
description: Guide for adding new vector database providers to Llama Stack
sidebar_label: New Vector Database
sidebar_position: 3
---
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
This guide will walk you through the process of adding a new vector database to Llama Stack. This guide will walk you through the process of adding a new vector database to Llama Stack.
@ -31,7 +39,7 @@ filtering, sorting, and aggregating vectors.
- `YourVectorIOAdapter.query_chunks()` - `YourVectorIOAdapter.query_chunks()`
- `YourVectorIOAdapter.delete_chunks()` - `YourVectorIOAdapter.delete_chunks()`
3. **Add to Registry**: Register your provider in the appropriate registry file. 3. **Add to Registry**: Register your provider in the appropriate registry file.
- Update {repopath}`llama_stack/providers/registry/vector_io.py` to include your new provider. - Update [llama_stack/providers/registry/vector_io.py](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/registry/vector_io.py) to include your new provider.
```python ```python
from llama_stack.providers.registry.specs import InlineProviderSpec from llama_stack.providers.registry.specs import InlineProviderSpec
from llama_stack.providers.registry.api import Api from llama_stack.providers.registry.api import Api
@ -57,7 +65,7 @@ InlineProviderSpec(
5. Add your provider to the `vector_io_providers` fixture dictionary. 5. Add your provider to the `vector_io_providers` fixture dictionary.
- Please follow the naming convention of `your_vectorprovider_index` and `your_vectorprovider_adapter` as the tests require this to execute properly. - Please follow the naming convention of `your_vectorprovider_index` and `your_vectorprovider_adapter` as the tests require this to execute properly.
- Integration Tests - Integration Tests
- Integration tests are located in {repopath}`tests/integration`. These tests use the python client-SDK APIs (from the `llama_stack_client` package) to test functionality. - Integration tests are located in [tests/integration](https://github.com/meta-llama/llama-stack/tree/main/tests/integration). These tests use the python client-SDK APIs (from the `llama_stack_client` package) to test functionality.
- The two sets of integration tests are: - The two sets of integration tests are:
- `tests/integration/vector_io/test_vector_io.py`: This file tests registration, insertion, and retrieval. - `tests/integration/vector_io/test_vector_io.py`: This file tests registration, insertion, and retrieval.
- `tests/integration/vector_io/test_openai_vector_stores.py`: These tests are for OpenAI-compatible vector stores and test the OpenAI API compatibility. - `tests/integration/vector_io/test_openai_vector_stores.py`: These tests are for OpenAI-compatible vector stores and test the OpenAI API compatibility.
@ -71,5 +79,5 @@ InlineProviderSpec(
- If you are adding tests for the `remote` provider you will have to update the `test` group, which is used in the GitHub CI for integration tests. - If you are adding tests for the `remote` provider you will have to update the `test` group, which is used in the GitHub CI for integration tests.
- `uv add new_pip_package --group test` - `uv add new_pip_package --group test`
5. **Update Documentation**: Please update the documentation for end users 5. **Update Documentation**: Please update the documentation for end users
- Generate the provider documentation by running {repopath}`./scripts/provider_codegen.py`. - Generate the provider documentation by running [./scripts/provider_codegen.py](https://github.com/meta-llama/llama-stack/blob/main/scripts/provider_codegen.py).
- Update the autogenerated content in the registry/vector_io.py file with information about your provider. Please see other providers for examples. - Update the autogenerated content in the registry/vector_io.py file with information about your provider. Please see other providers for examples.
View file
@ -1,3 +1,13 @@
---
title: Record-Replay Testing System
description: Understanding how Llama Stack captures and replays API interactions for testing
sidebar_label: Record-Replay System
sidebar_position: 4
---
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Record-Replay System # Record-Replay System
Understanding how Llama Stack captures and replays API interactions for testing. Understanding how Llama Stack captures and replays API interactions for testing.
@ -58,7 +68,9 @@ recordings/
Direct API calls with no recording or replay: Direct API calls with no recording or replay:
```python ```python
with inference_recording(mode=InferenceMode.LIVE): from llama_stack.testing.api_recorder import api_recording, APIRecordingMode
with api_recording(mode=APIRecordingMode.LIVE):
response = await client.chat.completions.create(...) response = await client.chat.completions.create(...)
``` ```
@ -69,7 +81,7 @@ Use for initial development and debugging against real APIs.
Captures API interactions while passing through real responses: Captures API interactions while passing through real responses:
```python ```python
with inference_recording(mode=InferenceMode.RECORD, storage_dir="./recordings"): with api_recording(mode=APIRecordingMode.RECORD, storage_dir="./recordings"):
response = await client.chat.completions.create(...) response = await client.chat.completions.create(...)
# Real API call made, response captured AND returned # Real API call made, response captured AND returned
``` ```
@ -86,7 +98,7 @@ The recording process:
Returns stored responses instead of making API calls: Returns stored responses instead of making API calls:
```python ```python
with inference_recording(mode=InferenceMode.REPLAY, storage_dir="./recordings"): with api_recording(mode=APIRecordingMode.REPLAY, storage_dir="./recordings"):
response = await client.chat.completions.create(...) response = await client.chat.completions.create(...)
# No API call made, cached response returned instantly # No API call made, cached response returned instantly
``` ```
View file
@ -0,0 +1,30 @@
---
title: AWS EKS Deployment Guide
description: Deploy Llama Stack on AWS EKS
sidebar_label: AWS EKS Deployment
sidebar_position: 3
---
## AWS EKS Deployment
### Prerequisites
- Set up an [EKS cluster](https://docs.aws.amazon.com/eks/latest/userguide/getting-started.html)
- Create a [GitHub OAuth app](https://docs.github.com/en/apps/oauth-apps/building-oauth-apps/creating-an-oauth-app)
- Set authorization callback URL to `http://<your-llama-stack-ui-url>/api/auth/callback/`
### Automated Deployment
```bash
export HF_TOKEN=<your-huggingface-token>
export GITHUB_CLIENT_ID=<your-github-client-id>
export GITHUB_CLIENT_SECRET=<your-github-client-secret>
export LLAMA_STACK_UI_URL=<your-llama-stack-ui-url>
cd docs/source/distributions/eks
./apply.sh
```
This script will:
- Set up default storage class for AWS EKS
- Deploy Llama Stack server in Kubernetes pods and services
View file
@ -0,0 +1,14 @@
---
title: Deploying Llama Stack
description: Production deployment guides for Llama Stack in various environments
sidebar_label: Overview
sidebar_position: 1
---
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Deploying Llama Stack
[**→ Kubernetes Deployment Guide**](./kubernetes_deployment.mdx)
[**→ AWS EKS Deployment Guide**](./aws_eks_deployment.mdx)
View file
@ -0,0 +1,224 @@
---
title: Kubernetes Deployment Guide
description: Deploy Llama Stack on Kubernetes clusters with vLLM inference service
sidebar_label: Kubernetes
sidebar_position: 2
---
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Kubernetes Deployment Guide
Deploy Llama Stack and vLLM servers in a Kubernetes cluster instead of running them locally. This guide covers both local development with Kind and production deployment on AWS EKS.
## Prerequisites
### Local Kubernetes Setup
Create a local Kubernetes cluster via Kind:
```bash
kind create cluster --image kindest/node:v1.32.0 --name llama-stack-test
```
Set your Hugging Face token:
```bash
export HF_TOKEN=$(echo -n "your-hf-token" | base64)
```
## Quick Deployment
### Step 1: Create Storage and Secrets
```yaml
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: vllm-models
spec:
accessModes:
- ReadWriteOnce
volumeMode: Filesystem
resources:
requests:
storage: 50Gi
---
apiVersion: v1
kind: Secret
metadata:
name: hf-token-secret
type: Opaque
data:
token: $HF_TOKEN
EOF
```
### Step 2: Deploy vLLM Server
```yaml
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
name: vllm-server
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: vllm
template:
metadata:
labels:
app.kubernetes.io/name: vllm
spec:
containers:
- name: vllm
image: vllm/vllm-openai:latest
command: ["/bin/sh", "-c"]
args: ["vllm serve meta-llama/Llama-3.2-1B-Instruct"]
env:
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
ports:
- containerPort: 8000
volumeMounts:
- name: llama-storage
mountPath: /root/.cache/huggingface
volumes:
- name: llama-storage
persistentVolumeClaim:
claimName: vllm-models
---
apiVersion: v1
kind: Service
metadata:
name: vllm-server
spec:
selector:
app.kubernetes.io/name: vllm
ports:
- protocol: TCP
port: 8000
targetPort: 8000
type: ClusterIP
EOF
```
### Step 3: Configure Llama Stack
Update your run configuration:
```yaml
providers:
inference:
- provider_id: vllm
provider_type: remote::vllm
config:
url: http://vllm-server.default.svc.cluster.local:8000/v1
max_tokens: 4096
api_token: fake
```
Build container image:
```bash
tmp_dir=$(mktemp -d) && cat >$tmp_dir/Containerfile.llama-stack-run-k8s <<EOF
FROM distribution-myenv:dev
RUN apt-get update && apt-get install -y git
RUN git clone https://github.com/meta-llama/llama-stack.git /app/llama-stack-source
ADD ./vllm-llama-stack-run-k8s.yaml /app/config.yaml
EOF
podman build -f $tmp_dir/Containerfile.llama-stack-run-k8s -t llama-stack-run-k8s $tmp_dir
```
### Step 4: Deploy Llama Stack Server
```yaml
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: llama-pvc
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 1Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: llama-stack-server
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: llama-stack
template:
metadata:
labels:
app.kubernetes.io/name: llama-stack
spec:
containers:
- name: llama-stack
image: localhost/llama-stack-run-k8s:latest
imagePullPolicy: IfNotPresent
command: ["llama", "stack", "run", "/app/config.yaml"]
ports:
- containerPort: 5000
volumeMounts:
- name: llama-storage
mountPath: /root/.llama
volumes:
- name: llama-storage
persistentVolumeClaim:
claimName: llama-pvc
---
apiVersion: v1
kind: Service
metadata:
name: llama-stack-service
spec:
selector:
app.kubernetes.io/name: llama-stack
ports:
- protocol: TCP
port: 5000
targetPort: 5000
type: ClusterIP
EOF
```
### Step 5: Test Deployment
```bash
# Port forward and test
kubectl port-forward service/llama-stack-service 5000:5000
llama-stack-client --endpoint http://localhost:5000 inference chat-completion --message "hello, what model are you?"
```
## Troubleshooting
**Check pod status:**
```bash
kubectl get pods -l app.kubernetes.io/name=vllm
kubectl logs -l app.kubernetes.io/name=vllm
```
**Test service connectivity:**
```bash
kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- curl http://vllm-server:8000/v1/models
```
## Related Resources
- **[Deployment Overview](/docs/deploying/)** - Overview of deployment options
- **[Distributions](/docs/distributions)** - Understanding Llama Stack distributions
- **[Configuration](/docs/distributions/configuration)** - Detailed configuration options

View file

@ -0,0 +1,148 @@
---
title: Building Custom Distributions
description: Building a Llama Stack distribution from scratch
sidebar_label: Build your own Distribution
sidebar_position: 3
---
This guide walks you through inspecting existing distributions, customising their configuration, and building runnable artefacts for your own deployment.
### Explore existing distributions
All first-party distributions live under `llama_stack/distributions/`. Each directory contains:
- `build.yaml`: the distribution specification (providers, additional dependencies, optional external provider directories).
- `run.yaml`: sample run configuration (when provided).
- Documentation fragments that power this site.
Browse that folder to understand available providers and copy a distribution to use as a starting point. When creating a new stack, duplicate an existing directory, rename it, and adjust the `build.yaml` file to match your requirements.
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
<Tabs>
<TabItem value="container" label="Building a container">
Use the Containerfile at `containers/Containerfile`, which installs `llama-stack`, resolves distribution dependencies via `llama stack list-deps`, and sets the entrypoint to `llama stack run`.
```bash
docker build . \
-f containers/Containerfile \
--build-arg DISTRO_NAME=starter \
--tag llama-stack:starter
```
Handy build arguments:
- `DISTRO_NAME`: distribution directory name (defaults to `starter`).
- `RUN_CONFIG_PATH`: absolute path inside the build context for a run config that should be baked into the image (e.g. `/workspace/run.yaml`).
- `INSTALL_MODE=editable`: install the repository copied into `/workspace` with `uv pip install -e`. Pair it with `--build-arg LLAMA_STACK_DIR=/workspace`.
- `LLAMA_STACK_CLIENT_DIR`: optional editable install of the Python client.
- `PYPI_VERSION` / `TEST_PYPI_VERSION`: pin specific releases when not using editable installs.
- `KEEP_WORKSPACE=1`: retain `/workspace` in the final image if you need to access additional files (such as sample configs or provider bundles).
Make sure any custom `build.yaml`, run configs, or provider directories you reference are included in the Docker build context so the Containerfile can read them.
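For instance, several of these arguments can be combined to build from a local checkout with a baked-in run config (a sketch; the tag and in-context paths are illustrative):
```bash
docker build . \
  -f containers/Containerfile \
  --build-arg DISTRO_NAME=starter \
  --build-arg INSTALL_MODE=editable \
  --build-arg LLAMA_STACK_DIR=/workspace \
  --build-arg RUN_CONFIG_PATH=/workspace/run.yaml \
  --tag llama-stack:starter-dev
```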
</TabItem>
<TabItem value="external" label="Building with external providers">
External providers live outside the main repository but can be bundled by pointing `external_providers_dir` to a directory that contains your provider packages.
1. Copy providers into the build context, for example `cp -R path/to/providers providers.d`.
2. Update `build.yaml` with the directory and provider entries.
3. Adjust run configs to use the in-container path (usually `/.llama/providers.d`). Pass `--build-arg RUN_CONFIG_PATH=/workspace/run.yaml` if you want to bake the config.
Example `build.yaml` excerpt for a custom Ollama provider:
```yaml
distribution_spec:
providers:
inference:
- remote::custom_ollama
external_providers_dir: /workspace/providers.d
```
Inside `providers.d/custom_ollama/provider.py`, define `get_provider_spec()` so the CLI can discover dependencies:
```python
from llama_stack.providers.datatypes import ProviderSpec
def get_provider_spec() -> ProviderSpec:
return ProviderSpec(
provider_type="remote::custom_ollama",
module="llama_stack_ollama_provider",
config_class="llama_stack_ollama_provider.config.OllamaImplConfig",
pip_packages=[
"ollama",
"aiohttp",
"llama-stack-provider-ollama",
],
)
```
Here's an example for a custom Ollama provider:
```yaml
adapter:
adapter_type: custom_ollama
pip_packages:
- ollama
- aiohttp
- llama-stack-provider-ollama # This is the provider package
config_class: llama_stack_ollama_provider.config.OllamaImplConfig
module: llama_stack_ollama_provider
api_dependencies: []
optional_api_dependencies: []
```
The `pip_packages` section lists the Python packages required by the provider, as well as the
provider package itself. The package must be available on PyPI or can be provided from a local
directory or a git repository (git must be installed on the build environment).
For deeper guidance, see the [External Providers documentation](../providers/external/).
</TabItem>
</Tabs>
### Run your stack server
After building the image, launch it directly with Docker or Podman—the entrypoint calls `llama stack run` using the baked distribution or the bundled run config:
```bash
docker run -d \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
-e INFERENCE_MODEL=$INFERENCE_MODEL \
-e OLLAMA_URL=http://host.docker.internal:11434 \
llama-stack:starter \
--port $LLAMA_STACK_PORT
```
Here are the docker flags and their uses:
* `-d`: Runs the container in the detached mode as a background process
* `-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT`: Maps the container port to the host port for accessing the server
* `-v ~/.llama:/root/.llama`: Mounts the local .llama directory to persist configurations and data
* `llama-stack:starter`: The name and tag of the container image to run
* `-e INFERENCE_MODEL=$INFERENCE_MODEL`: Sets the INFERENCE_MODEL environment variable in the container
* `-e OLLAMA_URL=http://host.docker.internal:11434`: Sets the OLLAMA_URL environment variable in the container
* `--port $LLAMA_STACK_PORT`: Port number for the server to listen on
If you prepared a custom run config, mount it into the container and reference it explicitly:
```bash
docker run \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v $(pwd)/run.yaml:/app/run.yaml \
llama-stack:starter \
/app/run.yaml
```
View file
@ -1,3 +1,9 @@
---
title: Configuring a "Stack"
description: Configuring a "Stack"
sidebar_label: Configuring a "Stack"
sidebar_position: 6
---
# Configuring a "Stack" # Configuring a "Stack"
The Llama Stack runtime configuration is specified as a YAML file. Here is a simplified version of an example configuration file for the Ollama distribution: The Llama Stack runtime configuration is specified as a YAML file. Here is a simplified version of an example configuration file for the Ollama distribution:
@ -15,7 +21,6 @@ apis:
- inference - inference
- vector_io - vector_io
- safety - safety
- telemetry
providers: providers:
inference: inference:
- provider_id: ollama - provider_id: ollama
@ -38,18 +43,28 @@ providers:
- provider_id: meta-reference - provider_id: meta-reference
provider_type: inline::meta-reference provider_type: inline::meta-reference
config: config:
persistence_store: persistence:
type: sqlite agent_state:
namespace: null backend: kv_default
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/agents_store.db namespace: agents
telemetry: responses:
- provider_id: meta-reference backend: sql_default
provider_type: inline::meta-reference table_name: responses
config: {} storage:
metadata_store: backends:
namespace: null kv_default:
type: sqlite type: kv_sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/registry.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/kvstore.db
sql_default:
type: sql_sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/sqlstore.db
references:
metadata:
backend: kv_default
namespace: registry
inference:
backend: sql_default
table_name: inference_store
models: models:
- metadata: {} - metadata: {}
model_id: ${env.INFERENCE_MODEL} model_id: ${env.INFERENCE_MODEL}
@ -72,7 +87,6 @@ apis:
- inference - inference
- vector_io - vector_io
- safety - safety
- telemetry
``` ```
## Providers ## Providers
@ -95,7 +109,7 @@ A few things to note:
- The id is a string you can choose freely. - The id is a string you can choose freely.
- You can instantiate any number of provider instances of the same type. - You can instantiate any number of provider instances of the same type.
- The configuration dictionary is provider-specific. - The configuration dictionary is provider-specific.
- Notice that configuration can reference environment variables (with default values), which are expanded at runtime. When you run a stack server (via docker or via `llama stack run`), you can specify `--env OLLAMA_URL=http://my-server:11434` to override the default value. - Notice that configuration can reference environment variables (with default values), which are expanded at runtime. When you run a stack server, you can set environment variables in your shell before running `llama stack run` to override the default values.
### Environment Variable Substitution ### Environment Variable Substitution
@ -167,13 +181,10 @@ optional_token: ${env.OPTIONAL_TOKEN:+}
#### Runtime Override #### Runtime Override
You can override environment variables at runtime when starting the server: You can override environment variables at runtime by setting them in your shell before starting the server:
```bash ```bash
# Override specific environment variables # Set environment variables in your shell
llama stack run --config run.yaml --env API_KEY=sk-123 --env BASE_URL=https://custom-api.com
# Or set them in your shell
export API_KEY=sk-123 export API_KEY=sk-123
export BASE_URL=https://custom-api.com export BASE_URL=https://custom-api.com
llama stack run --config run.yaml llama stack run --config run.yaml
@ -200,7 +211,7 @@ models:
provider_model_id: null provider_model_id: null
model_type: llm model_type: llm
``` ```
A Model is an instance of a "Resource" (see [Concepts](../concepts/index)) and is associated with a specific inference provider (in this case, the provider with identifier `ollama`). This is an instance of a "pre-registered" model. While we always encourage clients to register models before using them, some Stack servers may come up with a list of "already known and available" models. A Model is an instance of a "Resource" (see [Concepts](../concepts/)) and is associated with a specific inference provider (in this case, the provider with identifier `ollama`). This is an instance of a "pre-registered" model. While we always encourage clients to register models before using them, some Stack servers may come up with a list of "already known and available" models.
What's with the `provider_model_id` field? This is an identifier for the model inside the provider's model catalog. Contrast it with `model_id` which is the identifier for the same model for Llama Stack's purposes. For example, you may want to name "llama3.2:vision-11b" as "image_captioning_model" when you use it in your Stack interactions. When omitted, the server will set `provider_model_id` to be the same as `model_id`. What's with the `provider_model_id` field? This is an identifier for the model inside the provider's model catalog. Contrast it with `model_id` which is the identifier for the same model for Llama Stack's purposes. For example, you may want to name "llama3.2:vision-11b" as "image_captioning_model" when you use it in your Stack interactions. When omitted, the server will set `provider_model_id` to be the same as `model_id`.
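To make the distinction concrete, a run.yaml entry for the example above might read (a sketch; the alias and provider id are illustrative):
```yaml
models:
- metadata: {}
  model_id: image_captioning_model        # the name you use in Stack API calls
  provider_id: ollama
  provider_model_id: llama3.2:vision-11b  # the name in the provider's catalog
  model_type: llm
```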
@ -472,12 +483,12 @@ A rule may also specify a condition, either a 'when' or an 'unless',
with additional constraints as to where the rule applies. The with additional constraints as to where the rule applies. The
constraints supported at present are: constraints supported at present are:
- 'user with <attr-value> in <attr-name>' - 'user with `<attr-value>` in `<attr-name>`'
- 'user with <attr-value> not in <attr-name>' - 'user with `<attr-value>` not in `<attr-name>`'
- 'user is owner' - 'user is owner'
- 'user is not owner' - 'user is not owner'
- 'user in owners <attr-name>' - 'user in owners `<attr-name>`'
- 'user not in owners <attr-name>' - 'user not in owners `<attr-name>`'
The attributes defined for a user will depend on how the auth The attributes defined for a user will depend on how the auth
configuration is defined. configuration is defined.
@ -572,24 +583,13 @@ created by users sharing a team with them:
In addition to resource-based access control, Llama Stack supports endpoint-level authorization using OAuth 2.0 style scopes. When authentication is enabled, specific API endpoints require users to have particular scopes in their authentication token. In addition to resource-based access control, Llama Stack supports endpoint-level authorization using OAuth 2.0 style scopes. When authentication is enabled, specific API endpoints require users to have particular scopes in their authentication token.
**Scope-Gated APIs:**
The following APIs are currently gated by scopes:
- **Telemetry API** (scope: `telemetry.read`):
- `POST /telemetry/traces` - Query traces
- `GET /telemetry/traces/{trace_id}` - Get trace by ID
- `GET /telemetry/traces/{trace_id}/spans/{span_id}` - Get span by ID
- `POST /telemetry/spans/{span_id}/tree` - Get span tree
- `POST /telemetry/spans` - Query spans
- `POST /telemetry/metrics/{metric_name}` - Query metrics
**Authentication Configuration:** **Authentication Configuration:**
For **JWT/OAuth2 providers**, scopes should be included in the JWT's claims: For **JWT/OAuth2 providers**, scopes should be included in the JWT's claims:
```json ```json
{ {
"sub": "user123", "sub": "user123",
"scope": "telemetry.read", "scope": "<scope>",
"aud": "llama-stack" "aud": "llama-stack"
} }
``` ```
@ -599,7 +599,7 @@ For **custom authentication providers**, the endpoint must return user attribute
{ {
"principal": "user123", "principal": "user123",
"attributes": { "attributes": {
"scopes": ["telemetry.read"] "scopes": ["<scope>"]
} }
} }
``` ```
View file
@ -1,3 +1,9 @@
---
title: Customizing run.yaml
description: Customizing run.yaml files for Llama Stack templates
sidebar_label: Customizing run.yaml
sidebar_position: 4
---
# Customizing run.yaml Files # Customizing run.yaml Files
The `run.yaml` files generated by Llama Stack templates are **starting points** designed to be customized for your specific needs. They are not meant to be used as-is in production environments. The `run.yaml` files generated by Llama Stack templates are **starting points** designed to be customized for your specific needs. They are not meant to be used as-is in production environments.
View file
@ -1,3 +1,9 @@
---
title: Using Llama Stack as a Library
description: How to use Llama Stack as a Python library instead of running a server
sidebar_label: Importing as Library
sidebar_position: 5
---
# Using Llama Stack as a Library # Using Llama Stack as a Library
## Setup Llama Stack without a Server ## Setup Llama Stack without a Server
@ -6,7 +12,7 @@ This avoids the overhead of setting up a server.
```bash ```bash
# setup # setup
uv pip install llama-stack uv pip install llama-stack
llama stack build --distro starter --image-type venv llama stack list-deps starter | xargs -L1 uv pip install
``` ```
```python ```python
@ -27,7 +33,7 @@ Then, you can access the APIs like `models` and `inference` on the client and ca
response = client.models.list() response = client.models.list()
``` ```
If you've created a [custom distribution](building_distro.md), you can also use the run.yaml configuration file directly: If you've created a [custom distribution](./building_distro), you can also use the run.yaml configuration file directly:
```python ```python
client = LlamaStackAsLibraryClient(config_path) client = LlamaStackAsLibraryClient(config_path)
View file
@ -0,0 +1,21 @@
---
title: Distributions Overview
description: Pre-packaged sets of Llama Stack components for different deployment scenarios
sidebar_label: Overview
sidebar_position: 1
---
# Distributions Overview
A distribution is a pre-packaged set of Llama Stack components that can be deployed together.
This section provides an overview of the distributions available in Llama Stack.
## Distribution Guides
- **[Available Distributions](./list_of_distributions.mdx)** - Complete list and comparison of all distributions
- **[Building Custom Distributions](./building_distro.mdx)** - Create your own distribution from scratch
- **[Customizing Configuration](./customizing_run_yaml.mdx)** - Customize run.yaml for your needs
- **[Starting Llama Stack Server](./starting_llama_stack_server.mdx)** - How to run distributions
- **[Importing as Library](./importing_as_library.mdx)** - Use distributions in your code
- **[Configuration Reference](./configuration.mdx)** - Configuration file format details
View file
@ -0,0 +1,155 @@
apiVersion: v1
data:
stack_run_config.yaml: |
version: '2'
image_name: kubernetes-demo
apis:
- agents
- inference
- files
- safety
- telemetry
- tool_runtime
- vector_io
providers:
inference:
- provider_id: vllm-inference
provider_type: remote::vllm
config:
url: ${env.VLLM_URL:=http://localhost:8000/v1}
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: vllm-safety
provider_type: remote::vllm
config:
url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: sentence-transformers
provider_type: inline::sentence-transformers
config: {}
vector_io:
- provider_id: ${env.ENABLE_CHROMADB:+chromadb}
provider_type: remote::chromadb
config:
url: ${env.CHROMADB_URL:=}
kvstore:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
files:
- provider_id: meta-reference-files
provider_type: inline::localfs
config:
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
metadata_store:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config:
excluded_categories: []
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
responses_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
sinks: ${env.TELEMETRY_SINKS:=console}
tool_runtime:
- provider_id: brave-search
provider_type: remote::brave-search
config:
api_key: ${env.BRAVE_SEARCH_API_KEY:+}
max_results: 3
- provider_id: tavily-search
provider_type: remote::tavily-search
config:
api_key: ${env.TAVILY_SEARCH_API_KEY:+}
max_results: 3
- provider_id: rag-runtime
provider_type: inline::rag-runtime
config: {}
- provider_id: model-context-protocol
provider_type: remote::model-context-protocol
config: {}
storage:
backends:
kv_default:
type: kv_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
sql_default:
type: sql_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
references:
metadata:
backend: kv_default
namespace: registry
inference:
backend: sql_default
table_name: inference_store
models:
- metadata:
embedding_dimension: 768
model_id: nomic-embed-text-v1.5
provider_id: sentence-transformers
model_type: embedding
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
model_type: llm
- metadata: {}
model_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
provider_id: vllm-safety
model_type: llm
shields:
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
- toolgroup_id: builtin::rag
provider_id: rag-runtime
server:
port: 8321
auth:
provider_config:
type: github_token
kind: ConfigMap
metadata:
name: llama-stack-config
Some files were not shown because too many files have changed in this diff