Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-03 19:57:35 +00:00)

Commit ac1af2d96a: Merge branch 'llamastack:main' into crewai
581 changed files with 148868 additions and 11347 deletions
.github/ISSUE_TEMPLATE/config.yml (vendored): 4 changes

@@ -2,10 +2,10 @@ blank_issues_enabled: false
contact_links:
  - name: Have you read the docs?
    url: https://llama-stack.readthedocs.io/en/latest/index.html
    url: https://llamastack.github.io/latest/providers/external/index.html
    about: Much help can be found in the docs
  - name: Start a discussion
    url: https://github.com/meta-llama/llama-stack/discussions/new
    url: https://github.com/llamastack/llama-stack/discussions/new/
    about: Start a discussion on a topic
  - name: Chat on Discord
    url: https://discord.gg/llama-stack
.github/workflows/README.md (vendored): 1 change

@@ -21,4 +21,3 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a tabl
| Test External API and Providers | [test-external.yml](test-external.yml) | Test the External API and Provider mechanisms |
| UI Tests | [ui-unit-tests.yml](ui-unit-tests.yml) | Run the UI test suite |
| Unit Tests | [unit-tests.yml](unit-tests.yml) | Run the unit test suite |
| Update ReadTheDocs | [update-readthedocs.yml](update-readthedocs.yml) | Update the Llama Stack ReadTheDocs site |
.github/workflows/conformance.yml (vendored): 26 changes

@@ -13,11 +13,8 @@ on:
    branches: [ main ]
    types: [opened, synchronize, reopened]
    paths:
      - 'llama_stack/**'
      - '!llama_stack/ui/**'
      - 'tests/**'
      - 'uv.lock'
      - 'pyproject.toml'
      - 'docs/static/llama-stack-spec.yaml'
      - 'docs/static/llama-stack-spec.html'
      - '.github/workflows/conformance.yml' # This workflow itself

concurrency:

@@ -43,15 +40,32 @@
          ref: ${{ github.event.pull_request.base.ref }}
          path: 'base'

      # Cache oasdiff to avoid checksum failures and speed up builds
      - name: Cache oasdiff
        id: cache-oasdiff
        uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830
        with:
          path: ~/oasdiff
          key: oasdiff-${{ runner.os }}

      # Install oasdiff: https://github.com/oasdiff/oasdiff, a tool for detecting breaking changes in OpenAPI specs.
      - name: Install oasdiff
        if: steps.cache-oasdiff.outputs.cache-hit != 'true'
        run: |
          curl -fsSL https://raw.githubusercontent.com/oasdiff/oasdiff/main/install.sh | sh
          cp /usr/local/bin/oasdiff ~/oasdiff

      # Setup cached oasdiff
      - name: Setup cached oasdiff
        if: steps.cache-oasdiff.outputs.cache-hit == 'true'
        run: |
          sudo cp ~/oasdiff /usr/local/bin/oasdiff
          sudo chmod +x /usr/local/bin/oasdiff

      # Run oasdiff to detect breaking changes in the API specification
      # This step will fail if incompatible changes are detected, preventing breaking changes from being merged
      - name: Run OpenAPI Breaking Change Diff
        run: |
          oasdiff breaking --fail-on ERR base/docs/_static/llama-stack-spec.yaml docs/_static/llama-stack-spec.yaml --match-path '^/v1/openai/v1' \
          oasdiff breaking --fail-on ERR base/docs/static/llama-stack-spec.yaml docs/static/llama-stack-spec.yaml --match-path '^/v1/openai/v1' \
            --match-path '^/v1/vector-io' \
            --match-path '^/v1/vector-dbs'
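The breaking-change check in this workflow can also be reproduced locally before opening a PR. A minimal sketch, assuming oasdiff is installed with the same install script the workflow uses and that a checkout of the base branch lives in `../llama-stack-base` (that path is only an example):

```bash
# Install oasdiff (same installer the workflow uses).
curl -fsSL https://raw.githubusercontent.com/oasdiff/oasdiff/main/install.sh | sh

# Compare the base branch spec against the working copy; --fail-on ERR makes the
# command exit non-zero when an incompatible change is detected.
oasdiff breaking --fail-on ERR \
  ../llama-stack-base/docs/static/llama-stack-spec.yaml \
  docs/static/llama-stack-spec.yaml \
  --match-path '^/v1/openai/v1'
```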
.github/workflows/pre-commit.yml (vendored): 24 changes

@@ -47,11 +47,21 @@ jobs:
        run: npm ci
        working-directory: llama_stack/ui

      - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
      - name: Run pre-commit
        id: precommit
        uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
        continue-on-error: true
        env:
          SKIP: no-commit-to-branch
          RUFF_OUTPUT_FORMAT: github

      - name: Check pre-commit results
        if: steps.precommit.outcome == 'failure'
        run: |
          echo "::error::Pre-commit hooks failed. Please run 'pre-commit run --all-files' locally and commit the fixes."
          echo "::warning::Some pre-commit hooks failed. Check the output above for details."
          exit 1

      - name: Debug
        run: |
          echo "github.ref: ${{ github.ref }}"

@@ -79,17 +89,23 @@
            echo "No changes to commit"
          fi

      - name: Verify if there are any diff files after pre-commit
      - name: Verify no uncommitted changes
        if: github.actor != 'dependabot[bot]'
        run: |
          git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1)
          if ! git diff --exit-code; then
            echo "::error::There are uncommitted changes after pre-commit. Please run 'pre-commit run --all-files' locally and commit the fixes."
            echo "::warning::Files with changes:"
            git diff --name-status
            exit 1
          fi

      - name: Verify if there are any new files after pre-commit
        if: github.actor != 'dependabot[bot]'
        run: |
          unstaged_files=$(git ls-files --others --exclude-standard)
          if [ -n "$unstaged_files" ]; then
            echo "There are uncommitted new files, run pre-commit locally and commit again"
            echo "::error::There are new untracked files after pre-commit. Please run 'pre-commit run --all-files' locally and commit the fixes."
            echo "::warning::New files:"
            echo "$unstaged_files"
            exit 1
          fi
.github/workflows/python-build-test.yml (vendored): 2 changes

@@ -24,7 +24,7 @@ jobs:
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

      - name: Install uv
        uses: astral-sh/setup-uv@557e51de59eb14aaaba2ed9621916900a91d50c6 # v6.6.1
        uses: astral-sh/setup-uv@b75a909f75acd358c2196fb9a5f1299a9a8868a4 # v6.7.0
        with:
          python-version: ${{ matrix.python-version }}
          activate-environment: true
.github/workflows/update-readthedocs.yml (vendored): 70 changes (file deleted)

@@ -1,70 +0,0 @@
name: Update ReadTheDocs

run-name: Update the Llama Stack ReadTheDocs site

on:
  workflow_dispatch:
    inputs:
      branch:
        description: 'RTD version to update'
        required: false
        default: 'latest'
  push:
    branches:
      - main
    paths:
      - 'docs/**'
      - 'pyproject.toml'
      - '.github/workflows/update-readthedocs.yml'
    tags:
      - '*'
  pull_request:
    branches:
      - main
    paths:
      - 'docs/**'
      - 'pyproject.toml'
      - '.github/workflows/update-readthedocs.yml'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
  cancel-in-progress: true

jobs:
  update-readthedocs:
    runs-on: ubuntu-latest
    env:
      TOKEN: ${{ secrets.READTHEDOCS_TOKEN }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

      - name: Install dependencies
        uses: ./.github/actions/setup-runner

      - name: Build HTML
        run: |
          cd docs
          uv run make html

      - name: Trigger ReadTheDocs build
        if: github.event_name != 'pull_request'
        run: |
          if [ -z "$TOKEN" ]; then
            echo "READTHEDOCS_TOKEN is not set"
            exit 1
          fi

          response=$(curl -X POST \
            -H "Content-Type: application/json" \
            -d "{
              \"token\": \"$TOKEN\",
              \"version\": \"$GITHUB_REF_NAME\"
            }" \
            https://readthedocs.org/api/v2/webhook/llama-stack/289768/)

          echo "Response: $response"
          if [ $(echo $response | jq -r '.build_triggered') != 'true' ]; then
            echo "Failed to trigger ReadTheDocs build"
            exit 1
          fi
.gitignore (vendored): 3 changes

@@ -18,7 +18,6 @@ Package.resolved
.venv/
.vscode
_build
docs/src
# Sample tool-calling datasets generated by NVIDIA notebooks
docs/notebooks/nvidia/tool_calling/sample_data/
pyrightconfig.json

@@ -30,3 +29,5 @@ AGENTS.md
server.log
CLAUDE.md
.claude/
docs/.docusaurus/
docs/node_modules/
.readthedocs.yaml (file deleted)

@@ -1,25 +0,0 @@
# .readthedocs.yaml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required
version: 2

# Build documentation in the "docs/" directory with Sphinx
sphinx:
  configuration: docs/source/conf.py

# Set the OS, Python version and other tools you might need
build:
  os: ubuntu-22.04
  tools:
    python: "3.12"
  jobs:
    pre_create_environment:
      - asdf plugin add uv
      - asdf install uv latest
      - asdf global uv latest
    create_environment:
      - uv venv "${READTHEDOCS_VIRTUALENV_PATH}"
    install:
      - UV_PROJECT_ENVIRONMENT="${READTHEDOCS_VIRTUALENV_PATH}" uv sync --frozen --group docs
CONTRIBUTING.md

@@ -187,14 +187,16 @@ Note that the provider "description" field will be used to generate the provider

### Building the Documentation

If you are making changes to the documentation at [https://llama-stack.readthedocs.io/en/latest/](https://llama-stack.readthedocs.io/en/latest/), you can use the following command to build the documentation and preview your changes. You will need [Sphinx](https://www.sphinx-doc.org/en/master/) and the readthedocs theme.
If you are making changes to the documentation at [https://llamastack.github.io/](https://llamastack.github.io/), you can use the following command to build the documentation and preview your changes.

```bash
# This rebuilds the documentation pages.
uv run --group docs make -C docs/ html
# This rebuilds the documentation pages and the OpenAPI spec.
npm install
npm run gen-api-docs all
npm run build

# This will start a local server (usually at http://127.0.0.1:8000) that automatically rebuilds and refreshes when you make changes to the documentation.
uv run --group docs sphinx-autobuild docs/source docs/build/html --write-all
# This will start a local server (usually at http://127.0.0.1:3000).
npm run serve
```

### Update API Documentation

@@ -205,4 +207,4 @@ If you modify or add new API endpoints, update the API documentation accordingly
uv run ./docs/openapi_generator/run_openapi_generator.sh
```

The generated API documentation will be available in `docs/_static/`. Make sure to review the changes before committing.
The generated API schema will be available in `docs/static/`. Make sure to review the changes before committing.
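A quick way to do that review is an ordinary git diff over the generated files; a minimal sketch (plain git commands, nothing specific to the generator is assumed):

```bash
# Summarize which generated files changed.
git diff --stat docs/static/

# Inspect the regenerated spec in detail before committing.
git diff docs/static/llama-stack-spec.yaml
```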
|
@ -4,6 +4,8 @@ include llama_stack/models/llama/llama4/tokenizer.model
|
|||
include llama_stack/core/*.sh
|
||||
include llama_stack/cli/scripts/*.sh
|
||||
include llama_stack/distributions/*/*.yaml
|
||||
include llama_stack/providers/tests/test_cases/inference/*.json
|
||||
exclude llama_stack/distributions/ci-tests
|
||||
include tests/integration/test_cases/inference/*.json
|
||||
include llama_stack/models/llama/*/*.md
|
||||
include llama_stack/tests/integration/*.jpg
|
||||
prune llama_stack/distributions/ci-tests
|
||||
|
|
README.md: 22 changes

@@ -7,7 +7,7 @@
[](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml?query=branch%3Amain)
[](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml?query=branch%3Amain)

[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)
[**Quick Start**](https://llamastack.github.io/latest/getting_started/index.html) | [**Documentation**](https://llamastack.github.io/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)

### ✨🎉 Llama 4 Support 🎉✨

@@ -109,7 +109,7 @@ By reducing friction and complexity, Llama Stack empowers developers to focus on

### API Providers
Here is a list of the various API providers and available distributions that can help developers get started easily with Llama Stack.
Please check out the [full list](https://llama-stack.readthedocs.io/en/latest/providers/index.html)
Please check out the [full list](https://llamastack.github.io/latest/providers/index.html)

| API Provider Builder | Environments | Agents | Inference | VectorIO | Safety | Telemetry | Post Training | Eval | DatasetIO |
|:--------------------:|:------------:|:------:|:---------:|:--------:|:------:|:---------:|:-------------:|:----:|:--------:|

@@ -140,7 +140,7 @@ Please checkout for [full list](https://llama-stack.readthedocs.io/en/latest/pro
| NVIDIA NEMO | Hosted | | ✅ | ✅ | | | ✅ | ✅ | ✅ |
| NVIDIA | Hosted | | | | | | ✅ | ✅ | ✅ |

> **Note**: Additional providers are available through external packages. See [External Providers](https://llama-stack.readthedocs.io/en/latest/providers/external.html) documentation.
> **Note**: Additional providers are available through external packages. See [External Providers](https://llamastack.github.io/latest/providers/external/index.html) documentation.

### Distributions

@@ -149,24 +149,24 @@ Here are some of the distributions we support:

| **Distribution** | **Llama Stack Docker** | Start This Distribution |
|:---------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------------------------------------:|
| Starter Distribution | [llamastack/distribution-starter](https://hub.docker.com/repository/docker/llamastack/distribution-starter/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/starter.html) |
| Meta Reference | [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-gpu.html) |
| Starter Distribution | [llamastack/distribution-starter](https://hub.docker.com/repository/docker/llamastack/distribution-starter/general) | [Guide](https://llamastack.github.io/latest/distributions/self_hosted_distro/starter.html) |
| Meta Reference | [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general) | [Guide](https://llamastack.github.io/latest/distributions/self_hosted_distro/meta-reference-gpu.html) |
| PostgreSQL | [llamastack/distribution-postgres-demo](https://hub.docker.com/repository/docker/llamastack/distribution-postgres-demo/general) | |

### Documentation

Please check out our [Documentation](https://llama-stack.readthedocs.io/en/latest/index.html) page for more details.
Please check out our [Documentation](https://llamastack.github.io/latest/index.html) page for more details.

* CLI references
  * [llama (server-side) CLI Reference](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/index.html): Guide for using the `llama` CLI to work with Llama models (download, study prompts), and building/starting a Llama Stack distribution.
  * [llama (client-side) CLI Reference](https://llama-stack.readthedocs.io/en/latest/references/llama_stack_client_cli_reference.html): Guide for using the `llama-stack-client` CLI, which allows you to query information about the distribution.
  * [llama (server-side) CLI Reference](https://llamastack.github.io/latest/references/llama_cli_reference/index.html): Guide for using the `llama` CLI to work with Llama models (download, study prompts), and building/starting a Llama Stack distribution.
  * [llama (client-side) CLI Reference](https://llamastack.github.io/latest/references/llama_stack_client_cli_reference.html): Guide for using the `llama-stack-client` CLI, which allows you to query information about the distribution.
* Getting Started
  * [Quick guide to start a Llama Stack server](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html).
  * [Quick guide to start a Llama Stack server](https://llamastack.github.io/latest/getting_started/index.html).
  * [Jupyter notebook](./docs/getting_started.ipynb) to walk through how to use simple text and vision inference llama_stack_client APIs
  * The complete Llama Stack lesson [Colab notebook](https://colab.research.google.com/drive/1dtVmxotBsI4cGZQNsJRYPrLiDeT0Wnwt) of the new [Llama 3.2 course on Deeplearning.ai](https://learn.deeplearning.ai/courses/introducing-multimodal-llama-3-2/lesson/8/llama-stack).
  * A [Zero-to-Hero Guide](https://github.com/meta-llama/llama-stack/tree/main/docs/zero_to_hero_guide) that guides you through all the key components of Llama Stack with code samples.
* [Contributing](CONTRIBUTING.md)
  * [Adding a new API Provider](https://llama-stack.readthedocs.io/en/latest/contributing/new_api_provider.html) to walk through how to add a new API provider.
  * [Adding a new API Provider](https://llamastack.github.io/latest/contributing/new_api_provider.html) to walk through how to add a new API provider.

### Llama Stack Client SDKs

@@ -193,4 +193,4 @@ Thanks to all of our amazing contributors!

<a href="https://github.com/meta-llama/llama-stack/graphs/contributors">
  <img src="https://contrib.rocks/image?repo=meta-llama/llama-stack" />
</a>
</a>
benchmarking/k8s-benchmark/README.md

@@ -26,6 +26,7 @@ The benchmark suite measures critical performance indicators:
- **Throughput**: Requests per second under sustained load
- **Latency Distribution**: P50, P95, P99 response times
- **Time to First Token (TTFT)**: Critical for streaming applications
- **Inter-Token Latency (ITL)**: Token generation speed for streaming
- **Error Rates**: Request failures and timeout analysis

This data enables data-driven architectural decisions and performance optimization efforts.

@@ -49,49 +50,148 @@ kubectl get pods
# Should see: llama-stack-benchmark-server, vllm-server, etc.
```

## Benchmark Results

We use [GuideLLM](https://github.com/neuralmagic/guidellm) against our k8s deployment for comprehensive performance testing.

### Performance - 1 vLLM Replica

We vary the number of Llama Stack replicas with 1 vLLM replica and compare performance below.



For full results see the `benchmarking/k8s-benchmark/results/` directory.

## Quick Start

### Basic Benchmarks
Follow the instructions below to run benchmarks similar to the ones above.

**Benchmark Llama Stack (default):**
### Comprehensive Benchmark Suite

**Run all benchmarks with different cluster configurations:**
```bash
./run-benchmark.sh
./scripts/run-all-benchmarks.sh
```

**Benchmark vLLM direct:**
This script will automatically:
- Scale deployments to different configurations
- Run benchmarks for each setup
- Generate output files with meaningful names that include setup information

### Individual Benchmarks

**Benchmark Llama Stack (runs against current cluster setup):**
```bash
./run-benchmark.sh --target vllm
./scripts/run-guidellm-benchmark.sh --target stack
```

### Custom Configuration

**Extended benchmark with high concurrency:**
**Benchmark vLLM direct (runs against current cluster setup):**
```bash
./run-benchmark.sh --target vllm --duration 120 --concurrent 20
./scripts/run-guidellm-benchmark.sh --target vllm
```

**Short test run:**
**Benchmark with custom parameters:**
```bash
./run-benchmark.sh --target stack --duration 30 --concurrent 5
./scripts/run-guidellm-benchmark.sh --target stack --max-seconds 120 --prompt-tokens 1024 --output-tokens 512
```

**Benchmark with custom output file:**
```bash
./scripts/run-guidellm-benchmark.sh --target stack --output-file results/my-custom-benchmark.txt
```

### Generating Charts

Once the benchmarks are run, you can generate performance charts from benchmark results:

```bash
uv run ./scripts/generate_charts.py
```

This loads runs in the `results/` directory and creates visualizations comparing different configurations and replica counts.

## Benchmark Workflow

The benchmark suite is organized into two main scripts with distinct responsibilities:

### 1. `run-all-benchmarks.sh` - Orchestration & Scaling
- **Purpose**: Manages different cluster configurations and orchestrates benchmark runs
- **Responsibilities**:
  - Scales Kubernetes deployments (vLLM replicas, Stack replicas, worker counts)
  - Runs benchmarks for each configuration
  - Generates meaningful output filenames with setup information
- **Use case**: Running comprehensive performance testing across multiple configurations

### 2. `run-guidellm-benchmark.sh` - Single Benchmark Execution
- **Purpose**: Executes a single benchmark against the current cluster state
- **Responsibilities**:
  - Runs GuideLLM benchmark with configurable parameters
  - Accepts custom output file paths
  - No cluster scaling - benchmarks current deployment state
- **Use case**: Testing specific configurations or custom scenarios

### Typical Workflow
1. **Comprehensive Testing**: Use `run-all-benchmarks.sh` to automatically test multiple configurations (see the orchestration sketch after this list)
2. **Custom Testing**: Use `run-guidellm-benchmark.sh` for specific parameter testing or manual cluster configurations
3. **Analysis**: Use `generate_charts.py` to visualize results from either approach
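To make the division of labor concrete, here is a rough sketch of the orchestration loop. This is an illustration of the idea, not the actual contents of `run-all-benchmarks.sh`; the `kubectl scale` calls assume the default deployment names listed in the command reference below, and the output-file naming simply mirrors the convention described there.

```bash
#!/usr/bin/env bash
# Sketch: scale the cluster for each configuration, then run one benchmark per setup.
set -euo pipefail

configs=(
  "stack 1 1 1"
  "stack 1 1 2"
  "vllm  1 1 -"
)

for cfg in "${configs[@]}"; do
  read -r target stack_replicas vllm_replicas stack_workers <<<"$cfg"

  # Scale the deployments to the desired replica counts (names are the defaults
  # from the command reference; adjust for your cluster).
  kubectl scale deployment/vllm-server --replicas="$vllm_replicas"
  kubectl scale deployment/llama-stack-benchmark-server --replicas="$stack_replicas"

  # Run a single benchmark against the current cluster state and name the output
  # after the configuration.
  ./scripts/run-guidellm-benchmark.sh --target "$target" \
    --output-file "results/guidellm-benchmark-${target}-s${stack_replicas}-sw${stack_workers}-v${vllm_replicas}-$(date +%s).txt"
done
```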
## Command Reference

### run-benchmark.sh Options
### run-all-benchmarks.sh

Orchestrates multiple benchmark runs with different cluster configurations. This script:
- Automatically scales deployments before each benchmark
- Runs benchmarks against the configured cluster setup
- Generates meaningfully named output files

```bash
./run-benchmark.sh [options]
./scripts/run-all-benchmarks.sh
```

**Configuration**: Edit the `configs` array in the script to customize benchmark configurations:
```bash
# Each line: (target, stack_replicas, vllm_replicas, stack_workers)
configs=(
  "stack 1 1 1"
  "stack 1 1 2"
  "stack 1 1 4"
  "vllm 1 1 -"
)
```

**Output files**: Generated with setup information in filename:
- Stack: `guidellm-benchmark-stack-s{replicas}-sw{workers}-v{vllm_replicas}-{timestamp}.txt`
- vLLM: `guidellm-benchmark-vllm-v{vllm_replicas}-{timestamp}.txt`

### run-guidellm-benchmark.sh Options

Runs a single benchmark against the current cluster setup (no scaling).

```bash
./scripts/run-guidellm-benchmark.sh [options]

Options:
  -t, --target <stack|vllm>     Target to benchmark (default: stack)
  -d, --duration <seconds>      Duration in seconds (default: 60)
  -c, --concurrent <users>      Number of concurrent users (default: 10)
  -s, --max-seconds <seconds>   Maximum duration in seconds (default: 60)
  -p, --prompt-tokens <tokens>  Number of prompt tokens (default: 512)
  -o, --output-tokens <tokens>  Number of output tokens (default: 256)
  -r, --rate-type <type>        Rate type (default: concurrent)
  -c, --rate                    Rate (default: 1,2,4,8,16,32,64,128)
  --output-file <path>          Output file path (default: auto-generated)
  --stack-deployment <name>     Name of the stack deployment (default: llama-stack-benchmark-server)
  --vllm-deployment <name>      Name of the vllm deployment (default: vllm-server)
  --stack-url <url>             URL of the stack service (default: http://llama-stack-benchmark-service:8323/v1/openai)
  -h, --help                    Show help message

Examples:
  ./run-benchmark.sh --target vllm                # Benchmark vLLM direct
  ./run-benchmark.sh --target stack               # Benchmark Llama Stack
  ./run-benchmark.sh -t vllm -d 120 -c 20         # vLLM with 120s, 20 users
  ./scripts/run-guidellm-benchmark.sh --target vllm                 # Benchmark vLLM direct
  ./scripts/run-guidellm-benchmark.sh --target stack                # Benchmark Llama Stack (default)
  ./scripts/run-guidellm-benchmark.sh -t vllm -s 60 -p 512 -o 256   # vLLM with custom parameters
  ./scripts/run-guidellm-benchmark.sh --output-file results/my-benchmark.txt  # Specify custom output file
  ./scripts/run-guidellm-benchmark.sh --stack-deployment my-stack-server      # Use custom stack deployment name
```

## Local Testing

@@ -100,55 +200,30 @@ Examples:

For local development without Kubernetes:

**1. Start OpenAI mock server:**
```bash
uv run python openai-mock-server.py --port 8080
```

**2. Run benchmark against mock server:**
```bash
uv run python benchmark.py \
  --base-url http://localhost:8080/v1 \
  --model mock-inference \
  --duration 30 \
  --concurrent 5
```

**3. Test against local vLLM server:**
```bash
# If you have vLLM running locally on port 8000
uv run python benchmark.py \
  --base-url http://localhost:8000/v1 \
  --model meta-llama/Llama-3.2-3B-Instruct \
  --duration 30 \
  --concurrent 5
```

**4. Profile the running server:**
```bash
./profile_running_server.sh
```

### OpenAI Mock Server
**1. (Optional) Start Mock OpenAI server:**

There is a simple mock OpenAI server if you don't have an inference provider available.
The `openai-mock-server.py` provides:
- **OpenAI-compatible API** for testing without real models
- **Configurable streaming delay** via `STREAM_DELAY_SECONDS` env var
- **Consistent responses** for reproducible benchmarks
- **Lightweight testing** without GPU requirements

**Mock server usage:**
```bash
uv run python openai-mock-server.py --port 8080
```

The mock server is also deployed in k8s as `openai-mock-service:8080` and can be used by changing the Llama Stack configuration to use the `mock-vllm-inference` provider.
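For a quick smoke test of the mock server, something like the following should work. This is a sketch that assumes the server exposes the usual OpenAI-style `/v1/chat/completions` route; adjust the path if your local copy differs.

```bash
# Send one non-streaming chat completion to the locally running mock server.
curl -s http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "mock-inference",
        "messages": [{"role": "user", "content": "Hello from the benchmark smoke test"}],
        "stream": false
      }'
```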
**2. Start Stack server:**
```bash
LLAMA_STACK_CONFIG=benchmarking/k8s-benchmark/stack_run_config.yaml uv run uvicorn llama_stack.core.server.server:create_app --port 8321 --workers 4 --factory
```

## Files in this Directory

- `benchmark.py` - Core benchmark script with async streaming support
- `run-benchmark.sh` - Main script with target selection and configuration
- `openai-mock-server.py` - Mock OpenAI API server for local testing
- `README.md` - This documentation file

**3. Run GuideLLM benchmark:**
```bash
GUIDELLM__PREFERRED_ROUTE="chat_completions" uv run guidellm benchmark run \
  --target "http://localhost:8321/v1/openai/v1" \
  --model "meta-llama/Llama-3.2-3B-Instruct" \
  --rate-type sweep \
  --max-seconds 60 \
  --data "prompt_tokens=256,output_tokens=128" --output-path='output.html'
```
@@ -17,11 +17,8 @@ export POSTGRES_PASSWORD=llamastack
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B

export MOCK_INFERENCE_MODEL=mock-inference

export MOCK_INFERENCE_URL=openai-mock-service:8080

export BENCHMARK_INFERENCE_MODEL=$INFERENCE_MODEL
export LLAMA_STACK_WORKERS=4

set -euo pipefail
set -x
benchmark.py (file deleted)

@@ -1,265 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

"""
Simple benchmark script for Llama Stack with OpenAI API compatibility.
"""

import argparse
import asyncio
import os
import random
import statistics
import time

import aiohttp


class BenchmarkStats:
    def __init__(self):
        self.response_times = []
        self.ttft_times = []
        self.chunks_received = []
        self.errors = []
        self.success_count = 0
        self.total_requests = 0
        self.concurrent_users = 0
        self.start_time = None
        self.end_time = None
        self._lock = asyncio.Lock()

    async def add_result(self, response_time: float, chunks: int, ttft: float = None, error: str = None):
        async with self._lock:
            self.total_requests += 1
            if error:
                self.errors.append(error)
            else:
                self.success_count += 1
                self.response_times.append(response_time)
                self.chunks_received.append(chunks)
                if ttft is not None:
                    self.ttft_times.append(ttft)

    def print_summary(self):
        if not self.response_times:
            print("No successful requests to report")
            if self.errors:
                print(f"Total errors: {len(self.errors)}")
                print("First 5 errors:")
                for error in self.errors[:5]:
                    print(f"  {error}")
            return

        total_time = self.end_time - self.start_time
        success_rate = (self.success_count / self.total_requests) * 100

        print(f"\n{'=' * 60}")
        print("BENCHMARK RESULTS")

        print("\nResponse Time Statistics:")
        print(f"  Mean: {statistics.mean(self.response_times):.3f}s")
        print(f"  Median: {statistics.median(self.response_times):.3f}s")
        print(f"  Min: {min(self.response_times):.3f}s")
        print(f"  Max: {max(self.response_times):.3f}s")

        if len(self.response_times) > 1:
            print(f"  Std Dev: {statistics.stdev(self.response_times):.3f}s")

        percentiles = [50, 90, 95, 99]
        sorted_times = sorted(self.response_times)
        print("\nPercentiles:")
        for p in percentiles:
            idx = int(len(sorted_times) * p / 100) - 1
            idx = max(0, min(idx, len(sorted_times) - 1))
            print(f"  P{p}: {sorted_times[idx]:.3f}s")

        if self.ttft_times:
            print("\nTime to First Token (TTFT) Statistics:")
            print(f"  Mean: {statistics.mean(self.ttft_times):.3f}s")
            print(f"  Median: {statistics.median(self.ttft_times):.3f}s")
            print(f"  Min: {min(self.ttft_times):.3f}s")
            print(f"  Max: {max(self.ttft_times):.3f}s")

            if len(self.ttft_times) > 1:
                print(f"  Std Dev: {statistics.stdev(self.ttft_times):.3f}s")

            sorted_ttft = sorted(self.ttft_times)
            print("\nTTFT Percentiles:")
            for p in percentiles:
                idx = int(len(sorted_ttft) * p / 100) - 1
                idx = max(0, min(idx, len(sorted_ttft) - 1))
                print(f"  P{p}: {sorted_ttft[idx]:.3f}s")

        if self.chunks_received:
            print("\nStreaming Statistics:")
            print(f"  Mean chunks per response: {statistics.mean(self.chunks_received):.1f}")
            print(f"  Total chunks received: {sum(self.chunks_received)}")

        print(f"{'=' * 60}")
        print(f"Total time: {total_time:.2f}s")
        print(f"Concurrent users: {self.concurrent_users}")
        print(f"Total requests: {self.total_requests}")
        print(f"Successful requests: {self.success_count}")
        print(f"Failed requests: {len(self.errors)}")
        print(f"Success rate: {success_rate:.1f}%")
        print(f"Requests per second: {self.success_count / total_time:.2f}")

        if self.errors:
            print("\nErrors (showing first 5):")
            for error in self.errors[:5]:
                print(f"  {error}")


class LlamaStackBenchmark:
    def __init__(self, base_url: str, model_id: str):
        self.base_url = base_url.rstrip("/")
        self.model_id = model_id
        self.headers = {"Content-Type": "application/json"}
        self.test_messages = [
            [{"role": "user", "content": "Hi"}],
            [{"role": "user", "content": "What is the capital of France?"}],
            [{"role": "user", "content": "Explain quantum physics in simple terms."}],
            [{"role": "user", "content": "Write a short story about a robot learning to paint."}],
            [
                {"role": "user", "content": "What is machine learning?"},
                {"role": "assistant", "content": "Machine learning is a subset of AI..."},
                {"role": "user", "content": "Can you give me a practical example?"},
            ],
        ]

    async def make_async_streaming_request(self) -> tuple[float, int, float | None, str | None]:
        """Make a single async streaming chat completion request."""
        messages = random.choice(self.test_messages)
        payload = {"model": self.model_id, "messages": messages, "stream": True, "max_tokens": 100}

        start_time = time.time()
        chunks_received = 0
        ttft = None
        error = None

        session = aiohttp.ClientSession()

        try:
            async with session.post(
                f"{self.base_url}/chat/completions",
                headers=self.headers,
                json=payload,
                timeout=aiohttp.ClientTimeout(total=30),
            ) as response:
                if response.status == 200:
                    async for line in response.content:
                        if line:
                            line_str = line.decode("utf-8").strip()
                            if line_str.startswith("data: "):
                                chunks_received += 1
                                if ttft is None:
                                    ttft = time.time() - start_time
                                if line_str == "data: [DONE]":
                                    break

                    if chunks_received == 0:
                        error = "No streaming chunks received"
                else:
                    text = await response.text()
                    error = f"HTTP {response.status}: {text[:100]}"

        except Exception as e:
            error = f"Request error: {str(e)}"
        finally:
            await session.close()

        response_time = time.time() - start_time
        return response_time, chunks_received, ttft, error

    async def run_benchmark(self, duration: int, concurrent_users: int) -> BenchmarkStats:
        """Run benchmark using async requests for specified duration."""
        stats = BenchmarkStats()
        stats.concurrent_users = concurrent_users
        stats.start_time = time.time()

        print(f"Starting benchmark: {duration}s duration, {concurrent_users} concurrent users")
        print(f"Target URL: {self.base_url}/chat/completions")
        print(f"Model: {self.model_id}")

        connector = aiohttp.TCPConnector(limit=concurrent_users)
        async with aiohttp.ClientSession(connector=connector):

            async def worker(worker_id: int):
                """Worker that sends requests sequentially until canceled."""
                request_count = 0
                while True:
                    try:
                        response_time, chunks, ttft, error = await self.make_async_streaming_request()
                        await stats.add_result(response_time, chunks, ttft, error)
                        request_count += 1

                    except asyncio.CancelledError:
                        break
                    except Exception as e:
                        await stats.add_result(0, 0, None, f"Worker {worker_id} error: {str(e)}")

            # Progress reporting task
            async def progress_reporter():
                last_report_time = time.time()
                while True:
                    try:
                        await asyncio.sleep(1)  # Report every second
                        if time.time() >= last_report_time + 10:  # Report every 10 seconds
                            elapsed = time.time() - stats.start_time
                            print(
                                f"Completed: {stats.total_requests} requests in {elapsed:.1f}s, RPS: {stats.total_requests / elapsed:.1f}"
                            )
                            last_report_time = time.time()
                    except asyncio.CancelledError:
                        break

            # Spawn concurrent workers
            tasks = [asyncio.create_task(worker(i)) for i in range(concurrent_users)]
            progress_task = asyncio.create_task(progress_reporter())
            tasks.append(progress_task)

            # Wait for duration then cancel all tasks
            await asyncio.sleep(duration)

            for task in tasks:
                task.cancel()

            # Wait for all tasks to complete
            await asyncio.gather(*tasks, return_exceptions=True)

        stats.end_time = time.time()
        return stats


def main():
    parser = argparse.ArgumentParser(description="Llama Stack Benchmark Tool")
    parser.add_argument(
        "--base-url",
        default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
        help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)",
    )
    parser.add_argument(
        "--model", default=os.getenv("INFERENCE_MODEL", "test-model"), help="Model ID to use for requests"
    )
    parser.add_argument("--duration", type=int, default=60, help="Duration in seconds to run benchmark (default: 60)")
    parser.add_argument("--concurrent", type=int, default=10, help="Number of concurrent users (default: 10)")

    args = parser.parse_args()

    benchmark = LlamaStackBenchmark(args.base_url, args.model)

    try:
        stats = asyncio.run(benchmark.run_benchmark(args.duration, args.concurrent))
        stats.print_summary()

    except KeyboardInterrupt:
        print("\nBenchmark interrupted by user")
    except Exception as e:
        print(f"Benchmark failed: {e}")


if __name__ == "__main__":
    main()
profile_running_server.sh (file deleted)

@@ -1,52 +0,0 @@
#!/bin/bash

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

# Script to profile an already running Llama Stack server
# Usage: ./profile_running_server.sh [duration_seconds] [output_file]

DURATION=${1:-60}  # Default 60 seconds
OUTPUT_FILE=${2:-"llama_stack_profile"}  # Default output file

echo "Looking for running Llama Stack server..."

# Find the server PID
SERVER_PID=$(ps aux | grep "llama_stack.core.server.server" | grep -v grep | awk '{print $2}' | head -1)

if [ -z "$SERVER_PID" ]; then
    echo "Error: No running Llama Stack server found"
    echo "Please start your server first with:"
    echo "LLAMA_STACK_LOGGING=\"all=ERROR\" MOCK_INFERENCE_URL=http://localhost:8080 SAFETY_MODEL=llama-guard3:1b uv run --with llama-stack python -m llama_stack.core.server.server docs/source/distributions/k8s-benchmark/stack_run_config.yaml"
    exit 1
fi

echo "Found Llama Stack server with PID: $SERVER_PID"

# Start py-spy profiling
echo "Starting py-spy profiling for ${DURATION} seconds..."
echo "Output will be saved to: ${OUTPUT_FILE}.svg"
echo ""
echo "You can now run your load test..."
echo ""

# Get the full path to py-spy
PYSPY_PATH=$(which py-spy)

# Check if running as root, if not, use sudo
if [ "$EUID" -ne 0 ]; then
    echo "py-spy requires root permissions on macOS. Running with sudo..."
    sudo "$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d ${DURATION} -p $SERVER_PID
else
    "$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d ${DURATION} -p $SERVER_PID
fi

echo ""
echo "Profiling completed! Results saved to: ${OUTPUT_FILE}.svg"
echo ""
echo "To view the flame graph:"
echo "open ${OUTPUT_FILE}.svg"
@ -0,0 +1,171 @@
|
|||
Collecting uv
|
||||
Downloading uv-0.8.19-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
|
||||
Downloading uv-0.8.19-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (20.9 MB)
|
||||
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 20.9/20.9 MB 144.3 MB/s eta 0:00:00
|
||||
Installing collected packages: uv
|
||||
Successfully installed uv-0.8.19
|
||||
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
|
||||
|
||||
[notice] A new release of pip is available: 24.0 -> 25.2
|
||||
[notice] To update, run: pip install --upgrade pip
|
||||
Using Python 3.11.13 environment at: /usr/local
|
||||
Resolved 61 packages in 551ms
|
||||
Downloading pillow (6.3MiB)
|
||||
Downloading hf-xet (3.0MiB)
|
||||
Downloading tokenizers (3.1MiB)
|
||||
Downloading pygments (1.2MiB)
|
||||
Downloading pandas (11.8MiB)
|
||||
Downloading aiohttp (1.7MiB)
|
||||
Downloading pydantic-core (1.9MiB)
|
||||
Downloading numpy (16.2MiB)
|
||||
Downloading transformers (11.1MiB)
|
||||
Downloading pyarrow (40.8MiB)
|
||||
Downloading pydantic-core
|
||||
Downloading aiohttp
|
||||
Downloading tokenizers
|
||||
Downloading hf-xet
|
||||
Downloading pygments
|
||||
Downloading pillow
|
||||
Downloading numpy
|
||||
Downloading pandas
|
||||
Downloading transformers
|
||||
Downloading pyarrow
|
||||
Prepared 61 packages in 1.23s
|
||||
Installed 61 packages in 114ms
|
||||
+ aiohappyeyeballs==2.6.1
|
||||
+ aiohttp==3.12.15
|
||||
+ aiosignal==1.4.0
|
||||
+ annotated-types==0.7.0
|
||||
+ anyio==4.10.0
|
||||
+ attrs==25.3.0
|
||||
+ certifi==2025.8.3
|
||||
+ charset-normalizer==3.4.3
|
||||
+ click==8.1.8
|
||||
+ datasets==4.1.1
|
||||
+ dill==0.4.0
|
||||
+ filelock==3.19.1
|
||||
+ frozenlist==1.7.0
|
||||
+ fsspec==2025.9.0
|
||||
+ ftfy==6.3.1
|
||||
+ guidellm==0.3.0
|
||||
+ h11==0.16.0
|
||||
+ h2==4.3.0
|
||||
+ hf-xet==1.1.10
|
||||
+ hpack==4.1.0
|
||||
+ httpcore==1.0.9
|
||||
+ httpx==0.28.1
|
||||
+ huggingface-hub==0.35.0
|
||||
+ hyperframe==6.1.0
|
||||
+ idna==3.10
|
||||
+ loguru==0.7.3
|
||||
+ markdown-it-py==4.0.0
|
||||
+ mdurl==0.1.2
|
||||
+ multidict==6.6.4
|
||||
+ multiprocess==0.70.16
|
||||
+ numpy==2.3.3
|
||||
+ packaging==25.0
|
||||
+ pandas==2.3.2
|
||||
+ pillow==11.3.0
|
||||
+ propcache==0.3.2
|
||||
+ protobuf==6.32.1
|
||||
+ pyarrow==21.0.0
|
||||
+ pydantic==2.11.9
|
||||
+ pydantic-core==2.33.2
|
||||
+ pydantic-settings==2.10.1
|
||||
+ pygments==2.19.2
|
||||
+ python-dateutil==2.9.0.post0
|
||||
+ python-dotenv==1.1.1
|
||||
+ pytz==2025.2
|
||||
+ pyyaml==6.0.2
|
||||
+ regex==2025.9.18
|
||||
+ requests==2.32.5
|
||||
+ rich==14.1.0
|
||||
+ safetensors==0.6.2
|
||||
+ six==1.17.0
|
||||
+ sniffio==1.3.1
|
||||
+ tokenizers==0.22.1
|
||||
+ tqdm==4.67.1
|
||||
+ transformers==4.56.2
|
||||
+ typing-extensions==4.15.0
|
||||
+ typing-inspection==0.4.1
|
||||
+ tzdata==2025.2
|
||||
+ urllib3==2.5.0
|
||||
+ wcwidth==0.2.14
|
||||
+ xxhash==3.5.0
|
||||
+ yarl==1.20.1
|
||||
Using Python 3.11.13 environment at: /usr/local
|
||||
Audited 1 package in 3ms
|
||||
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
|
||||
Creating backend...
|
||||
Backend openai_http connected to http://llama-stack-benchmark-service:8323/v1/openai for model meta-llama/Llama-3.2-3B-Instruct.
|
||||
Creating request loader...
|
||||
Created loader with 1000 unique requests from prompt_tokens=512,output_tokens=256.
|
||||
|
||||
|
||||
╭─ Benchmarks ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
|
||||
│ [17:34:30] ⠋ 100% concurrent@1 (complete) Req: 0.3 req/s, 3.32s Lat, 1.0 Conc, 18 Comp, 1 Inc, 0 Err │
|
||||
│ Tok: 74.0 gen/s, 238.6 tot/s, 40.2ms TTFT, 13.4ms ITL, 546 Prompt, 246 Gen │
|
||||
│ [17:35:35] ⠋ 100% concurrent@2 (complete) Req: 0.6 req/s, 3.46s Lat, 2.0 Conc, 34 Comp, 2 Inc, 0 Err │
|
||||
│ Tok: 139.6 gen/s, 454.0 tot/s, 48.0ms TTFT, 14.1ms ITL, 546 Prompt, 243 Gen │
|
||||
│ [17:36:40] ⠋ 100% concurrent@4 (complete) Req: 1.1 req/s, 3.44s Lat, 3.9 Conc, 68 Comp, 4 Inc, 0 Err │
|
||||
│ Tok: 273.2 gen/s, 900.4 tot/s, 50.7ms TTFT, 14.3ms ITL, 546 Prompt, 238 Gen │
|
||||
│ [17:37:45] ⠋ 100% concurrent@8 (complete) Req: 2.2 req/s, 3.55s Lat, 7.7 Conc, 129 Comp, 8 Inc, 0 Err │
|
||||
│ Tok: 519.1 gen/s, 1699.8 tot/s, 66.0ms TTFT, 14.6ms ITL, 547 Prompt, 240 Gen │
|
||||
│ [17:38:50] ⠋ 100% concurrent@16 (complete) Req: 4.1 req/s, 3.76s Lat, 15.5 Conc, 247 Comp, 16 Inc, 0 Err │
|
||||
│ Tok: 1005.5 gen/s, 3256.7 tot/s, 101.0ms TTFT, 15.0ms ITL, 547 Prompt, 244 Gen │
|
||||
│ [17:39:56] ⠋ 100% concurrent@32 (complete) Req: 8.1 req/s, 3.84s Lat, 30.9 Conc, 483 Comp, 32 Inc, 0 Err │
|
||||
│ Tok: 1926.3 gen/s, 6327.2 tot/s, 295.7ms TTFT, 14.8ms ITL, 547 Prompt, 239 Gen │
|
||||
│ [17:41:03] ⠋ 100% concurrent@64 (complete) Req: 9.9 req/s, 6.05s Lat, 59.7 Conc, 576 Comp, 58 Inc, 0 Err │
|
||||
│ Tok: 2381.0 gen/s, 7774.5 tot/s, 1196.2ms TTFT, 20.2ms ITL, 547 Prompt, 241 Gen │
|
||||
│ [17:42:10] ⠋ 100% concurrent@128 (complete) Req: 9.2 req/s, 11.59s Lat, 107.2 Conc, 514 Comp, 117 Inc, 0 Err │
|
||||
│ Tok: 2233.4 gen/s, 7286.3 tot/s, 2403.9ms TTFT, 38.2ms ITL, 547 Prompt, 242 Gen │
|
||||
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
|
||||
Generating... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ (8/8) [ 0:08:41 < 0:00:00 ]
|
||||
|
||||
Benchmarks Metadata:
|
||||
Run id:511a14fd-ba11-4ffa-92ef-7cc23db4dd38
|
||||
Duration:528.5 seconds
|
||||
Profile:type=concurrent, strategies=['concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent'], streams=[1, 2, 4, 8, 16, 32, 64, 128]
|
||||
Args:max_number=None, max_duration=60.0, warmup_number=None, warmup_duration=3.0, cooldown_number=None, cooldown_duration=None
|
||||
Worker:type_='generative_requests_worker' backend_type='openai_http' backend_target='http://llama-stack-benchmark-service:8323/v1/openai' backend_model='meta-llama/Llama-3.2-3B-Instruct'
|
||||
backend_info={'max_output_tokens': 16384, 'timeout': 300, 'http2': True, 'follow_redirects': True, 'headers': {}, 'text_completions_path': '/v1/completions', 'chat_completions_path':
|
||||
'/v1/chat/completions'}
|
||||
Request Loader:type_='generative_request_loader' data='prompt_tokens=512,output_tokens=256' data_args=None processor='meta-llama/Llama-3.2-3B-Instruct' processor_args=None
|
||||
Extras:None
|
||||
|
||||
|
||||
Benchmarks Info:
|
||||
===================================================================================================================================================
|
||||
Metadata |||| Requests Made ||| Prompt Tok/Req ||| Output Tok/Req ||| Prompt Tok Total||| Output Tok Total||
|
||||
Benchmark| Start Time| End Time| Duration (s)| Comp| Inc| Err| Comp| Inc| Err| Comp| Inc| Err| Comp| Inc| Err| Comp| Inc| Err
|
||||
--------------|-----------|---------|-------------|------|-----|-----|------|------|----|------|------|----|-------|------|----|-------|------|----
|
||||
concurrent@1| 17:34:35| 17:35:35| 60.0| 18| 1| 0| 546.4| 512.0| 0.0| 246.0| 14.0| 0.0| 9835| 512| 0| 4428| 14| 0
|
||||
concurrent@2| 17:35:40| 17:36:40| 60.0| 34| 2| 0| 546.4| 512.0| 0.0| 242.7| 80.0| 0.0| 18577| 1024| 0| 8253| 160| 0
|
||||
concurrent@4| 17:36:45| 17:37:45| 60.0| 68| 4| 0| 546.4| 512.0| 0.0| 238.1| 103.2| 0.0| 37156| 2048| 0| 16188| 413| 0
|
||||
concurrent@8| 17:37:50| 17:38:50| 60.0| 129| 8| 0| 546.7| 512.0| 0.0| 240.3| 180.0| 0.0| 70518| 4096| 0| 31001| 1440| 0
|
||||
concurrent@16| 17:38:55| 17:39:55| 60.0| 247| 16| 0| 546.6| 512.0| 0.0| 244.1| 142.6| 0.0| 135002| 8192| 0| 60300| 2281| 0
|
||||
concurrent@32| 17:40:01| 17:41:01| 60.0| 483| 32| 0| 546.5| 512.0| 0.0| 239.2| 123.2| 0.0| 263972| 16384| 0| 115540| 3944| 0
|
||||
concurrent@64| 17:41:08| 17:42:08| 60.0| 576| 58| 0| 546.6| 512.0| 0.0| 241.3| 13.9| 0.0| 314817| 29696| 0| 138976| 807| 0
|
||||
concurrent@128| 17:42:15| 17:43:15| 60.0| 514| 117| 0| 546.5| 512.0| 0.0| 241.6| 143.9| 0.0| 280911| 59904| 0| 124160| 16832| 0
|
||||
===================================================================================================================================================
|
||||
|
||||
|
||||
Benchmarks Stats:
|
||||
=======================================================================================================================================================
|
||||
Metadata | Request Stats || Out Tok/sec| Tot Tok/sec| Req Latency (sec) ||| TTFT (ms) ||| ITL (ms) ||| TPOT (ms) ||
|
||||
Benchmark| Per Second| Concurrency| mean| mean| mean| median| p99| mean| median| p99| mean| median| p99| mean| median| p99
|
||||
--------------|-----------|------------|------------|------------|------|-------|------|-------|-------|-------|-----|-------|-----|-----|-------|-----
|
||||
concurrent@1| 0.30| 1.00| 74.0| 238.6| 3.32| 3.43| 3.61| 40.2| 39.3| 51.2| 13.4| 13.3| 14.0| 13.3| 13.2| 13.9
|
||||
concurrent@2| 0.58| 1.99| 139.6| 454.0| 3.46| 3.64| 3.74| 48.0| 45.8| 72.0| 14.1| 14.1| 14.5| 14.0| 14.0| 14.4
|
||||
concurrent@4| 1.15| 3.95| 273.2| 900.4| 3.44| 3.69| 3.74| 50.7| 47.2| 118.6| 14.3| 14.3| 14.4| 14.2| 14.2| 14.4
|
||||
concurrent@8| 2.16| 7.67| 519.1| 1699.8| 3.55| 3.76| 3.87| 66.0| 48.8| 208.2| 14.6| 14.5| 14.8| 14.5| 14.5| 14.8
|
||||
concurrent@16| 4.12| 15.48| 1005.5| 3256.7| 3.76| 3.90| 4.18| 101.0| 65.6| 396.7| 15.0| 15.0| 15.9| 15.0| 15.0| 15.9
|
||||
concurrent@32| 8.05| 30.89| 1926.3| 6327.2| 3.84| 4.04| 4.39| 295.7| 265.6| 720.4| 14.8| 14.9| 15.5| 14.8| 14.8| 15.3
|
||||
concurrent@64| 9.87| 59.74| 2381.0| 7774.5| 6.05| 6.18| 9.94| 1196.2| 1122.5| 4295.3| 20.2| 20.0| 25.8| 20.1| 19.9| 25.8
|
||||
concurrent@128| 9.25| 107.16| 2233.4| 7286.3| 11.59| 12.04| 14.46| 2403.9| 2322.3| 4001.5| 38.2| 38.5| 53.0| 38.0| 38.3| 52.7
|
||||
=======================================================================================================================================================
|
||||
|
||||
Saving benchmarks report...
|
||||
Benchmarks report saved to /benchmarks.json
|
||||
|
||||
Benchmarking complete.
|
|
@ -0,0 +1,171 @@
|
|||
Collecting uv
|
||||
Downloading uv-0.8.19-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
|
||||
Downloading uv-0.8.19-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (20.9 MB)
|
||||
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 20.9/20.9 MB 149.3 MB/s eta 0:00:00
|
||||
Installing collected packages: uv
|
||||
Successfully installed uv-0.8.19
|
||||
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
|
||||
|
||||
[notice] A new release of pip is available: 24.0 -> 25.2
|
||||
[notice] To update, run: pip install --upgrade pip
|
||||
Using Python 3.11.13 environment at: /usr/local
|
||||
Resolved 61 packages in 494ms
|
||||
Downloading pandas (11.8MiB)
|
||||
Downloading tokenizers (3.1MiB)
|
||||
Downloading pygments (1.2MiB)
|
||||
Downloading aiohttp (1.7MiB)
|
||||
Downloading transformers (11.1MiB)
|
||||
Downloading numpy (16.2MiB)
|
||||
Downloading pillow (6.3MiB)
|
||||
Downloading pydantic-core (1.9MiB)
|
||||
Downloading hf-xet (3.0MiB)
|
||||
Downloading pyarrow (40.8MiB)
|
||||
Downloading pydantic-core
|
||||
Downloading aiohttp
|
||||
Downloading tokenizers
|
||||
Downloading hf-xet
|
||||
Downloading pillow
|
||||
Downloading pygments
|
||||
Downloading numpy
|
||||
Downloading pandas
|
||||
Downloading pyarrow
|
||||
Downloading transformers
|
||||
Prepared 61 packages in 1.24s
|
||||
Installed 61 packages in 126ms
|
||||
+ aiohappyeyeballs==2.6.1
|
||||
+ aiohttp==3.12.15
|
||||
+ aiosignal==1.4.0
|
||||
+ annotated-types==0.7.0
|
||||
+ anyio==4.10.0
|
||||
+ attrs==25.3.0
|
||||
+ certifi==2025.8.3
|
||||
+ charset-normalizer==3.4.3
|
||||
+ click==8.1.8
|
||||
+ datasets==4.1.1
|
||||
+ dill==0.4.0
|
||||
+ filelock==3.19.1
|
||||
+ frozenlist==1.7.0
|
||||
+ fsspec==2025.9.0
|
||||
+ ftfy==6.3.1
|
||||
+ guidellm==0.3.0
|
||||
+ h11==0.16.0
|
||||
+ h2==4.3.0
|
||||
+ hf-xet==1.1.10
|
||||
+ hpack==4.1.0
|
||||
+ httpcore==1.0.9
|
||||
+ httpx==0.28.1
|
||||
+ huggingface-hub==0.35.0
|
||||
+ hyperframe==6.1.0
|
||||
+ idna==3.10
|
||||
+ loguru==0.7.3
|
||||
+ markdown-it-py==4.0.0
|
||||
+ mdurl==0.1.2
|
||||
+ multidict==6.6.4
|
||||
+ multiprocess==0.70.16
|
||||
+ numpy==2.3.3
|
||||
+ packaging==25.0
|
||||
+ pandas==2.3.2
|
||||
+ pillow==11.3.0
|
||||
+ propcache==0.3.2
|
||||
+ protobuf==6.32.1
|
||||
+ pyarrow==21.0.0
|
||||
+ pydantic==2.11.9
|
||||
+ pydantic-core==2.33.2
|
||||
+ pydantic-settings==2.10.1
|
||||
+ pygments==2.19.2
|
||||
+ python-dateutil==2.9.0.post0
|
||||
+ python-dotenv==1.1.1
|
||||
+ pytz==2025.2
|
||||
+ pyyaml==6.0.2
|
||||
+ regex==2025.9.18
|
||||
+ requests==2.32.5
|
||||
+ rich==14.1.0
|
||||
+ safetensors==0.6.2
|
||||
+ six==1.17.0
|
||||
+ sniffio==1.3.1
|
||||
+ tokenizers==0.22.1
|
||||
+ tqdm==4.67.1
|
||||
+ transformers==4.56.2
|
||||
+ typing-extensions==4.15.0
|
||||
+ typing-inspection==0.4.1
|
||||
+ tzdata==2025.2
|
||||
+ urllib3==2.5.0
|
||||
+ wcwidth==0.2.14
|
||||
+ xxhash==3.5.0
|
||||
+ yarl==1.20.1
|
||||
Using Python 3.11.13 environment at: /usr/local
|
||||
Audited 1 package in 3ms
|
||||
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
|
||||
Creating backend...
|
||||
Backend openai_http connected to http://llama-stack-benchmark-service:8323/v1/openai for model meta-llama/Llama-3.2-3B-Instruct.
|
||||
Creating request loader...
|
||||
Created loader with 1000 unique requests from prompt_tokens=512,output_tokens=256.
|
||||
|
||||
|
||||
╭─ Benchmarks ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
|
||||
│ [17:45:18] ⠋ 100% concurrent@1 (complete) Req: 0.3 req/s, 3.42s Lat, 1.0 Conc, 17 Comp, 1 Inc, 0 Err │
|
||||
│ Tok: 73.9 gen/s, 233.7 tot/s, 50.2ms TTFT, 13.4ms ITL, 547 Prompt, 253 Gen │
|
||||
│ [17:46:23] ⠋ 100% concurrent@2 (complete) Req: 0.6 req/s, 3.42s Lat, 2.0 Conc, 34 Comp, 2 Inc, 0 Err │
|
||||
│ Tok: 134.7 gen/s, 447.4 tot/s, 50.8ms TTFT, 14.3ms ITL, 546 Prompt, 235 Gen │
|
||||
│ [17:47:28] ⠋ 100% concurrent@4 (complete) Req: 1.1 req/s, 3.55s Lat, 3.9 Conc, 66 Comp, 4 Inc, 0 Err │
|
||||
│ Tok: 268.7 gen/s, 873.1 tot/s, 54.9ms TTFT, 14.4ms ITL, 547 Prompt, 243 Gen │
|
||||
│ [17:48:33] ⠋ 100% concurrent@8 (complete) Req: 2.2 req/s, 3.56s Lat, 7.8 Conc, 130 Comp, 8 Inc, 0 Err │
|
||||
│ Tok: 526.1 gen/s, 1728.4 tot/s, 60.6ms TTFT, 14.7ms ITL, 547 Prompt, 239 Gen │
|
||||
│ [17:49:38] ⠋ 100% concurrent@16 (complete) Req: 4.1 req/s, 3.79s Lat, 15.7 Conc, 246 Comp, 16 Inc, 0 Err │
|
||||
│ Tok: 1006.9 gen/s, 3268.6 tot/s, 74.8ms TTFT, 15.3ms ITL, 547 Prompt, 243 Gen │
|
||||
│ [17:50:44] ⠋ 100% concurrent@32 (complete) Req: 7.8 req/s, 3.95s Lat, 30.9 Conc, 467 Comp, 32 Inc, 0 Err │
|
||||
│ Tok: 1912.0 gen/s, 6191.6 tot/s, 119.1ms TTFT, 15.7ms ITL, 547 Prompt, 244 Gen │
|
||||
│ [17:51:50] ⠋ 100% concurrent@64 (complete) Req: 13.0 req/s, 4.75s Lat, 61.8 Conc, 776 Comp, 64 Inc, 0 Err │
|
||||
│ Tok: 3154.3 gen/s, 10273.3 tot/s, 339.1ms TTFT, 18.3ms ITL, 547 Prompt, 242 Gen │
|
||||
│ [17:52:58] ⠋ 100% concurrent@128 (complete) Req: 15.1 req/s, 7.82s Lat, 117.7 Conc, 898 Comp, 127 Inc, 0 Err │
|
||||
│ Tok: 3617.4 gen/s, 11843.9 tot/s, 1393.8ms TTFT, 26.8ms ITL, 547 Prompt, 240 Gen │
|
||||
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
|
||||
Generating... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ (8/8) [ 0:08:41 < 0:00:00 ]
|
||||
|
||||
Benchmarks Metadata:
|
||||
Run id:f73d408e-256a-4c32-aa40-05e8d7098b66
|
||||
Duration:529.2 seconds
|
||||
Profile:type=concurrent, strategies=['concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent'], streams=[1, 2, 4, 8, 16, 32, 64, 128]
|
||||
Args:max_number=None, max_duration=60.0, warmup_number=None, warmup_duration=3.0, cooldown_number=None, cooldown_duration=None
|
||||
Worker:type_='generative_requests_worker' backend_type='openai_http' backend_target='http://llama-stack-benchmark-service:8323/v1/openai' backend_model='meta-llama/Llama-3.2-3B-Instruct'
|
||||
backend_info={'max_output_tokens': 16384, 'timeout': 300, 'http2': True, 'follow_redirects': True, 'headers': {}, 'text_completions_path': '/v1/completions', 'chat_completions_path':
|
||||
'/v1/chat/completions'}
|
||||
Request Loader:type_='generative_request_loader' data='prompt_tokens=512,output_tokens=256' data_args=None processor='meta-llama/Llama-3.2-3B-Instruct' processor_args=None
|
||||
Extras:None
|
||||
|
||||
|
||||
Benchmarks Info:
|
||||
=====================================================================================================================================================
|
||||
Metadata |||| Requests Made ||| Prompt Tok/Req ||| Output Tok/Req ||| Prompt Tok Total||| Output Tok Total ||
|
||||
Benchmark| Start Time| End Time| Duration (s)| Comp| Inc| Err| Comp| Inc| Err| Comp| Inc| Err| Comp| Inc| Err| Comp| Inc| Err
|
||||
--------------|-----------|---------|-------------|------|-----|-----|------|------|----|------|------|----|-------|------|----|--------|------|-----
|
||||
concurrent@1| 17:45:23| 17:46:23| 60.0| 17| 1| 0| 546.6| 512.0| 0.0| 252.8| 136.0| 0.0| 9292| 512| 0| 4298| 136| 0
|
||||
concurrent@2| 17:46:28| 17:47:28| 60.0| 34| 2| 0| 546.4| 512.0| 0.0| 235.4| 130.0| 0.0| 18577| 1024| 0| 8003| 260| 0
|
||||
concurrent@4| 17:47:33| 17:48:33| 60.0| 66| 4| 0| 546.5| 512.0| 0.0| 243.0| 97.5| 0.0| 36072| 2048| 0| 16035| 390| 0
|
||||
concurrent@8| 17:48:38| 17:49:38| 60.0| 130| 8| 0| 546.6| 512.0| 0.0| 239.2| 146.0| 0.0| 71052| 4096| 0| 31090| 1168| 0
|
||||
concurrent@16| 17:49:43| 17:50:43| 60.0| 246| 16| 0| 546.6| 512.0| 0.0| 243.3| 112.3| 0.0| 134456| 8192| 0| 59862| 1797| 0
|
||||
concurrent@32| 17:50:49| 17:51:49| 60.0| 467| 32| 0| 546.6| 512.0| 0.0| 244.2| 147.3| 0.0| 255242| 16384| 0| 114038| 4714| 0
|
||||
concurrent@64| 17:51:55| 17:52:55| 60.0| 776| 64| 0| 546.5| 512.0| 0.0| 242.2| 106.1| 0.0| 424115| 32768| 0| 187916| 6788| 0
|
||||
concurrent@128| 17:53:03| 17:54:03| 60.0| 898| 127| 0| 546.5| 512.0| 0.0| 240.3| 69.8| 0.0| 490789| 65024| 0| 215810| 8864| 0
|
||||
=====================================================================================================================================================
|
||||
|
||||
|
||||
Benchmarks Stats:
|
||||
======================================================================================================================================================
|
||||
Metadata | Request Stats || Out Tok/sec| Tot Tok/sec| Req Latency (sec)||| TTFT (ms) ||| ITL (ms) ||| TPOT (ms) ||
|
||||
Benchmark| Per Second| Concurrency| mean| mean| mean| median| p99| mean| median| p99| mean| median| p99| mean| median| p99
|
||||
--------------|-----------|------------|------------|------------|-----|-------|------|-------|-------|-------|-----|-------|-----|-----|-------|-----
|
||||
concurrent@1| 0.29| 1.00| 73.9| 233.7| 3.42| 3.45| 3.50| 50.2| 50.9| 62.5| 13.4| 13.4| 13.5| 13.3| 13.3| 13.5
|
||||
concurrent@2| 0.57| 1.96| 134.7| 447.4| 3.42| 3.67| 4.12| 50.8| 49.2| 79.8| 14.3| 14.2| 15.9| 14.3| 14.2| 15.9
|
||||
concurrent@4| 1.11| 3.92| 268.7| 873.1| 3.55| 3.72| 3.80| 54.9| 51.7| 101.3| 14.4| 14.4| 14.5| 14.4| 14.4| 14.5
|
||||
concurrent@8| 2.20| 7.82| 526.1| 1728.4| 3.56| 3.78| 3.93| 60.6| 49.8| 189.5| 14.7| 14.7| 14.8| 14.6| 14.6| 14.8
|
||||
concurrent@16| 4.14| 15.66| 1006.9| 3268.6| 3.79| 3.94| 4.25| 74.8| 54.3| 328.4| 15.3| 15.3| 16.1| 15.2| 15.2| 16.0
|
||||
concurrent@32| 7.83| 30.91| 1912.0| 6191.6| 3.95| 4.07| 4.53| 119.1| 80.5| 674.0| 15.7| 15.6| 17.4| 15.7| 15.6| 17.3
|
||||
concurrent@64| 13.03| 61.85| 3154.3| 10273.3| 4.75| 4.93| 5.43| 339.1| 321.1| 1146.6| 18.3| 18.4| 19.3| 18.2| 18.3| 19.2
|
||||
concurrent@128| 15.05| 117.71| 3617.4| 11843.9| 7.82| 8.58| 13.35| 1393.8| 1453.0| 5232.2| 26.8| 26.7| 36.0| 26.7| 26.6| 35.9
|
||||
======================================================================================================================================================
|
||||
|
||||
Saving benchmarks report...
|
||||
Benchmarks report saved to /benchmarks.json
|
||||
|
||||
Benchmarking complete.
|
|
@ -0,0 +1,171 @@
|
|||
Collecting uv
|
||||
Downloading uv-0.8.19-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
|
||||
Downloading uv-0.8.19-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (20.9 MB)
|
||||
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 20.9/20.9 MB 156.8 MB/s eta 0:00:00
|
||||
Installing collected packages: uv
|
||||
Successfully installed uv-0.8.19
|
||||
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
|
||||
|
||||
[notice] A new release of pip is available: 24.0 -> 25.2
|
||||
[notice] To update, run: pip install --upgrade pip
|
||||
Using Python 3.11.13 environment at: /usr/local
|
||||
Resolved 61 packages in 480ms
|
||||
Downloading pillow (6.3MiB)
|
||||
Downloading pydantic-core (1.9MiB)
|
||||
Downloading pyarrow (40.8MiB)
|
||||
Downloading aiohttp (1.7MiB)
|
||||
Downloading numpy (16.2MiB)
|
||||
Downloading pygments (1.2MiB)
|
||||
Downloading transformers (11.1MiB)
|
||||
Downloading pandas (11.8MiB)
|
||||
Downloading tokenizers (3.1MiB)
|
||||
Downloading hf-xet (3.0MiB)
|
||||
Downloading pydantic-core
|
||||
Downloading aiohttp
|
||||
Downloading tokenizers
|
||||
Downloading hf-xet
|
||||
Downloading pygments
|
||||
Downloading pillow
|
||||
Downloading numpy
|
||||
Downloading pandas
|
||||
Downloading pyarrow
|
||||
Downloading transformers
|
||||
Prepared 61 packages in 1.25s
|
||||
Installed 61 packages in 126ms
|
||||
+ aiohappyeyeballs==2.6.1
|
||||
+ aiohttp==3.12.15
|
||||
+ aiosignal==1.4.0
|
||||
+ annotated-types==0.7.0
|
||||
+ anyio==4.10.0
|
||||
+ attrs==25.3.0
|
||||
+ certifi==2025.8.3
|
||||
+ charset-normalizer==3.4.3
|
||||
+ click==8.1.8
|
||||
+ datasets==4.1.1
|
||||
+ dill==0.4.0
|
||||
+ filelock==3.19.1
|
||||
+ frozenlist==1.7.0
|
||||
+ fsspec==2025.9.0
|
||||
+ ftfy==6.3.1
|
||||
+ guidellm==0.3.0
|
||||
+ h11==0.16.0
|
||||
+ h2==4.3.0
|
||||
+ hf-xet==1.1.10
|
||||
+ hpack==4.1.0
|
||||
+ httpcore==1.0.9
|
||||
+ httpx==0.28.1
|
||||
+ huggingface-hub==0.35.0
|
||||
+ hyperframe==6.1.0
|
||||
+ idna==3.10
|
||||
+ loguru==0.7.3
|
||||
+ markdown-it-py==4.0.0
|
||||
+ mdurl==0.1.2
|
||||
+ multidict==6.6.4
|
||||
+ multiprocess==0.70.16
|
||||
+ numpy==2.3.3
|
||||
+ packaging==25.0
|
||||
+ pandas==2.3.2
|
||||
+ pillow==11.3.0
|
||||
+ propcache==0.3.2
|
||||
+ protobuf==6.32.1
|
||||
+ pyarrow==21.0.0
|
||||
+ pydantic==2.11.9
|
||||
+ pydantic-core==2.33.2
|
||||
+ pydantic-settings==2.10.1
|
||||
+ pygments==2.19.2
|
||||
+ python-dateutil==2.9.0.post0
|
||||
+ python-dotenv==1.1.1
|
||||
+ pytz==2025.2
|
||||
+ pyyaml==6.0.2
|
||||
+ regex==2025.9.18
|
||||
+ requests==2.32.5
|
||||
+ rich==14.1.0
|
||||
+ safetensors==0.6.2
|
||||
+ six==1.17.0
|
||||
+ sniffio==1.3.1
|
||||
+ tokenizers==0.22.1
|
||||
+ tqdm==4.67.1
|
||||
+ transformers==4.56.2
|
||||
+ typing-extensions==4.15.0
|
||||
+ typing-inspection==0.4.1
|
||||
+ tzdata==2025.2
|
||||
+ urllib3==2.5.0
|
||||
+ wcwidth==0.2.14
|
||||
+ xxhash==3.5.0
|
||||
+ yarl==1.20.1
|
||||
Using Python 3.11.13 environment at: /usr/local
|
||||
Audited 1 package in 4ms
|
||||
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
|
||||
Creating backend...
|
||||
Backend openai_http connected to http://llama-stack-benchmark-service:8323/v1/openai for model meta-llama/Llama-3.2-3B-Instruct.
|
||||
Creating request loader...
|
||||
Created loader with 1000 unique requests from prompt_tokens=512,output_tokens=256.
|
||||
|
||||
|
||||
╭─ Benchmarks ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
|
||||
│ [17:55:59] ⠋ 100% concurrent@1 (complete) Req: 0.3 req/s, 3.33s Lat, 1.0 Conc, 18 Comp, 1 Inc, 0 Err │
|
||||
│ Tok: 74.0 gen/s, 238.0 tot/s, 49.6ms TTFT, 13.4ms ITL, 546 Prompt, 246 Gen │
|
||||
│ [17:57:04] ⠋ 100% concurrent@2 (complete) Req: 0.6 req/s, 3.32s Lat, 1.9 Conc, 35 Comp, 2 Inc, 0 Err │
|
||||
│ Tok: 137.1 gen/s, 457.5 tot/s, 50.6ms TTFT, 14.0ms ITL, 546 Prompt, 234 Gen │
|
||||
│ [17:58:09] ⠋ 100% concurrent@4 (complete) Req: 1.2 req/s, 3.42s Lat, 4.0 Conc, 69 Comp, 4 Inc, 0 Err │
|
||||
│ Tok: 276.7 gen/s, 907.2 tot/s, 52.7ms TTFT, 14.1ms ITL, 547 Prompt, 240 Gen │
|
||||
│ [17:59:14] ⠋ 100% concurrent@8 (complete) Req: 2.3 req/s, 3.47s Lat, 7.8 Conc, 134 Comp, 8 Inc, 0 Err │
|
||||
│ Tok: 541.4 gen/s, 1775.4 tot/s, 57.3ms TTFT, 14.3ms ITL, 547 Prompt, 240 Gen │
|
||||
│ [18:00:19] ⠋ 100% concurrent@16 (complete) Req: 4.3 req/s, 3.60s Lat, 15.6 Conc, 259 Comp, 16 Inc, 0 Err │
|
||||
│ Tok: 1034.8 gen/s, 3401.7 tot/s, 72.3ms TTFT, 14.8ms ITL, 547 Prompt, 239 Gen │
|
||||
│ [18:01:25] ⠋ 100% concurrent@32 (complete) Req: 8.4 req/s, 3.69s Lat, 31.1 Conc, 505 Comp, 32 Inc, 0 Err │
|
||||
│ Tok: 2029.7 gen/s, 6641.5 tot/s, 91.6ms TTFT, 15.0ms ITL, 547 Prompt, 241 Gen │
|
||||
│ [18:02:31] ⠋ 100% concurrent@64 (complete) Req: 13.6 req/s, 4.50s Lat, 61.4 Conc, 818 Comp, 64 Inc, 0 Err │
|
||||
│ Tok: 3333.9 gen/s, 10787.0 tot/s, 171.3ms TTFT, 17.8ms ITL, 547 Prompt, 244 Gen │
|
||||
│ [18:03:40] ⠋ 100% concurrent@128 (complete) Req: 16.1 req/s, 7.43s Lat, 119.5 Conc, 964 Comp, 122 Inc, 0 Err │
|
||||
│ Tok: 3897.0 gen/s, 12679.4 tot/s, 446.4ms TTFT, 28.9ms ITL, 547 Prompt, 243 Gen │
|
||||
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
|
||||
Generating... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ (8/8) [ 0:08:41 < 0:00:00 ]
|
||||
|
||||
Benchmarks Metadata:
|
||||
Run id:5393e64f-d9f8-4548-95d8-da320bba1c24
|
||||
Duration:530.1 seconds
|
||||
Profile:type=concurrent, strategies=['concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent'], streams=[1, 2, 4, 8, 16, 32, 64, 128]
|
||||
Args:max_number=None, max_duration=60.0, warmup_number=None, warmup_duration=3.0, cooldown_number=None, cooldown_duration=None
|
||||
Worker:type_='generative_requests_worker' backend_type='openai_http' backend_target='http://llama-stack-benchmark-service:8323/v1/openai' backend_model='meta-llama/Llama-3.2-3B-Instruct'
|
||||
backend_info={'max_output_tokens': 16384, 'timeout': 300, 'http2': True, 'follow_redirects': True, 'headers': {}, 'text_completions_path': '/v1/completions', 'chat_completions_path':
|
||||
'/v1/chat/completions'}
|
||||
Request Loader:type_='generative_request_loader' data='prompt_tokens=512,output_tokens=256' data_args=None processor='meta-llama/Llama-3.2-3B-Instruct' processor_args=None
|
||||
Extras:None
|
||||
|
||||
|
||||
Benchmarks Info:
|
||||
===================================================================================================================================================
|
||||
Metadata |||| Requests Made ||| Prompt Tok/Req ||| Output Tok/Req ||| Prompt Tok Total||| Output Tok Total||
|
||||
Benchmark| Start Time| End Time| Duration (s)| Comp| Inc| Err| Comp| Inc| Err| Comp| Inc| Err| Comp| Inc| Err| Comp| Inc| Err
|
||||
--------------|-----------|---------|-------------|------|-----|-----|------|------|----|------|------|----|-------|------|----|-------|------|----
|
||||
concurrent@1| 17:56:04| 17:57:04| 60.0| 18| 1| 0| 546.4| 512.0| 0.0| 246.4| 256.0| 0.0| 9836| 512| 0| 4436| 256| 0
|
||||
concurrent@2| 17:57:09| 17:58:09| 60.0| 35| 2| 0| 546.4| 512.0| 0.0| 233.9| 132.0| 0.0| 19124| 1024| 0| 8188| 264| 0
|
||||
concurrent@4| 17:58:14| 17:59:14| 60.0| 69| 4| 0| 546.6| 512.0| 0.0| 239.9| 60.5| 0.0| 37715| 2048| 0| 16553| 242| 0
|
||||
concurrent@8| 17:59:19| 18:00:19| 60.0| 134| 8| 0| 546.6| 512.0| 0.0| 239.8| 126.6| 0.0| 73243| 4096| 0| 32135| 1013| 0
|
||||
concurrent@16| 18:00:24| 18:01:24| 60.0| 259| 16| 0| 546.6| 512.0| 0.0| 239.0| 115.7| 0.0| 141561| 8192| 0| 61889| 1851| 0
|
||||
concurrent@32| 18:01:30| 18:02:30| 60.0| 505| 32| 0| 546.5| 512.0| 0.0| 240.5| 113.2| 0.0| 275988| 16384| 0| 121466| 3623| 0
|
||||
concurrent@64| 18:02:37| 18:03:37| 60.0| 818| 64| 0| 546.6| 512.0| 0.0| 244.5| 132.4| 0.0| 447087| 32768| 0| 199988| 8475| 0
|
||||
concurrent@128| 18:03:45| 18:04:45| 60.0| 964| 122| 0| 546.5| 512.0| 0.0| 242.5| 133.1| 0.0| 526866| 62464| 0| 233789| 16241| 0
|
||||
===================================================================================================================================================
|
||||
|
||||
|
||||
Benchmarks Stats:
|
||||
=======================================================================================================================================================
|
||||
Metadata | Request Stats || Out Tok/sec| Tot Tok/sec| Req Latency (sec) ||| TTFT (ms) ||| ITL (ms) ||| TPOT (ms) ||
|
||||
Benchmark| Per Second| Concurrency| mean| mean| mean| median| p99| mean| median| p99| mean| median| p99| mean| median| p99
|
||||
--------------|-----------|------------|------------|------------|------|--------|------|------|-------|-------|-----|-------|-----|-----|-------|-----
|
||||
concurrent@1| 0.30| 1.00| 74.0| 238.0| 3.33| 3.44| 3.63| 49.6| 47.2| 66.1| 13.4| 13.3| 14.0| 13.3| 13.3| 14.0
|
||||
concurrent@2| 0.59| 1.95| 137.1| 457.5| 3.32| 3.61| 3.67| 50.6| 48.6| 80.4| 14.0| 14.0| 14.2| 13.9| 13.9| 14.1
|
||||
concurrent@4| 1.15| 3.95| 276.7| 907.2| 3.42| 3.61| 3.77| 52.7| 49.7| 106.9| 14.1| 14.0| 14.6| 14.0| 13.9| 14.5
|
||||
concurrent@8| 2.26| 7.83| 541.4| 1775.4| 3.47| 3.70| 3.79| 57.3| 50.9| 171.3| 14.3| 14.3| 14.4| 14.2| 14.2| 14.4
|
||||
concurrent@16| 4.33| 15.57| 1034.8| 3401.7| 3.60| 3.81| 4.22| 72.3| 52.0| 292.9| 14.8| 14.7| 16.3| 14.7| 14.7| 16.3
|
||||
concurrent@32| 8.44| 31.12| 2029.7| 6641.5| 3.69| 3.89| 4.24| 91.6| 62.6| 504.6| 15.0| 15.0| 15.4| 14.9| 14.9| 15.4
|
||||
concurrent@64| 13.64| 61.40| 3333.9| 10787.0| 4.50| 4.61| 5.67| 171.3| 101.2| 1165.6| 17.8| 17.7| 19.2| 17.7| 17.6| 19.1
|
||||
concurrent@128| 16.07| 119.45| 3897.0| 12679.4| 7.43| 7.63| 9.74| 446.4| 195.8| 2533.1| 28.9| 28.9| 31.0| 28.8| 28.8| 30.9
|
||||
=======================================================================================================================================================
|
||||
|
||||
Saving benchmarks report...
|
||||
Benchmarks report saved to /benchmarks.json
|
||||
|
||||
Benchmarking complete.
|
|
@ -0,0 +1,170 @@
|
|||
Collecting uv
|
||||
Downloading uv-0.8.19-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
|
||||
Downloading uv-0.8.19-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (20.9 MB)
|
||||
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 20.9/20.9 MB 126.9 MB/s eta 0:00:00
|
||||
Installing collected packages: uv
|
||||
Successfully installed uv-0.8.19
|
||||
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
|
||||
|
||||
[notice] A new release of pip is available: 24.0 -> 25.2
|
||||
[notice] To update, run: pip install --upgrade pip
|
||||
Using Python 3.11.13 environment at: /usr/local
|
||||
Resolved 61 packages in 561ms
|
||||
Downloading hf-xet (3.0MiB)
|
||||
Downloading pillow (6.3MiB)
|
||||
Downloading transformers (11.1MiB)
|
||||
Downloading pyarrow (40.8MiB)
|
||||
Downloading numpy (16.2MiB)
|
||||
Downloading pandas (11.8MiB)
|
||||
Downloading tokenizers (3.1MiB)
|
||||
Downloading pydantic-core (1.9MiB)
|
||||
Downloading pygments (1.2MiB)
|
||||
Downloading aiohttp (1.7MiB)
|
||||
Downloading pydantic-core
|
||||
Downloading aiohttp
|
||||
Downloading tokenizers
|
||||
Downloading hf-xet
|
||||
Downloading pygments
|
||||
Downloading pillow
|
||||
Downloading numpy
|
||||
Downloading pandas
|
||||
Downloading transformers
|
||||
Downloading pyarrow
|
||||
Prepared 61 packages in 1.25s
|
||||
Installed 61 packages in 114ms
|
||||
+ aiohappyeyeballs==2.6.1
|
||||
+ aiohttp==3.12.15
|
||||
+ aiosignal==1.4.0
|
||||
+ annotated-types==0.7.0
|
||||
+ anyio==4.10.0
|
||||
+ attrs==25.3.0
|
||||
+ certifi==2025.8.3
|
||||
+ charset-normalizer==3.4.3
|
||||
+ click==8.1.8
|
||||
+ datasets==4.1.1
|
||||
+ dill==0.4.0
|
||||
+ filelock==3.19.1
|
||||
+ frozenlist==1.7.0
|
||||
+ fsspec==2025.9.0
|
||||
+ ftfy==6.3.1
|
||||
+ guidellm==0.3.0
|
||||
+ h11==0.16.0
|
||||
+ h2==4.3.0
|
||||
+ hf-xet==1.1.10
|
||||
+ hpack==4.1.0
|
||||
+ httpcore==1.0.9
|
||||
+ httpx==0.28.1
|
||||
+ huggingface-hub==0.35.0
|
||||
+ hyperframe==6.1.0
|
||||
+ idna==3.10
|
||||
+ loguru==0.7.3
|
||||
+ markdown-it-py==4.0.0
|
||||
+ mdurl==0.1.2
|
||||
+ multidict==6.6.4
|
||||
+ multiprocess==0.70.16
|
||||
+ numpy==2.3.3
|
||||
+ packaging==25.0
|
||||
+ pandas==2.3.2
|
||||
+ pillow==11.3.0
|
||||
+ propcache==0.3.2
|
||||
+ protobuf==6.32.1
|
||||
+ pyarrow==21.0.0
|
||||
+ pydantic==2.11.9
|
||||
+ pydantic-core==2.33.2
|
||||
+ pydantic-settings==2.10.1
|
||||
+ pygments==2.19.2
|
||||
+ python-dateutil==2.9.0.post0
|
||||
+ python-dotenv==1.1.1
|
||||
+ pytz==2025.2
|
||||
+ pyyaml==6.0.2
|
||||
+ regex==2025.9.18
|
||||
+ requests==2.32.5
|
||||
+ rich==14.1.0
|
||||
+ safetensors==0.6.2
|
||||
+ six==1.17.0
|
||||
+ sniffio==1.3.1
|
||||
+ tokenizers==0.22.1
|
||||
+ tqdm==4.67.1
|
||||
+ transformers==4.56.2
|
||||
+ typing-extensions==4.15.0
|
||||
+ typing-inspection==0.4.1
|
||||
+ tzdata==2025.2
|
||||
+ urllib3==2.5.0
|
||||
+ wcwidth==0.2.14
|
||||
+ xxhash==3.5.0
|
||||
+ yarl==1.20.1
|
||||
Using Python 3.11.13 environment at: /usr/local
|
||||
Audited 1 package in 3ms
|
||||
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
|
||||
Creating backend...
|
||||
Backend openai_http connected to http://vllm-server:8000 for model meta-llama/Llama-3.2-3B-Instruct.
|
||||
Creating request loader...
|
||||
Created loader with 1000 unique requests from prompt_tokens=512,output_tokens=256.
|
||||
|
||||
|
||||
╭─ Benchmarks ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
|
||||
│ [18:11:47] ⠋ 100% concurrent@1 (complete) Req: 0.3 req/s, 3.35s Lat, 1.0 Conc, 17 Comp, 1 Inc, 0 Err │
|
||||
│ Tok: 76.4 gen/s, 239.4 tot/s, 29.6ms TTFT, 13.0ms ITL, 547 Prompt, 256 Gen │
|
||||
│ [18:12:52] ⠋ 100% concurrent@2 (complete) Req: 0.6 req/s, 3.53s Lat, 2.0 Conc, 32 Comp, 2 Inc, 0 Err │
|
||||
│ Tok: 145.0 gen/s, 454.5 tot/s, 36.9ms TTFT, 13.7ms ITL, 546 Prompt, 256 Gen │
|
||||
│ [18:13:57] ⠋ 100% concurrent@4 (complete) Req: 1.1 req/s, 3.59s Lat, 4.0 Conc, 64 Comp, 4 Inc, 0 Err │
|
||||
│ Tok: 284.8 gen/s, 892.7 tot/s, 59.0ms TTFT, 13.9ms ITL, 546 Prompt, 256 Gen │
|
||||
│ [18:15:02] ⠋ 100% concurrent@8 (complete) Req: 2.2 req/s, 3.70s Lat, 8.0 Conc, 128 Comp, 7 Inc, 0 Err │
|
||||
│ Tok: 553.5 gen/s, 1735.2 tot/s, 79.8ms TTFT, 14.2ms ITL, 547 Prompt, 256 Gen │
|
||||
│ [18:16:08] ⠋ 100% concurrent@16 (complete) Req: 4.2 req/s, 3.83s Lat, 16.0 Conc, 240 Comp, 16 Inc, 0 Err │
|
||||
│ Tok: 1066.9 gen/s, 3344.6 tot/s, 97.5ms TTFT, 14.6ms ITL, 547 Prompt, 256 Gen │
|
||||
│ [18:17:13] ⠋ 100% concurrent@32 (complete) Req: 8.1 req/s, 3.94s Lat, 31.8 Conc, 480 Comp, 31 Inc, 0 Err │
|
||||
│ Tok: 2069.7 gen/s, 6488.4 tot/s, 120.8ms TTFT, 15.0ms ITL, 547 Prompt, 256 Gen │
|
||||
│ [18:18:20] ⠋ 100% concurrent@64 (complete) Req: 13.6 req/s, 4.60s Lat, 62.3 Conc, 813 Comp, 57 Inc, 0 Err │
|
||||
│ Tok: 3472.1 gen/s, 10884.9 tot/s, 190.9ms TTFT, 17.3ms ITL, 547 Prompt, 256 Gen │
|
||||
│ [18:19:28] ⠋ 100% concurrent@128 (complete) Req: 16.8 req/s, 7.37s Lat, 123.5 Conc, 1005 Comp, 126 Inc, 0 Err │
|
||||
│ Tok: 4289.1 gen/s, 13445.8 tot/s, 356.4ms TTFT, 27.5ms ITL, 547 Prompt, 256 Gen │
|
||||
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
|
||||
Generating... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ (8/8) [ 0:08:43 < 0:00:00 ]
|
||||
|
||||
Benchmarks Metadata:
|
||||
Run id:8ccb6da1-83f4-4624-8d84-07c723b0b2a5
|
||||
Duration:530.4 seconds
|
||||
Profile:type=concurrent, strategies=['concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent', 'concurrent'], streams=[1, 2, 4, 8, 16, 32, 64, 128]
|
||||
Args:max_number=None, max_duration=60.0, warmup_number=None, warmup_duration=3.0, cooldown_number=None, cooldown_duration=None
|
||||
Worker:type_='generative_requests_worker' backend_type='openai_http' backend_target='http://vllm-server:8000' backend_model='meta-llama/Llama-3.2-3B-Instruct' backend_info={'max_output_tokens':
|
||||
16384, 'timeout': 300, 'http2': True, 'follow_redirects': True, 'headers': {}, 'text_completions_path': '/v1/completions', 'chat_completions_path': '/v1/chat/completions'}
|
||||
Request Loader:type_='generative_request_loader' data='prompt_tokens=512,output_tokens=256' data_args=None processor='meta-llama/Llama-3.2-3B-Instruct' processor_args=None
|
||||
Extras:None
|
||||
|
||||
|
||||
Benchmarks Info:
|
||||
=====================================================================================================================================================
|
||||
Metadata |||| Requests Made ||| Prompt Tok/Req ||| Output Tok/Req ||| Prompt Tok Total||| Output Tok Total ||
|
||||
Benchmark| Start Time| End Time| Duration (s)| Comp| Inc| Err| Comp| Inc| Err| Comp| Inc| Err| Comp| Inc| Err| Comp| Inc| Err
|
||||
--------------|-----------|---------|-------------|------|-----|-----|------|------|----|------|------|----|-------|------|----|--------|------|-----
|
||||
concurrent@1| 18:11:52| 18:12:52| 60.0| 17| 1| 0| 546.5| 512.0| 0.0| 256.0| 231.0| 0.0| 9291| 512| 0| 4352| 231| 0
|
||||
concurrent@2| 18:12:57| 18:13:57| 60.0| 32| 2| 0| 546.5| 512.0| 0.0| 256.0| 251.0| 0.0| 17488| 1024| 0| 8192| 502| 0
|
||||
concurrent@4| 18:14:02| 18:15:02| 60.0| 64| 4| 0| 546.4| 512.0| 0.0| 256.0| 175.2| 0.0| 34972| 2048| 0| 16384| 701| 0
|
||||
concurrent@8| 18:15:07| 18:16:07| 60.0| 128| 7| 0| 546.6| 512.0| 0.0| 256.0| 50.7| 0.0| 69966| 3584| 0| 32768| 355| 0
|
||||
concurrent@16| 18:16:13| 18:17:13| 60.0| 240| 16| 0| 546.5| 512.0| 0.0| 256.0| 166.0| 0.0| 131170| 8192| 0| 61440| 2656| 0
|
||||
concurrent@32| 18:17:18| 18:18:18| 60.0| 480| 31| 0| 546.5| 512.0| 0.0| 256.0| 47.4| 0.0| 262339| 15872| 0| 122880| 1468| 0
|
||||
concurrent@64| 18:18:25| 18:19:25| 60.0| 813| 57| 0| 546.5| 512.0| 0.0| 256.0| 110.7| 0.0| 444341| 29184| 0| 208128| 6311| 0
|
||||
concurrent@128| 18:19:33| 18:20:33| 60.0| 1005| 126| 0| 546.5| 512.0| 0.0| 256.0| 65.8| 0.0| 549264| 64512| 0| 257280| 8296| 0
|
||||
=====================================================================================================================================================
|
||||
|
||||
|
||||
Benchmarks Stats:
|
||||
=======================================================================================================================================================
|
||||
Metadata | Request Stats || Out Tok/sec| Tot Tok/sec| Req Latency (sec) ||| TTFT (ms) ||| ITL (ms) ||| TPOT (ms) ||
|
||||
Benchmark| Per Second| Concurrency| mean| mean| mean| median| p99| mean| median| p99| mean| median| p99| mean| median| p99
|
||||
--------------|-----------|------------|------------|------------|------|--------|------|------|-------|-------|-----|-------|-----|-----|-------|-----
|
||||
concurrent@1| 0.30| 1.00| 76.4| 239.4| 3.35| 3.35| 3.38| 29.6| 29.0| 38.9| 13.0| 13.0| 13.1| 13.0| 13.0| 13.0
|
||||
concurrent@2| 0.57| 2.00| 145.0| 454.5| 3.53| 3.53| 3.55| 36.9| 39.0| 59.6| 13.7| 13.7| 13.8| 13.6| 13.7| 13.7
|
||||
concurrent@4| 1.11| 4.00| 284.8| 892.7| 3.59| 3.59| 3.65| 59.0| 65.7| 88.2| 13.9| 13.8| 14.1| 13.8| 13.8| 14.0
|
||||
concurrent@8| 2.16| 7.99| 553.5| 1735.2| 3.70| 3.69| 3.76| 79.8| 80.7| 152.6| 14.2| 14.2| 14.5| 14.1| 14.1| 14.4
|
||||
concurrent@16| 4.17| 15.97| 1066.9| 3344.6| 3.83| 3.82| 3.99| 97.5| 96.3| 283.9| 14.6| 14.6| 14.9| 14.6| 14.6| 14.8
|
||||
concurrent@32| 8.08| 31.84| 2069.7| 6488.4| 3.94| 3.90| 4.31| 120.8| 101.7| 564.3| 15.0| 14.9| 15.9| 14.9| 14.8| 15.9
|
||||
concurrent@64| 13.56| 62.34| 3472.1| 10884.9| 4.60| 4.54| 5.43| 190.9| 133.9| 1113.2| 17.3| 17.2| 18.2| 17.2| 17.2| 18.2
|
||||
concurrent@128| 16.75| 123.45| 4289.1| 13445.8| 7.37| 7.21| 9.21| 356.4| 161.9| 2319.9| 27.5| 27.5| 28.8| 27.4| 27.4| 28.7
|
||||
=======================================================================================================================================================
|
||||
|
||||
Saving benchmarks report...
|
||||
Benchmarks report saved to /benchmarks.json
|
||||
|
||||
Benchmarking complete.
|
Binary file not shown (new image added, 562 KiB).
|
@ -1,148 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Default values
|
||||
TARGET="stack"
|
||||
DURATION=60
|
||||
CONCURRENT=10
|
||||
|
||||
# Parse command line arguments
|
||||
usage() {
|
||||
echo "Usage: $0 [options]"
|
||||
echo "Options:"
|
||||
echo " -t, --target <stack|vllm> Target to benchmark (default: stack)"
|
||||
echo " -d, --duration <seconds> Duration in seconds (default: 60)"
|
||||
echo " -c, --concurrent <users> Number of concurrent users (default: 10)"
|
||||
echo " -h, --help Show this help message"
|
||||
echo ""
|
||||
echo "Examples:"
|
||||
echo " $0 --target vllm # Benchmark vLLM direct"
|
||||
echo " $0 --target stack # Benchmark Llama Stack (default)"
|
||||
echo " $0 -t vllm -d 120 -c 20 # vLLM with 120s duration, 20 users"
|
||||
}
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
-t|--target)
|
||||
TARGET="$2"
|
||||
shift 2
|
||||
;;
|
||||
-d|--duration)
|
||||
DURATION="$2"
|
||||
shift 2
|
||||
;;
|
||||
-c|--concurrent)
|
||||
CONCURRENT="$2"
|
||||
shift 2
|
||||
;;
|
||||
-h|--help)
|
||||
usage
|
||||
exit 0
|
||||
;;
|
||||
*)
|
||||
echo "Unknown option: $1"
|
||||
usage
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Validate target
|
||||
if [[ "$TARGET" != "stack" && "$TARGET" != "vllm" ]]; then
|
||||
echo "Error: Target must be 'stack' or 'vllm'"
|
||||
usage
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Set configuration based on target
|
||||
if [[ "$TARGET" == "vllm" ]]; then
|
||||
BASE_URL="http://vllm-server:8000/v1"
|
||||
JOB_NAME="vllm-benchmark-job"
|
||||
echo "Benchmarking vLLM direct..."
|
||||
else
|
||||
BASE_URL="http://llama-stack-benchmark-service:8323/v1/openai/v1"
|
||||
JOB_NAME="stack-benchmark-job"
|
||||
echo "Benchmarking Llama Stack..."
|
||||
fi
|
||||
|
||||
echo "Configuration:"
|
||||
echo " Target: $TARGET"
|
||||
echo " Base URL: $BASE_URL"
|
||||
echo " Duration: ${DURATION}s"
|
||||
echo " Concurrent users: $CONCURRENT"
|
||||
echo ""
|
||||
|
||||
# Create temporary job yaml
|
||||
TEMP_YAML="/tmp/benchmark-job-temp-$(date +%s).yaml"
|
||||
cat > "$TEMP_YAML" << EOF
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: $JOB_NAME
|
||||
namespace: default
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: benchmark
|
||||
image: python:3.11-slim
|
||||
command: ["/bin/bash"]
|
||||
args:
|
||||
- "-c"
|
||||
- |
|
||||
pip install aiohttp &&
|
||||
python3 /benchmark/benchmark.py \\
|
||||
--base-url $BASE_URL \\
|
||||
--model \${INFERENCE_MODEL} \\
|
||||
--duration $DURATION \\
|
||||
--concurrent $CONCURRENT
|
||||
env:
|
||||
- name: INFERENCE_MODEL
|
||||
value: "meta-llama/Llama-3.2-3B-Instruct"
|
||||
volumeMounts:
|
||||
- name: benchmark-script
|
||||
mountPath: /benchmark
|
||||
resources:
|
||||
requests:
|
||||
memory: "256Mi"
|
||||
cpu: "250m"
|
||||
limits:
|
||||
memory: "512Mi"
|
||||
cpu: "500m"
|
||||
volumes:
|
||||
- name: benchmark-script
|
||||
configMap:
|
||||
name: benchmark-script
|
||||
restartPolicy: Never
|
||||
backoffLimit: 3
|
||||
EOF
|
||||
|
||||
echo "Creating benchmark ConfigMap..."
|
||||
kubectl create configmap benchmark-script \
|
||||
--from-file=benchmark.py=benchmark.py \
|
||||
--dry-run=client -o yaml | kubectl apply -f -
|
||||
|
||||
echo "Cleaning up any existing benchmark job..."
|
||||
kubectl delete job $JOB_NAME 2>/dev/null || true
|
||||
|
||||
echo "Deploying benchmark Job..."
|
||||
kubectl apply -f "$TEMP_YAML"
|
||||
|
||||
echo "Waiting for job to start..."
|
||||
kubectl wait --for=condition=Ready pod -l job-name=$JOB_NAME --timeout=60s
|
||||
|
||||
echo "Following benchmark logs..."
|
||||
kubectl logs -f job/$JOB_NAME
|
||||
|
||||
echo "Job completed. Checking final status..."
|
||||
kubectl get job $JOB_NAME
|
||||
|
||||
# Clean up temporary file
|
||||
rm -f "$TEMP_YAML"
|
294
benchmarking/k8s-benchmark/scripts/generate_charts.py
Executable file
|
@ -0,0 +1,294 @@
|
|||
#!/usr/bin/env python3
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
# /// script
|
||||
# dependencies = [
|
||||
# "matplotlib",
|
||||
# ]
|
||||
# ///
|
||||
"""
|
||||
Script to generate benchmark charts from guidellm text results.
|
||||
Creates 2x2 grid charts with RPS, Request Latency, TTFT, and ITL metrics against concurrent@x values.
|
||||
Outputs one chart file per vLLM replica group, with each line representing one benchmark run.
|
||||
"""
|
||||
|
||||
import glob
|
||||
import os
|
||||
import re
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
def extract_setup_name(filename: str) -> str:
|
||||
"""Extract setup name from filename and format legend appropriately."""
|
||||
basename = os.path.basename(filename)
|
||||
|
||||
# Try new pattern: guidellm-benchmark-stack-s{stack_replicas}-sw{workers}-v{vllm_replicas}-{timestamp}.txt
|
||||
match = re.search(r"guidellm-benchmark-stack-s(\d+)-sw(\d+)-v(\d+)-(\d{8})-(\d{6})\.txt", basename)
|
||||
if match:
|
||||
stack_replicas = match.group(1)
|
||||
workers = match.group(2)
|
||||
vllm_replicas = match.group(3)
|
||||
date = match.group(4)
|
||||
time = match.group(5)
|
||||
return f"stack-s{stack_replicas}-sw{workers}-v{vllm_replicas}"
|
||||
|
||||
# Try new vLLM pattern: guidellm-benchmark-vllm-v{vllm_replicas}-{timestamp}.txt
|
||||
match = re.search(r"guidellm-benchmark-vllm-v(\d+)-(\d{8})-(\d{6})\.txt", basename)
|
||||
if match:
|
||||
vllm_replicas = match.group(1)
|
||||
date = match.group(2)
|
||||
time = match.group(3)
|
||||
return f"vllm-v{vllm_replicas}"
|
||||
|
||||
# Fall back to old pattern: guidellm-benchmark-{target}-{stack_replicas}-w{workers}-{vllm_replicas}-{timestamp}.txt
|
||||
match = re.search(r"guidellm-benchmark-([^-]+)-(\d+)-w(\d+)-(\d+)-(\d+)-(\d+)\.txt", basename)
|
||||
if match:
|
||||
target = match.group(1)
|
||||
stack_replicas = match.group(2)
|
||||
workers = match.group(3)
|
||||
vllm_replicas = match.group(4)
|
||||
date = match.group(5)
|
||||
time = match.group(6)
|
||||
|
||||
if target == "vllm":
|
||||
return f"vllm-{vllm_replicas}-w{workers}-{vllm_replicas}"
|
||||
else:
|
||||
return f"stack-replicas{stack_replicas}-w{workers}-vllm-replicas{vllm_replicas}-{date}-{time}"
|
||||
|
||||
# Fall back to older pattern: guidellm-benchmark-{target}-{stack_replicas}-{vllm_replicas}-{timestamp}.txt
|
||||
match = re.search(r"guidellm-benchmark-([^-]+)-(\d+)-(\d+)-(\d+)-(\d+)\.txt", basename)
|
||||
if match:
|
||||
target = match.group(1)
|
||||
stack_replicas = match.group(2)
|
||||
vllm_replicas = match.group(3)
|
||||
date = match.group(4)
|
||||
time = match.group(5)
|
||||
|
||||
if target == "vllm":
|
||||
return f"vllm-{vllm_replicas}-w1-{vllm_replicas}"
|
||||
else:
|
||||
return f"stack-replicas{stack_replicas}-vllm-replicas{vllm_replicas}-{date}-{time}"
|
||||
|
||||
return basename.replace("guidellm-benchmark-", "").replace(".txt", "")
|
||||
|
||||
|
||||
def parse_txt_file(filepath: str) -> list[tuple[float, float, float, float, float, str]]:
|
||||
"""
|
||||
Parse a text benchmark file and extract concurrent@x, RPS, TTFT, ITL, and request latency data.
|
||||
Returns list of (concurrency, rps_mean, ttft_mean, itl_mean, req_latency_mean, setup_name) tuples.
|
||||
"""
|
||||
setup_name = extract_setup_name(filepath)
|
||||
data_points = []
|
||||
|
||||
try:
|
||||
with open(filepath) as f:
|
||||
content = f.read()
|
||||
|
||||
# Find the benchmark stats table
|
||||
lines = content.split("\n")
|
||||
in_stats_table = False
|
||||
header_lines_seen = 0
|
||||
|
||||
for line in lines:
|
||||
line_stripped = line.strip()
|
||||
|
||||
# Look for the start of the stats table
|
||||
if "Benchmarks Stats:" in line:
|
||||
in_stats_table = True
|
||||
continue
|
||||
|
||||
if in_stats_table:
|
||||
# Skip the first few separator/header lines
|
||||
if line_stripped.startswith("=") or line_stripped.startswith("-"):
|
||||
header_lines_seen += 1
|
||||
if header_lines_seen >= 3: # After seeing multiple header lines, look for concurrent@ data
|
||||
if line_stripped.startswith("=") and "concurrent@" not in line_stripped:
|
||||
break
|
||||
continue
|
||||
|
||||
# Parse concurrent@ lines in the stats table (may have leading spaces)
|
||||
if in_stats_table and "concurrent@" in line:
|
||||
parts = [part.strip() for part in line.split("|")]
|
||||
|
||||
if len(parts) >= 12: # Make sure we have enough columns for new format
|
||||
try:
|
||||
# Extract concurrency from benchmark name (e.g., concurrent@1 -> 1)
|
||||
concurrent_match = re.search(r"concurrent@(\d+)", parts[0])
|
||||
if not concurrent_match:
|
||||
continue
|
||||
concurrency = float(concurrent_match.group(1))
|
||||
|
||||
# Extract metrics from the new table format
|
||||
# From your image, the table has these columns with | separators:
|
||||
# Benchmark | Per Second | Concurrency | Out Tok/sec | Tot Tok/sec | Req Latency (sec) | TTFT (ms) | ITL (ms) | TPOT (ms)
|
||||
# Looking at the mean/median/p99 structure, need to find the mean columns
|
||||
# The structure shows: mean | median | p99 for each metric
|
||||
rps_mean = float(parts[1]) # Per Second (RPS)
|
||||
req_latency_mean = float(parts[6]) * 1000 # Request latency mean (convert from sec to ms)
|
||||
ttft_mean = float(parts[9]) # TTFT mean column
|
||||
itl_mean = float(parts[12]) # ITL mean column
|
||||
|
||||
data_points.append((concurrency, rps_mean, ttft_mean, itl_mean, req_latency_mean, setup_name))
|
||||
|
||||
except (ValueError, IndexError) as e:
|
||||
print(f"Warning: Could not parse line '{line}' in {filepath}: {e}")
|
||||
continue
|
||||
|
||||
except (OSError, FileNotFoundError) as e:
|
||||
print(f"Error reading {filepath}: {e}")
|
||||
|
||||
return data_points
|
||||
|
||||
|
||||
def generate_charts(benchmark_dir: str = "results"):
|
||||
"""Generate 2x2 grid charts (RPS, Request Latency, TTFT, ITL) from benchmark text files."""
|
||||
# Find all text result files instead of JSON
|
||||
txt_pattern = os.path.join(benchmark_dir, "guidellm-benchmark-*.txt")
|
||||
txt_files = glob.glob(txt_pattern)
|
||||
|
||||
if not txt_files:
|
||||
print(f"No text files found matching pattern: {txt_pattern}")
|
||||
return
|
||||
|
||||
print(f"Found {len(txt_files)} text files")
|
||||
|
||||
# Parse all files and collect data
|
||||
all_data = {} # setup_name -> [(concurrency, rps, ttft, itl, req_latency), ...]
|
||||
|
||||
for txt_file in txt_files:
|
||||
print(f"Processing {txt_file}")
|
||||
data_points = parse_txt_file(txt_file)
|
||||
|
||||
for concurrency, rps, ttft, itl, req_latency, setup_name in data_points:
|
||||
if setup_name not in all_data:
|
||||
all_data[setup_name] = []
|
||||
all_data[setup_name].append((concurrency, rps, ttft, itl, req_latency))
|
||||
|
||||
if not all_data:
|
||||
print("No data found to plot")
|
||||
return
|
||||
|
||||
# Sort data points by concurrency for each setup
|
||||
for setup_name in all_data:
|
||||
all_data[setup_name].sort(key=lambda x: x[0]) # Sort by concurrency
|
||||
|
||||
# Group setups by vLLM replica number (original approach)
|
||||
replica_groups = {} # vllm_replica_count -> {setup_name: points}
|
||||
|
||||
for setup_name, points in all_data.items():
|
||||
# Extract vLLM replica number from setup name
|
||||
# Expected formats:
|
||||
# - New stack format: "stack-s{X}-sw{W}-v{Y}"
|
||||
# - New vLLM format: "vllm-v{Y}"
|
||||
# - Old formats: "stack-replicas{X}-w{W}-vllm-replicas{Y}" or "vllm-{Y}-w{W}-{Y}"
|
||||
|
||||
# Try new formats first
|
||||
vllm_match = re.search(r"-v(\d+)$", setup_name) # Matches both "stack-s1-sw2-v3" and "vllm-v1"
|
||||
if not vllm_match:
|
||||
# Try old stack format
|
||||
vllm_match = re.search(r"vllm-replicas(\d+)", setup_name)
|
||||
if not vllm_match:
|
||||
# Try old vLLM format: "vllm-{Y}-w{W}-{Y}"
|
||||
vllm_match = re.search(r"vllm-(\d+)-w\d+-\d+", setup_name)
|
||||
|
||||
if vllm_match:
|
||||
vllm_replica_num = int(vllm_match.group(1))
|
||||
if vllm_replica_num not in replica_groups:
|
||||
replica_groups[vllm_replica_num] = {}
|
||||
replica_groups[vllm_replica_num][setup_name] = points
|
||||
else:
|
||||
print(f"Warning: Could not extract vLLM replica count from setup name: {setup_name}")
|
||||
|
||||
def create_charts(data_dict, prefix, title_prefix):
|
||||
"""Create a 2x2 grid with RPS, Request Latency, TTFT, and ITL charts."""
|
||||
if not data_dict:
|
||||
print(f"No data found for {prefix}")
|
||||
return
|
||||
|
||||
# Create 2x2 subplot grid
|
||||
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
|
||||
fig.suptitle(f"{title_prefix} Benchmark Results", fontsize=16, fontweight="bold")
|
||||
|
||||
# Collect all unique concurrency values for tick setting
|
||||
all_concurrency_values = set()
|
||||
for points in data_dict.values():
|
||||
all_concurrency_values.update([p[0] for p in points])
|
||||
all_concurrency_values = sorted(all_concurrency_values)
|
||||
|
||||
# Plot data for each setup in alphabetical order
|
||||
for setup_name in sorted(data_dict.keys()):
|
||||
points = data_dict[setup_name]
|
||||
if not points:
|
||||
continue
|
||||
|
||||
concurrency_values = [p[0] for p in points]
|
||||
rps_values = [p[1] for p in points]
|
||||
ttft_values = [p[2] for p in points]
|
||||
itl_values = [p[3] for p in points]
|
||||
req_latency_values = [p[4] for p in points]
|
||||
|
||||
# RPS chart (top-left)
|
||||
ax1.plot(concurrency_values, rps_values, marker="o", label=setup_name, linewidth=2, markersize=6)
|
||||
|
||||
# Request Latency chart (top-right)
|
||||
ax2.plot(concurrency_values, req_latency_values, marker="o", label=setup_name, linewidth=2, markersize=6)
|
||||
|
||||
# TTFT chart (bottom-left)
|
||||
ax3.plot(concurrency_values, ttft_values, marker="o", label=setup_name, linewidth=2, markersize=6)
|
||||
|
||||
# ITL chart (bottom-right)
|
||||
ax4.plot(concurrency_values, itl_values, marker="o", label=setup_name, linewidth=2, markersize=6)
|
||||
|
||||
# Configure all charts after plotting data
|
||||
axes = [ax1, ax2, ax3, ax4]
|
||||
titles = ["RPS", "Request Latency", "TTFT", "ITL"]
|
||||
ylabels = [
|
||||
"Requests Per Second (RPS)",
|
||||
"Request Latency (ms)",
|
||||
"Time to First Token (ms)",
|
||||
"Inter Token Latency (ms)",
|
||||
]
|
||||
|
||||
for ax, title, ylabel in zip(axes, titles, ylabels, strict=False):
|
||||
ax.set_xlabel("Concurrency", fontsize=12)
|
||||
ax.set_ylabel(ylabel, fontsize=12)
|
||||
ax.set_title(title, fontsize=14, fontweight="bold")
|
||||
ax.set_xscale("log", base=2)
|
||||
ax.set_xticks(all_concurrency_values)
|
||||
ax.set_xticklabels([str(int(x)) for x in all_concurrency_values])
|
||||
ax.grid(True, alpha=0.3)
|
||||
|
||||
# Add legend to the right-most subplot (top-right)
|
||||
ax2.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
|
||||
|
||||
plt.tight_layout()
|
||||
|
||||
# Save the combined chart
|
||||
combined_filename = os.path.join(benchmark_dir, f"{prefix}_benchmark_results.png")
|
||||
plt.savefig(combined_filename, dpi=300, bbox_inches="tight")
|
||||
plt.close()
|
||||
print(f"Combined benchmark chart saved to {combined_filename}")
|
||||
|
||||
# Print grouping information
|
||||
for replica_count, data_dict in replica_groups.items():
|
||||
print(f"vLLM Replica {replica_count} setups: {list(data_dict.keys())}")
|
||||
|
||||
# Create separate charts for each replica group
|
||||
for replica_count, data_dict in replica_groups.items():
|
||||
prefix = f"vllm_replica{replica_count}"
|
||||
title = f"vLLM Replicas={replica_count}"
|
||||
create_charts(data_dict, prefix, title)
|
||||
|
||||
# Print summary
|
||||
print("\nSummary:")
|
||||
for setup_name, points in all_data.items():
|
||||
print(f"{setup_name}: {len(points)} data points")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
generate_charts()
|
103
benchmarking/k8s-benchmark/scripts/run-all-benchmarks.sh
Executable file
|
@ -0,0 +1,103 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
# Define benchmark configurations: (target, stack_replicas, vllm_replicas, stack_workers)
|
||||
configs=(
|
||||
"stack 1 1 1"
|
||||
"stack 1 1 2"
|
||||
"stack 1 1 4"
|
||||
"vllm 1 1 -"
|
||||
)
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Get the directory where this script is located
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
|
||||
echo "Running comprehensive GuideLL benchmark suite..."
|
||||
echo "Start time: $(date)"
|
||||
|
||||
# Default deployment names
|
||||
STACK_DEPLOYMENT="llama-stack-benchmark-server"
|
||||
VLLM_DEPLOYMENT="vllm-server"
|
||||
|
||||
# Scaling function
|
||||
scale_deployments() {
|
||||
local stack_replicas=$1
|
||||
local vllm_replicas=$2
|
||||
local workers=$3
|
||||
|
||||
echo "Scaling deployments..."
|
||||
|
||||
if [[ "$vllm_replicas" != "-" ]]; then
|
||||
echo "Scaling $VLLM_DEPLOYMENT to $vllm_replicas replicas..."
|
||||
kubectl scale deployment $VLLM_DEPLOYMENT --replicas=$vllm_replicas
|
||||
kubectl rollout status deployment $VLLM_DEPLOYMENT --timeout=600s
|
||||
fi
|
||||
|
||||
if [[ "$target" == "stack" ]]; then
|
||||
if [[ "$stack_replicas" != "-" ]]; then
|
||||
echo "Scaling $STACK_DEPLOYMENT to $stack_replicas replicas..."
|
||||
kubectl scale deployment $STACK_DEPLOYMENT --replicas=$stack_replicas
|
||||
kubectl rollout status deployment $STACK_DEPLOYMENT --timeout=600s
|
||||
fi
|
||||
|
||||
if [[ "$workers" != "-" ]]; then
|
||||
echo "Updating $STACK_DEPLOYMENT to use $workers workers..."
|
||||
kubectl set env deployment/$STACK_DEPLOYMENT LLAMA_STACK_WORKERS=$workers
|
||||
kubectl rollout status deployment $STACK_DEPLOYMENT --timeout=600s
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "All scaling operations completed. Waiting additional 30s for services to stabilize..."
|
||||
sleep 30
|
||||
}
|
||||
|
||||
|
||||
for config in "${configs[@]}"; do
|
||||
read -r target stack_replicas vllm_replicas workers <<< "$config"
|
||||
|
||||
echo ""
|
||||
echo "=========================================="
|
||||
if [[ "$workers" != "-" ]]; then
|
||||
echo "Running benchmark: $target (stack=$stack_replicas, vllm=$vllm_replicas, workers=$workers)"
|
||||
else
|
||||
echo "Running benchmark: $target (stack=$stack_replicas, vllm=$vllm_replicas)"
|
||||
fi
|
||||
echo "Start: $(date)"
|
||||
echo "=========================================="
|
||||
|
||||
# Scale deployments before running benchmark
|
||||
scale_deployments "$stack_replicas" "$vllm_replicas" "$workers"
|
||||
|
||||
# Generate output filename with setup info
|
||||
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
|
||||
if [[ "$target" == "stack" ]]; then
|
||||
OUTPUT_FILE="results/guidellm-benchmark-${target}-s${stack_replicas}-sw${workers}-v${vllm_replicas}-${TIMESTAMP}.txt"
|
||||
else
|
||||
OUTPUT_FILE="results/guidellm-benchmark-${target}-v${vllm_replicas}-${TIMESTAMP}.txt"
|
||||
fi
|
||||
|
||||
# Run the benchmark with the cluster as configured
|
||||
"$SCRIPT_DIR/run-guidellm-benchmark.sh" \
|
||||
--target "$target" \
|
||||
--output-file "$OUTPUT_FILE"
|
||||
|
||||
echo "Completed: $(date)"
|
||||
echo "Waiting 30 seconds before next benchmark..."
|
||||
sleep 30
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo "=========================================="
|
||||
echo "All benchmarks completed!"
|
||||
echo "End time: $(date)"
|
||||
echo "=========================================="
|
||||
echo ""
|
||||
echo "Results files generated:"
|
||||
ls -la results/guidellm-*.txt results/guidellm-*.json 2>/dev/null || echo "No result files found"
|
219
benchmarking/k8s-benchmark/scripts/run-guidellm-benchmark.sh
Executable file
|
@ -0,0 +1,219 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Default values
|
||||
TARGET="stack"
|
||||
MAX_SECONDS=60
|
||||
PROMPT_TOKENS=512
|
||||
OUTPUT_TOKENS=256
|
||||
RATE_TYPE="concurrent"
|
||||
RATE="1,2,4,8,16,32,64,128"
|
||||
STACK_DEPLOYMENT="llama-stack-benchmark-server"
|
||||
STACK_URL="http://llama-stack-benchmark-service:8323/v1/openai"
|
||||
VLLM_DEPLOYMENT="vllm-server"
|
||||
OUTPUT_FILE=""
|
||||
|
||||
# Parse command line arguments
|
||||
usage() {
|
||||
echo "Usage: $0 [options]"
|
||||
echo "Options:"
|
||||
echo " -t, --target <stack|vllm> Target to benchmark (default: stack)"
|
||||
echo " -s, --max-seconds <seconds> Maximum duration in seconds (default: 60)"
|
||||
echo " -p, --prompt-tokens <tokens> Number of prompt tokens (default: 512)"
|
||||
echo " -o, --output-tokens <tokens> Number of output tokens (default: 256)"
|
||||
echo " -r, --rate-type <type> Rate type (default: concurrent)"
|
||||
echo " -c, --rate Rate (default: 1,2,4,8,16,32,64,128)"
|
||||
echo " --output-file <path> Output file path (default: auto-generated)"
|
||||
echo " --stack-deployment <name> Name of the stack deployment (default: llama-stack-benchmark-server)"
|
||||
echo " --vllm-deployment <name> Name of the vllm deployment (default: vllm-server)"
|
||||
echo " --stack-url <url> URL of the stack service (default: http://llama-stack-benchmark-service:8323/v1/openai)"
|
||||
echo " -h, --help Show this help message"
|
||||
echo ""
|
||||
echo "Examples:"
|
||||
echo " $0 --target vllm # Benchmark vLLM direct"
|
||||
echo " $0 --target stack # Benchmark Llama Stack (default)"
|
||||
echo " $0 -t vllm -s 60 -p 512 -o 256 # vLLM with custom parameters"
|
||||
echo " $0 --output-file results/my-benchmark.txt # Specify custom output file"
|
||||
echo " $0 --stack-deployment my-stack-server # Use custom stack deployment name"
|
||||
}
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
-t|--target)
|
||||
TARGET="$2"
|
||||
shift 2
|
||||
;;
|
||||
-s|--max-seconds)
|
||||
MAX_SECONDS="$2"
|
||||
shift 2
|
||||
;;
|
||||
-p|--prompt-tokens)
|
||||
PROMPT_TOKENS="$2"
|
||||
shift 2
|
||||
;;
|
||||
-o|--output-tokens)
|
||||
OUTPUT_TOKENS="$2"
|
||||
shift 2
|
||||
;;
|
||||
-r|--rate-type)
|
||||
RATE_TYPE="$2"
|
||||
shift 2
|
||||
;;
|
||||
-c|--rate)
|
||||
RATE="$2"
|
||||
shift 2
|
||||
;;
|
||||
--output-file)
|
||||
OUTPUT_FILE="$2"
|
||||
shift 2
|
||||
;;
|
||||
--stack-deployment)
|
||||
STACK_DEPLOYMENT="$2"
|
||||
shift 2
|
||||
;;
|
||||
--vllm-deployment)
|
||||
VLLM_DEPLOYMENT="$2"
|
||||
shift 2
|
||||
;;
|
||||
--stack-url)
|
||||
STACK_URL="$2"
|
||||
shift 2
|
||||
;;
|
||||
-h|--help)
|
||||
usage
|
||||
exit 0
|
||||
;;
|
||||
*)
|
||||
echo "Unknown option: $1"
|
||||
usage
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Validate target
|
||||
if [[ "$TARGET" != "stack" && "$TARGET" != "vllm" ]]; then
|
||||
echo "Error: Target must be 'stack' or 'vllm'"
|
||||
usage
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Set configuration based on target
|
||||
if [[ "$TARGET" == "vllm" ]]; then
|
||||
BASE_URL="http://${VLLM_DEPLOYMENT}:8000"
|
||||
JOB_NAME="guidellm-vllm-benchmark-job"
|
||||
echo "Benchmarking vLLM direct with GuideLLM..."
|
||||
else
|
||||
BASE_URL="$STACK_URL"
|
||||
JOB_NAME="guidellm-stack-benchmark-job"
|
||||
echo "Benchmarking Llama Stack with GuideLLM..."
|
||||
fi
|
||||
|
||||
|
||||
echo "Configuration:"
|
||||
echo " Target: $TARGET"
|
||||
echo " Base URL: $BASE_URL"
|
||||
echo " Max seconds: ${MAX_SECONDS}s"
|
||||
echo " Prompt tokens: $PROMPT_TOKENS"
|
||||
echo " Output tokens: $OUTPUT_TOKENS"
|
||||
echo " Rate type: $RATE_TYPE"
|
||||
if [[ "$TARGET" == "vllm" ]]; then
|
||||
echo " vLLM deployment: $VLLM_DEPLOYMENT"
|
||||
else
|
||||
echo " Stack deployment: $STACK_DEPLOYMENT"
|
||||
fi
|
||||
echo ""
|
||||
|
||||
# Create temporary job yaml
|
||||
TEMP_YAML="/tmp/guidellm-benchmark-job-temp-$(date +%s).yaml"
|
||||
cat > "$TEMP_YAML" << EOF
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: $JOB_NAME
|
||||
namespace: default
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: guidellm-benchmark
|
||||
image: python:3.11-slim
|
||||
command: ["/bin/bash"]
|
||||
args:
|
||||
- "-c"
|
||||
- |
|
||||
# Install uv and guidellm
|
||||
pip install uv &&
|
||||
uv pip install --system guidellm &&
|
||||
|
||||
# Login to HuggingFace
|
||||
uv pip install --system huggingface_hub &&
|
||||
python -c "from huggingface_hub import login; login(token='\$HF_TOKEN')" &&
|
||||
|
||||
# Run GuideLLM benchmark and save output
|
||||
export COLUMNS=200
|
||||
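# GUIDELLM__PREFERRED_ROUTE steers guidellm toward the chat completions endpoint
# (/v1/chat/completions) rather than plain /v1/completions when benchmarking.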
GUIDELLM__PREFERRED_ROUTE="chat_completions" uv run guidellm benchmark run \\
|
||||
--target "$BASE_URL" \\
|
||||
--rate-type "$RATE_TYPE" \\
|
||||
--max-seconds $MAX_SECONDS \\
|
||||
--data "prompt_tokens=$PROMPT_TOKENS,output_tokens=$OUTPUT_TOKENS" \\
|
||||
--model "$INFERENCE_MODEL" \\
|
||||
--rate "$RATE" \\
|
||||
--warmup-percent 0.05 \\
|
||||
2>&1
|
||||
env:
|
||||
- name: INFERENCE_MODEL
|
||||
value: "meta-llama/Llama-3.2-3B-Instruct"
|
||||
- name: HF_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
resources:
|
||||
requests:
|
||||
memory: "4Gi"
|
||||
cpu: "500m"
|
||||
limits:
|
||||
memory: "8Gi"
|
||||
cpu: "2000m"
|
||||
restartPolicy: Never
|
||||
backoffLimit: 3
|
||||
EOF
|
||||
|
||||
echo "Cleaning up any existing GuideLLM benchmark job..."
|
||||
kubectl delete job $JOB_NAME 2>/dev/null || true
|
||||
|
||||
echo "Deploying GuideLLM benchmark Job..."
|
||||
kubectl apply -f "$TEMP_YAML"
|
||||
|
||||
echo "Waiting for job to start..."
|
||||
kubectl wait --for=condition=Ready pod -l job-name=$JOB_NAME --timeout=120s
|
||||
|
||||
# Prepare file names and create results directory
|
||||
mkdir -p results
|
||||
if [[ -z "$OUTPUT_FILE" ]]; then
|
||||
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
|
||||
OUTPUT_FILE="results/guidellm-benchmark-${TARGET}-${TIMESTAMP}.txt"
|
||||
fi
|
||||
|
||||
echo "Following GuideLLM benchmark logs..."
|
||||
kubectl logs -f job/$JOB_NAME
|
||||
|
||||
echo "Job completed. Checking final status..."
|
||||
kubectl get job $JOB_NAME
|
||||
|
||||
# Save benchmark results using kubectl logs
|
||||
echo "Saving benchmark results..."
|
||||
kubectl logs job/$JOB_NAME > "$OUTPUT_FILE"
|
||||
|
||||
echo "Benchmark output saved to: $OUTPUT_FILE"
|
||||
|
||||
# Clean up temporary file
|
||||
rm -f "$TEMP_YAML"
|
|
@ -5,6 +5,7 @@ data:
|
|||
image_name: kubernetes-benchmark-demo
|
||||
apis:
|
||||
- agents
|
||||
- files
|
||||
- inference
|
||||
- safety
|
||||
|
@ -23,6 +24,14 @@ data:
|
|||
- provider_id: sentence-transformers
|
||||
provider_type: inline::sentence-transformers
|
||||
config: {}
|
||||
files:
|
||||
- provider_id: meta-reference-files
|
||||
provider_type: inline::localfs
|
||||
config:
|
||||
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
|
||||
metadata_store:
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
|
||||
vector_io:
|
||||
- provider_id: ${env.ENABLE_CHROMADB:+chromadb}
|
||||
provider_type: remote::chromadb
|
||||
|
|
|
@ -52,9 +52,20 @@ spec:
|
|||
value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
|
||||
- name: VLLM_TLS_VERIFY
|
||||
value: "false"
|
||||
command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8323"]
|
||||
- name: LLAMA_STACK_LOGGING
|
||||
value: "all=WARNING"
|
||||
- name: LLAMA_STACK_CONFIG
|
||||
value: "/etc/config/stack_run_config.yaml"
|
||||
- name: LLAMA_STACK_WORKERS
|
||||
value: "${LLAMA_STACK_WORKERS}"
|
||||
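# Run the server via uvicorn with a configurable number of worker processes
# (LLAMA_STACK_WORKERS); --factory tells uvicorn that create_app is an app factory.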
command: ["uvicorn", "llama_stack.core.server.server:create_app", "--host", "0.0.0.0", "--port", "8323", "--workers", "$(LLAMA_STACK_WORKERS)", "--factory"]
|
||||
ports:
|
||||
- containerPort: 8323
|
||||
resources:
|
||||
requests:
|
||||
cpu: "4"
|
||||
limits:
|
||||
cpu: "4"
|
||||
volumeMounts:
|
||||
- name: llama-storage
|
||||
mountPath: /root/.llama
|
||||
|
|
|
@ -1,20 +0,0 @@
|
|||
# Minimal makefile for Sphinx documentation
|
||||
#
|
||||
|
||||
# You can set these variables from the command line, and also
|
||||
# from the environment for the first two.
|
||||
SPHINXOPTS ?=
|
||||
SPHINXBUILD ?= sphinx-build
|
||||
SOURCEDIR = source
|
||||
BUILDDIR = _build
|
||||
|
||||
# Put it first so that "make" without argument is like "make help".
|
||||
help:
|
||||
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||
|
||||
.PHONY: help Makefile
|
||||
|
||||
# Catch-all target: route all unknown targets to Sphinx using the new
|
||||
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
|
||||
%: Makefile
|
||||
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
|
@ -1,14 +1,17 @@
|
|||
# Llama Stack Documentation
|
||||
|
||||
Here's a collection of comprehensive guides, examples, and resources for building AI applications with Llama Stack. For the complete documentation, visit our [ReadTheDocs page](https://llama-stack.readthedocs.io/en/latest/index.html).
|
||||
Here's a collection of comprehensive guides, examples, and resources for building AI applications with Llama Stack. For the complete documentation, visit our [GitHub Pages site](https://llamastack.github.io/getting_started/quickstart).
|
||||
|
||||
## Render locally
|
||||
|
||||
From the llama-stack root directory, run the following command to render the docs locally:
|
||||
From the llama-stack `docs/` directory, run the following commands to render the docs locally:
|
||||
```bash
|
||||
uv run --group docs sphinx-autobuild docs/source docs/build/html --write-all
|
||||
npm install
|
||||
npm run gen-api-docs all
|
||||
npm run build
|
||||
npm run serve
|
||||
```
|
||||
You can open up the docs in your browser at http://localhost:8000
|
||||
You can open up the docs in your browser at http://localhost:3000
|
||||
|
||||
## Content
|
||||
|
||||
|
|
136 docs/_static/css/my_theme.css vendored
|
@ -1,136 +0,0 @@
|
|||
@import url("theme.css");
|
||||
|
||||
/* Horizontal Navigation Bar */
|
||||
.horizontal-nav {
|
||||
background-color: #ffffff;
|
||||
border-bottom: 1px solid #e5e5e5;
|
||||
padding: 0;
|
||||
position: fixed;
|
||||
top: 0;
|
||||
left: 0;
|
||||
right: 0;
|
||||
z-index: 1050;
|
||||
height: 50px;
|
||||
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
|
||||
}
|
||||
|
||||
[data-theme="dark"] .horizontal-nav {
|
||||
background-color: #1a1a1a;
|
||||
border-bottom: 1px solid #333;
|
||||
}
|
||||
|
||||
.horizontal-nav .nav-container {
|
||||
max-width: 1200px;
|
||||
margin: 0 auto;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: space-between;
|
||||
padding: 0 20px;
|
||||
height: 100%;
|
||||
}
|
||||
|
||||
.horizontal-nav .nav-brand {
|
||||
font-size: 18px;
|
||||
font-weight: 600;
|
||||
color: #333;
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
[data-theme="dark"] .horizontal-nav .nav-brand {
|
||||
color: #fff;
|
||||
}
|
||||
|
||||
.horizontal-nav .nav-links {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 30px;
|
||||
list-style: none;
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
.horizontal-nav .nav-links a {
|
||||
color: #666;
|
||||
text-decoration: none;
|
||||
font-size: 14px;
|
||||
font-weight: 500;
|
||||
padding: 8px 12px;
|
||||
border-radius: 6px;
|
||||
transition: all 0.2s ease;
|
||||
}
|
||||
|
||||
.horizontal-nav .nav-links a:hover,
|
||||
.horizontal-nav .nav-links a.active {
|
||||
color: #333;
|
||||
background-color: #f5f5f5;
|
||||
}
|
||||
|
||||
.horizontal-nav .nav-links a.active {
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
[data-theme="dark"] .horizontal-nav .nav-links a {
|
||||
color: #ccc;
|
||||
}
|
||||
|
||||
[data-theme="dark"] .horizontal-nav .nav-links a:hover,
|
||||
[data-theme="dark"] .horizontal-nav .nav-links a.active {
|
||||
color: #fff;
|
||||
background-color: #333;
|
||||
}
|
||||
|
||||
.horizontal-nav .nav-links .github-link {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 6px;
|
||||
}
|
||||
|
||||
.horizontal-nav .nav-links .github-icon {
|
||||
width: 16px;
|
||||
height: 16px;
|
||||
fill: currentColor;
|
||||
}
|
||||
|
||||
/* Adjust main content to account for fixed nav */
|
||||
.wy-nav-side {
|
||||
top: 50px;
|
||||
height: calc(100vh - 50px);
|
||||
}
|
||||
|
||||
.wy-nav-content-wrap {
|
||||
margin-top: 50px;
|
||||
}
|
||||
|
||||
.wy-nav-content {
|
||||
max-width: 90%;
|
||||
}
|
||||
|
||||
.wy-nav-side {
|
||||
/* background: linear-gradient(45deg, #2980B9, #16A085); */
|
||||
background: linear-gradient(90deg, #332735, #1b263c);
|
||||
}
|
||||
|
||||
.wy-side-nav-search {
|
||||
background-color: transparent !important;
|
||||
}
|
||||
|
||||
.hide-title h1 {
|
||||
display: none;
|
||||
}
|
||||
|
||||
h2, h3, h4 {
|
||||
font-weight: normal;
|
||||
}
|
||||
html[data-theme="dark"] .rst-content div[class^="highlight"] {
|
||||
background-color: #0b0b0b;
|
||||
}
|
||||
pre {
|
||||
white-space: pre-wrap !important;
|
||||
word-break: break-all;
|
||||
}
|
||||
|
||||
[data-theme="dark"] .mermaid {
|
||||
background-color: #f4f4f6 !important;
|
||||
border-radius: 6px;
|
||||
padding: 0.5em;
|
||||
}
|
32 docs/_static/js/detect_theme.js vendored
|
@ -1,32 +0,0 @@
|
|||
document.addEventListener("DOMContentLoaded", function () {
|
||||
const prefersDark = window.matchMedia("(prefers-color-scheme: dark)").matches;
|
||||
const htmlElement = document.documentElement;
|
||||
|
||||
// Check if theme is saved in localStorage
|
||||
const savedTheme = localStorage.getItem("sphinx-rtd-theme");
|
||||
|
||||
if (savedTheme) {
|
||||
// Use the saved theme preference
|
||||
htmlElement.setAttribute("data-theme", savedTheme);
|
||||
document.body.classList.toggle("dark", savedTheme === "dark");
|
||||
} else {
|
||||
// Fall back to system preference
|
||||
const theme = prefersDark ? "dark" : "light";
|
||||
htmlElement.setAttribute("data-theme", theme);
|
||||
document.body.classList.toggle("dark", theme === "dark");
|
||||
// Save initial preference
|
||||
localStorage.setItem("sphinx-rtd-theme", theme);
|
||||
}
|
||||
|
||||
// Listen for theme changes from the existing toggle
|
||||
const observer = new MutationObserver(function(mutations) {
|
||||
mutations.forEach(function(mutation) {
|
||||
if (mutation.attributeName === "data-theme") {
|
||||
const currentTheme = htmlElement.getAttribute("data-theme");
|
||||
localStorage.setItem("sphinx-rtd-theme", currentTheme);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
observer.observe(htmlElement, { attributes: true });
|
||||
});
|
44 docs/_static/js/horizontal_nav.js vendored
|
@ -1,44 +0,0 @@
|
|||
// Horizontal Navigation Bar for Llama Stack Documentation
|
||||
document.addEventListener('DOMContentLoaded', function() {
|
||||
// Create the horizontal navigation HTML
|
||||
const navHTML = `
|
||||
<nav class="horizontal-nav">
|
||||
<div class="nav-container">
|
||||
<a href="/" class="nav-brand">Llama Stack</a>
|
||||
<ul class="nav-links">
|
||||
<li><a href="/">Docs</a></li>
|
||||
<li><a href="/references/api_reference/">API Reference</a></li>
|
||||
<li><a href="https://github.com/meta-llama/llama-stack" target="_blank" class="github-link">
|
||||
<svg class="github-icon" viewBox="0 0 16 16" aria-hidden="true">
|
||||
<path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0016 8c0-4.42-3.58-8-8-8z"/>
|
||||
</svg>
|
||||
GitHub
|
||||
</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav>
|
||||
`;
|
||||
|
||||
// Insert the navigation at the beginning of the body
|
||||
document.body.insertAdjacentHTML('afterbegin', navHTML);
|
||||
|
||||
// Update navigation links based on current page
|
||||
updateActiveNav();
|
||||
});
|
||||
|
||||
function updateActiveNav() {
|
||||
const currentPath = window.location.pathname;
|
||||
const navLinks = document.querySelectorAll('.horizontal-nav .nav-links a');
|
||||
|
||||
navLinks.forEach(link => {
|
||||
// Remove any existing active classes
|
||||
link.classList.remove('active');
|
||||
|
||||
// Add active class based on current path
|
||||
if (currentPath === '/' && link.getAttribute('href') === '/') {
|
||||
link.classList.add('active');
|
||||
} else if (currentPath.includes('/references/api_reference/') && link.getAttribute('href').includes('api_reference')) {
|
||||
link.classList.add('active');
|
||||
}
|
||||
});
|
||||
}
|
14 docs/_static/js/keyboard_shortcuts.js vendored
|
@ -1,14 +0,0 @@
|
|||
document.addEventListener('keydown', function(event) {
|
||||
// command+K or ctrl+K
|
||||
if ((event.metaKey || event.ctrlKey) && event.key === 'k') {
|
||||
event.preventDefault();
|
||||
document.querySelector('.search-input, .search-field, input[name="q"]').focus();
|
||||
}
|
||||
|
||||
// forward slash
|
||||
if (event.key === '/' &&
|
||||
!event.target.matches('input, textarea, select')) {
|
||||
event.preventDefault();
|
||||
document.querySelector('.search-input, .search-field, input[name="q"]').focus();
|
||||
}
|
||||
});
|
BIN docs/_static/llama-stack-logo.png vendored (binary file not shown; before: 70 KiB)
|
@ -1,24 +0,0 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import os
|
||||
import time
|
||||
|
||||
|
||||
def pytest_collection_modifyitems(items):
|
||||
for item in items:
|
||||
item.name = item.name.replace(' ', '_')
|
||||
|
||||
|
||||
def pytest_runtest_teardown(item):
|
||||
interval_seconds = os.getenv("LLAMA_STACK_TEST_INTERVAL_SECONDS")
|
||||
if interval_seconds:
|
||||
time.sleep(float(interval_seconds))
|
||||
|
||||
|
||||
def pytest_configure(config):
|
||||
config.option.tbstyle = "short"
|
||||
config.option.disable_warnings = True
|
|
@ -1,7 +0,0 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
sphinx-autobuild --write-all source build/html --watch source/
|
163 docs/docs/advanced_apis/evaluation.mdx Normal file
|
@ -0,0 +1,163 @@
|
|||
# Evaluation
|
||||
|
||||
## Evaluation Concepts
|
||||
|
||||
The Llama Stack Evaluation flow allows you to run evaluations on your GenAI application datasets or pre-registered benchmarks.
|
||||
|
||||
We introduce a set of APIs in Llama Stack for supporting running evaluations of LLM applications:
|
||||
- `/datasetio` + `/datasets` API
|
||||
- `/scoring` + `/scoring_functions` API
|
||||
- `/eval` + `/benchmarks` API
|
||||
|
||||
This guide goes over these sets of APIs and the developer experience flow of using Llama Stack to run evaluations for different use cases. Check out our Colab notebook with working evaluation examples [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing).
|
||||
|
||||
The Evaluation APIs are associated with a set of Resources. Please visit the Resources section in our [Core Concepts](../concepts/index.mdx) guide for better high-level understanding.
|
||||
|
||||
- **DatasetIO**: defines the interface to datasets and data loaders.
|
||||
- Associated with `Dataset` resource.
|
||||
- **Scoring**: evaluate outputs of the system.
|
||||
- Associated with `ScoringFunction` resource. We provide a suite of out-of-the-box scoring functions and also the ability for you to add custom evaluators. These scoring functions are the core part of defining an evaluation task that outputs evaluation metrics.
|
||||
- **Eval**: generate outputs (via Inference or Agents) and perform scoring.
|
||||
- Associated with `Benchmark` resource.
|
||||
|
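To make these resource relationships concrete, here is a minimal sketch of registering a `Benchmark` that ties a dataset to a set of scoring functions. It assumes a stack running at `localhost:8321`; the benchmark and dataset IDs are illustrative, and the dataset must already be registered via the `/datasets` API.

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# A Benchmark links a registered dataset to the scoring functions
# used to grade the generated outputs.
client.benchmarks.register(
    benchmark_id="my-benchmark",      # illustrative ID
    dataset_id="my_eval_dataset",     # must already be registered via /datasets
    scoring_functions=["basic::subset_of"],
)
```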
||||
## Evaluation Providers
|
||||
|
||||
Llama Stack provides multiple evaluation providers:
|
||||
|
||||
- **Meta Reference** (`inline::meta-reference`) - Meta's reference implementation with multi-language support
|
||||
- **NVIDIA** (`remote::nvidia`) - NVIDIA's evaluation platform integration
|
||||
|
||||
### Meta Reference
|
||||
|
||||
Meta's reference implementation of evaluation tasks with support for multiple languages and evaluation metrics.
|
||||
|
||||
#### Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `kvstore` | `RedisKVStoreConfig \| SqliteKVStoreConfig \| PostgresKVStoreConfig \| MongoDBKVStoreConfig` | No | sqlite | Key-value store configuration |
|
||||
|
||||
#### Sample Configuration
|
||||
|
||||
```yaml
|
||||
kvstore:
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/meta_reference_eval.db
|
||||
```
|
||||
|
||||
#### Features
|
||||
|
||||
- Multi-language evaluation support
|
||||
- Comprehensive evaluation metrics
|
||||
- Integration with various key-value stores (SQLite, Redis, PostgreSQL, MongoDB)
|
||||
- Built-in support for popular benchmarks
|
||||
|
||||
### NVIDIA
|
||||
|
||||
NVIDIA's evaluation provider for running evaluation tasks on NVIDIA's platform.
|
||||
|
||||
#### Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `evaluator_url` | `str` | No | http://0.0.0.0:7331 | The URL for accessing the evaluator service |
|
||||
|
||||
#### Sample Configuration
|
||||
|
||||
```yaml
|
||||
evaluator_url: ${env.NVIDIA_EVALUATOR_URL:=http://localhost:7331}
|
||||
```
|
||||
|
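For reference, a sketch of wiring this provider into a stack run config, following the same `provider_id`/`provider_type`/`config` pattern used for other providers in these docs:

```yaml
eval:
  - provider_id: nvidia
    provider_type: remote::nvidia
    config:
      evaluator_url: ${env.NVIDIA_EVALUATOR_URL:=http://localhost:7331}
```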
||||
#### Features
|
||||
|
||||
- Integration with NVIDIA's evaluation platform
|
||||
- Remote evaluation capabilities
|
||||
- Scalable evaluation processing
|
||||
|
||||
## Open-benchmark Eval
|
||||
|
||||
### List of open-benchmarks Llama Stack support
|
||||
|
||||
Llama Stack pre-registers several popular open-benchmarks so you can easily evaluate model performance via the CLI.
|
||||
|
||||
The list of open-benchmarks we currently support:
|
||||
- [MMLU-COT](https://arxiv.org/abs/2009.03300) (Measuring Massive Multitask Language Understanding): Benchmark designed to comprehensively evaluate the breadth and depth of a model's academic and professional understanding
|
||||
- [GPQA-COT](https://arxiv.org/abs/2311.12022) (A Graduate-Level Google-Proof Q&A Benchmark): A challenging benchmark of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry.
|
||||
- [SimpleQA](https://openai.com/index/introducing-simpleqa/): Benchmark designed to assess a model's ability to answer short, fact-seeking questions.
|
||||
- [MMMU](https://arxiv.org/abs/2311.16502) (A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI): Benchmark designed to evaluate multimodal models.
|
||||
|
||||
You can follow this [contributing guide](../references/evals_reference/index.mdx#open-benchmark-contributing-guide) to add more open-benchmarks to Llama Stack.
|
||||
|
||||
### Run evaluation on open-benchmarks via CLI
|
||||
|
||||
We have built-in functionality to run the supported open-benchmarks using the `llama-stack-client` CLI.
|
||||
|
||||
#### Spin up Llama Stack server
|
||||
|
||||
Spin up the Llama Stack server with the 'open-benchmark' template:
|
||||
```
|
||||
llama stack run llama_stack/distributions/open-benchmark/run.yaml
|
||||
|
||||
```
|
||||
|
||||
#### Run eval CLI
|
||||
There are three required inputs to run a benchmark eval:
|
||||
- `benchmark_ids`: The list of benchmark IDs to run evaluations on
|
||||
- `model_id`: The model ID to evaluate
|
||||
- `output_dir`: Path to store the evaluation results
|
||||
```
|
||||
llama-stack-client eval run-benchmark <benchmark_id_1> <benchmark_id_2> ... \
|
||||
--model_id <model id to evaluate on> \
|
||||
--output_dir <directory to store the evaluation results>
|
||||
```
|
||||
|
||||
You can run:
|
||||
```
|
||||
llama-stack-client eval run-benchmark help
|
||||
```
|
||||
to see descriptions of all the flags that `eval run-benchmark` supports.
|
||||
|
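For example, a concrete invocation looks like the following. The benchmark IDs are illustrative and must match benchmarks registered on your server (for instance, the pre-registered MMLU-COT and GPQA-COT benchmarks listed above):

```bash
llama-stack-client eval run-benchmark meta-reference-mmlu-cot meta-reference-gpqa-cot \
  --model_id meta-llama/Llama-3.3-70B-Instruct \
  --output_dir ./eval_results
```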
||||
In the output log, you can find the file path that contains your evaluation results. Open that file to see your aggregate evaluation results.
|
||||
|
||||
## Usage Example
|
||||
|
||||
Here's a basic example of using the evaluation API:
|
||||
|
||||
```python
|
||||
from llama_stack_client import LlamaStackClient
|
||||
|
||||
client = LlamaStackClient(base_url="http://localhost:8321")
|
||||
|
||||
# Register a dataset for evaluation
|
||||
client.datasets.register(
|
||||
purpose="evaluation",
|
||||
source={
|
||||
"type": "uri",
|
||||
"uri": "huggingface://datasets/llamastack/evaluation_dataset"
|
||||
},
|
||||
dataset_id="my_eval_dataset"
|
||||
)
|
||||
|
||||
# Run evaluation
|
||||
eval_result = client.eval.run_evaluation(
|
||||
dataset_id="my_eval_dataset",
|
||||
scoring_functions=["accuracy", "bleu"],
|
||||
model_id="my_model"
|
||||
)
|
||||
|
||||
print(f"Evaluation completed: {eval_result}")
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
- **Choose appropriate providers**: Use Meta Reference for comprehensive evaluation, NVIDIA for platform-specific needs
|
||||
- **Configure storage properly**: Ensure your key-value store configuration matches your performance requirements
|
||||
- **Monitor evaluation progress**: Large evaluations can take time - implement proper monitoring
|
||||
- **Use appropriate scoring functions**: Select scoring metrics that align with your evaluation goals
|
||||
|
||||
## What's Next?
|
||||
|
||||
- Check out our Colab notebook on working examples with running benchmark evaluations [here](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb#scrollTo=mxLCsP4MvFqP).
|
||||
- Check out our [Building Applications - Evaluation](../building_applications/evals.mdx) guide for more details on how to use the Evaluation APIs to evaluate your applications.
|
||||
- Check out our [Evaluation Reference](../references/evals_reference/index.mdx) for more details on the APIs.
|
||||
- Explore the [Scoring](./scoring.mdx) documentation for available scoring functions.
|
305 docs/docs/advanced_apis/post_training.mdx Normal file
|
@ -0,0 +1,305 @@
|
|||
# Post-Training
|
||||
|
||||
Post-training in Llama Stack allows you to fine-tune models using various providers and frameworks. This section covers all available post-training providers and how to use them effectively.
|
||||
|
||||
## Overview
|
||||
|
||||
Llama Stack provides multiple post-training providers:
|
||||
|
||||
- **HuggingFace SFTTrainer** (`inline::huggingface`) - Fine-tuning using HuggingFace ecosystem
|
||||
- **TorchTune** (`inline::torchtune`) - Fine-tuning using Meta's TorchTune framework
|
||||
- **NVIDIA** (`remote::nvidia`) - Fine-tuning using NVIDIA's platform
|
||||
|
||||
## HuggingFace SFTTrainer
|
||||
|
||||
[HuggingFace SFTTrainer](https://huggingface.co/docs/trl/en/sft_trainer) is an inline post-training provider for Llama Stack. It allows you to run supervised fine-tuning on a variety of models using many datasets.
|
||||
|
||||
### Features
|
||||
|
||||
- Simple access through the post_training API
|
||||
- Fully integrated with Llama Stack
|
||||
- GPU support, CPU support, and MPS support (macOS Metal Performance Shaders)
|
||||
|
||||
### Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `device` | `str` | No | cuda | |
|
||||
| `distributed_backend` | `Literal['fsdp', 'deepspeed']` | No | | |
|
||||
| `checkpoint_format` | `Literal['full_state', 'huggingface']` | No | huggingface | |
|
||||
| `chat_template` | `str` | No | | |
|
||||
| `model_specific_config` | `dict` | No | `{'trust_remote_code': True, 'attn_implementation': 'sdpa'}` | |
|
||||
| `max_seq_length` | `int` | No | 2048 | |
|
||||
| `gradient_checkpointing` | `bool` | No | False | |
|
||||
| `save_total_limit` | `int` | No | 3 | |
|
||||
| `logging_steps` | `int` | No | 10 | |
|
||||
| `warmup_ratio` | `float` | No | 0.1 | |
|
||||
| `weight_decay` | `float` | No | 0.01 | |
|
||||
| `dataloader_num_workers` | `int` | No | 4 | |
|
||||
| `dataloader_pin_memory` | `bool` | No | True | |
|
||||
|
||||
### Sample Configuration
|
||||
|
||||
```yaml
|
||||
checkpoint_format: huggingface
|
||||
distributed_backend: null
|
||||
device: cpu
|
||||
```
|
||||
|
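The optional fields from the configuration table above can be set in the same block. A sketch with illustrative values (not tuned recommendations):

```yaml
checkpoint_format: huggingface
device: cuda
max_seq_length: 2048
gradient_checkpointing: true
warmup_ratio: 0.1
weight_decay: 0.01
dataloader_num_workers: 4
dataloader_pin_memory: true
```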
||||
### Setup
|
||||
|
||||
You can access the HuggingFace trainer via the `starter` distribution:
|
||||
|
||||
```bash
|
||||
llama stack build --distro starter --image-type venv
|
||||
llama stack run --image-type venv ~/.llama/distributions/starter/starter-run.yaml
|
||||
```
|
||||
|
||||
### Usage Example
|
||||
|
||||
```python
|
||||
import time
|
||||
import uuid
|
||||
|
||||
from llama_stack_client.types import (
|
||||
post_training_supervised_fine_tune_params,
|
||||
algorithm_config_param,
|
||||
)
|
||||
|
||||
def create_http_client():
|
||||
from llama_stack_client import LlamaStackClient
|
||||
return LlamaStackClient(base_url="http://localhost:8321")
|
||||
|
||||
client = create_http_client()
|
||||
|
||||
# Example Dataset
|
||||
client.datasets.register(
|
||||
purpose="post-training/messages",
|
||||
source={
|
||||
"type": "uri",
|
||||
"uri": "huggingface://datasets/llamastack/simpleqa?split=train",
|
||||
},
|
||||
dataset_id="simpleqa",
|
||||
)
|
||||
|
||||
training_config = post_training_supervised_fine_tune_params.TrainingConfig(
|
||||
data_config=post_training_supervised_fine_tune_params.TrainingConfigDataConfig(
|
||||
batch_size=32,
|
||||
data_format="instruct",
|
||||
dataset_id="simpleqa",
|
||||
shuffle=True,
|
||||
),
|
||||
gradient_accumulation_steps=1,
|
||||
max_steps_per_epoch=0,
|
||||
max_validation_steps=1,
|
||||
n_epochs=4,
|
||||
)
|
||||
|
||||
algorithm_config = algorithm_config_param.LoraFinetuningConfig(
|
||||
alpha=1,
|
||||
apply_lora_to_mlp=True,
|
||||
apply_lora_to_output=False,
|
||||
lora_attn_modules=["q_proj"],
|
||||
rank=1,
|
||||
type="LoRA",
|
||||
)
|
||||
|
||||
job_uuid = f"test-job{uuid.uuid4()}"
|
||||
|
||||
# Example Model
|
||||
training_model = "ibm-granite/granite-3.3-8b-instruct"
|
||||
|
||||
start_time = time.time()
|
||||
response = client.post_training.supervised_fine_tune(
|
||||
job_uuid=job_uuid,
|
||||
logger_config={},
|
||||
model=training_model,
|
||||
hyperparam_search_config={},
|
||||
training_config=training_config,
|
||||
algorithm_config=algorithm_config,
|
||||
checkpoint_dir="output",
|
||||
)
|
||||
print("Job: ", job_uuid)
|
||||
|
||||
# Wait for the job to complete!
|
||||
while True:
|
||||
status = client.post_training.job.status(job_uuid=job_uuid)
|
||||
if not status:
|
||||
print("Job not found")
|
||||
break
|
||||
|
||||
print(status)
|
||||
if status.status == "completed":
|
||||
break
|
||||
|
||||
print("Waiting for job to complete...")
|
||||
time.sleep(5)
|
||||
|
||||
end_time = time.time()
|
||||
print("Job completed in", end_time - start_time, "seconds!")
|
||||
|
||||
print("Artifacts:")
|
||||
print(client.post_training.job.artifacts(job_uuid=job_uuid))
|
||||
```
|
||||
|
||||
## TorchTune
|
||||
|
||||
[TorchTune](https://github.com/pytorch/torchtune) is an inline post-training provider for Llama Stack. It provides a simple and efficient way to fine-tune language models using PyTorch.
|
||||
|
||||
### Features
|
||||
|
||||
- Simple access through the post_training API
|
||||
- Fully integrated with Llama Stack
|
||||
- GPU support and single device capabilities
|
||||
- Support for LoRA
|
||||
|
||||
### Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `torch_seed` | `int \| None` | No | | |
|
||||
| `checkpoint_format` | `Literal['meta', 'huggingface']` | No | meta | |
|
||||
|
||||
### Sample Configuration
|
||||
|
||||
```yaml
|
||||
checkpoint_format: meta
|
||||
```
|
||||
|
||||
### Setup
|
||||
|
||||
You can access the TorchTune trainer by writing your own YAML config pointing to the provider:
|
||||
|
||||
```yaml
|
||||
post_training:
|
||||
- provider_id: torchtune
|
||||
provider_type: inline::torchtune
|
||||
config: {}
|
||||
```
|
||||
|
||||
You can then build and run your own stack with this provider.
|
||||
|
||||
### Usage Example
|
||||
|
||||
```python
|
||||
import time
|
||||
import uuid
|
||||
|
||||
from llama_stack_client.types import (
|
||||
post_training_supervised_fine_tune_params,
|
||||
algorithm_config_param,
|
||||
)
|
||||
|
||||
def create_http_client():
|
||||
from llama_stack_client import LlamaStackClient
|
||||
return LlamaStackClient(base_url="http://localhost:8321")
|
||||
|
||||
client = create_http_client()
|
||||
|
||||
# Example Dataset
|
||||
client.datasets.register(
|
||||
purpose="post-training/messages",
|
||||
source={
|
||||
"type": "uri",
|
||||
"uri": "huggingface://datasets/llamastack/simpleqa?split=train",
|
||||
},
|
||||
dataset_id="simpleqa",
|
||||
)
|
||||
|
||||
training_config = post_training_supervised_fine_tune_params.TrainingConfig(
|
||||
data_config=post_training_supervised_fine_tune_params.TrainingConfigDataConfig(
|
||||
batch_size=32,
|
||||
data_format="instruct",
|
||||
dataset_id="simpleqa",
|
||||
shuffle=True,
|
||||
),
|
||||
gradient_accumulation_steps=1,
|
||||
max_steps_per_epoch=0,
|
||||
max_validation_steps=1,
|
||||
n_epochs=4,
|
||||
)
|
||||
|
||||
algorithm_config = algorithm_config_param.LoraFinetuningConfig(
|
||||
alpha=1,
|
||||
apply_lora_to_mlp=True,
|
||||
apply_lora_to_output=False,
|
||||
lora_attn_modules=["q_proj"],
|
||||
rank=1,
|
||||
type="LoRA",
|
||||
)
|
||||
|
||||
job_uuid = f"test-job{uuid.uuid4()}"
|
||||
|
||||
# Example Model
|
||||
training_model = "meta-llama/Llama-2-7b-hf"
|
||||
|
||||
start_time = time.time()
|
||||
response = client.post_training.supervised_fine_tune(
|
||||
job_uuid=job_uuid,
|
||||
logger_config={},
|
||||
model=training_model,
|
||||
hyperparam_search_config={},
|
||||
training_config=training_config,
|
||||
algorithm_config=algorithm_config,
|
||||
checkpoint_dir="output",
|
||||
)
|
||||
print("Job: ", job_uuid)
|
||||
|
||||
# Wait for the job to complete!
|
||||
while True:
|
||||
status = client.post_training.job.status(job_uuid=job_uuid)
|
||||
if not status:
|
||||
print("Job not found")
|
||||
break
|
||||
|
||||
print(status)
|
||||
if status.status == "completed":
|
||||
break
|
||||
|
||||
print("Waiting for job to complete...")
|
||||
time.sleep(5)
|
||||
|
||||
end_time = time.time()
|
||||
print("Job completed in", end_time - start_time, "seconds!")
|
||||
|
||||
print("Artifacts:")
|
||||
print(client.post_training.job.artifacts(job_uuid=job_uuid))
|
||||
```
|
||||
|
||||
## NVIDIA
|
||||
|
||||
NVIDIA's post-training provider for fine-tuning models on NVIDIA's platform.
|
||||
|
||||
### Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `api_key` | `str \| None` | No | | The NVIDIA API key. |
|
||||
| `dataset_namespace` | `str \| None` | No | default | The NVIDIA dataset namespace. |
|
||||
| `project_id` | `str \| None` | No | test-example-model@v1 | The NVIDIA project ID. |
|
||||
| `customizer_url` | `str \| None` | No | | Base URL for the NeMo Customizer API |
|
||||
| `timeout` | `int` | No | 300 | Timeout for the NVIDIA Post Training API |
|
||||
| `max_retries` | `int` | No | 3 | Maximum number of retries for the NVIDIA Post Training API |
|
||||
| `output_model_dir` | `str` | No | test-example-model@v1 | Directory to save the output model |
|
||||
|
||||
### Sample Configuration
|
||||
|
||||
```yaml
|
||||
api_key: ${env.NVIDIA_API_KEY:=}
|
||||
dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:=default}
|
||||
project_id: ${env.NVIDIA_PROJECT_ID:=test-project}
|
||||
customizer_url: ${env.NVIDIA_CUSTOMIZER_URL:=http://nemo.test}
|
||||
```
|
||||
|
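As with the TorchTune setup above, you can wire this provider into your own stack config; a sketch:

```yaml
post_training:
  - provider_id: nvidia
    provider_type: remote::nvidia
    config:
      api_key: ${env.NVIDIA_API_KEY:=}
      dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:=default}
      project_id: ${env.NVIDIA_PROJECT_ID:=test-project}
      customizer_url: ${env.NVIDIA_CUSTOMIZER_URL:=http://nemo.test}
```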
||||
## Best Practices
|
||||
|
||||
- **Choose the right provider**: Use HuggingFace for broader compatibility, TorchTune for Meta models, or NVIDIA for their ecosystem
|
||||
- **Configure hardware appropriately**: Ensure your configuration matches your available hardware (CPU, GPU, MPS)
|
||||
- **Monitor jobs**: Always monitor job status and handle completion appropriately
|
||||
- **Use appropriate datasets**: Ensure your dataset format matches the expected input format for your chosen provider
|
||||
|
||||
## Next Steps
|
||||
|
||||
- Check out the [Building Applications - Fine-tuning](../building_applications/index.mdx) guide for application-level examples
|
||||
- See the [Providers](../providers/post_training/index.mdx) section for detailed provider documentation
|
||||
- Review the [API Reference](../advanced_apis/post_training.mdx) for complete API documentation
|
193 docs/docs/advanced_apis/scoring.mdx Normal file
|
@ -0,0 +1,193 @@
|
|||
# Scoring
|
||||
|
||||
The Scoring API in Llama Stack allows you to evaluate outputs of your GenAI system using various scoring functions and metrics. This section covers all available scoring providers and their configuration.
|
||||
|
||||
## Overview
|
||||
|
||||
Llama Stack provides multiple scoring providers:
|
||||
|
||||
- **Basic** (`inline::basic`) - Simple evaluation metrics and scoring functions
|
||||
- **Braintrust** (`inline::braintrust`) - Advanced evaluation using the Braintrust platform
|
||||
- **LLM-as-Judge** (`inline::llm-as-judge`) - Uses language models to evaluate responses
|
||||
|
||||
The Scoring API is associated with `ScoringFunction` resources and provides a suite of out-of-the-box scoring functions. You can also add custom evaluators to meet specific evaluation needs.
|
||||
|
||||
## Basic Scoring
|
||||
|
||||
Basic scoring provider for simple evaluation metrics and scoring functions. This provider offers fundamental scoring capabilities without external dependencies.
|
||||
|
||||
### Configuration
|
||||
|
||||
No configuration required - this provider works out of the box.
|
||||
|
||||
```yaml
|
||||
{}
|
||||
```
|
||||
|
||||
### Features
|
||||
|
||||
- Simple evaluation metrics (accuracy, precision, recall, F1-score)
|
||||
- String matching and similarity metrics
|
||||
- Basic statistical scoring functions
|
||||
- No external dependencies required
|
||||
- Fast execution for standard metrics
|
||||
|
||||
### Use Cases
|
||||
|
||||
- Quick evaluation of basic accuracy metrics
|
||||
- String similarity comparisons
|
||||
- Statistical analysis of model outputs
|
||||
- Development and testing scenarios
|
||||
|
||||
## Braintrust
|
||||
|
||||
Braintrust scoring provider for evaluation and scoring using the [Braintrust platform](https://braintrustdata.com/). Braintrust provides advanced evaluation capabilities and experiment tracking.
|
||||
|
||||
### Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `openai_api_key` | `str \| None` | No | | The OpenAI API Key for LLM-powered evaluations |
|
||||
|
||||
### Sample Configuration
|
||||
|
||||
```yaml
|
||||
openai_api_key: ${env.OPENAI_API_KEY:=}
|
||||
```
|
||||
|
||||
### Features
|
||||
|
||||
- Advanced evaluation metrics
|
||||
- Experiment tracking and comparison
|
||||
- LLM-powered evaluation functions
|
||||
- Integration with Braintrust's evaluation suite
|
||||
- Detailed scoring analytics and insights
|
||||
|
||||
### Use Cases
|
||||
|
||||
- Production evaluation pipelines
|
||||
- A/B testing of model versions
|
||||
- Advanced scoring with custom metrics
|
||||
- Detailed evaluation reporting and analysis
|
||||
|
||||
## LLM-as-Judge
|
||||
|
||||
LLM-as-judge scoring provider that uses language models to evaluate and score responses. This approach leverages the reasoning capabilities of large language models to assess quality, relevance, and other subjective metrics.
|
||||
|
||||
### Configuration
|
||||
|
||||
No configuration required - this provider works out of the box.
|
||||
|
||||
```yaml
|
||||
{}
|
||||
```
|
||||
|
||||
### Features
|
||||
|
||||
- Subjective quality evaluation using LLMs
|
||||
- Flexible evaluation criteria definition
|
||||
- Natural language evaluation explanations
|
||||
- Support for complex evaluation scenarios
|
||||
- Contextual understanding of responses
|
||||
|
||||
### Use Cases
|
||||
|
||||
- Evaluating response quality and relevance
|
||||
- Assessing creativity and coherence
|
||||
- Subjective metric evaluation
|
||||
- Human-like judgment for complex tasks
|
||||
|
||||
## Usage Examples
|
||||
|
||||
### Basic Scoring Example
|
||||
|
||||
```python
|
||||
from llama_stack_client import LlamaStackClient
|
||||
|
||||
client = LlamaStackClient(base_url="http://localhost:8321")
|
||||
|
||||
# Register a basic accuracy scoring function
|
||||
client.scoring_functions.register(
|
||||
scoring_function_id="basic_accuracy",
|
||||
provider_id="basic",
|
||||
provider_scoring_function_id="accuracy"
|
||||
)
|
||||
|
||||
# Use the scoring function
|
||||
result = client.scoring.score(
|
||||
input_rows=[
|
||||
{"expected": "Paris", "actual": "Paris"},
|
||||
{"expected": "London", "actual": "Paris"}
|
||||
],
|
||||
scoring_function_id="basic_accuracy"
|
||||
)
|
||||
print(f"Accuracy: {result.results[0].score}")
|
||||
```
|
||||
|
||||
### LLM-as-Judge Example
|
||||
|
||||
```python
|
||||
# Register an LLM-as-judge scoring function
|
||||
client.scoring_functions.register(
|
||||
scoring_function_id="quality_judge",
|
||||
provider_id="llm_judge",
|
||||
provider_scoring_function_id="response_quality",
|
||||
params={
|
||||
"criteria": "Evaluate response quality, relevance, and helpfulness",
|
||||
"scale": "1-10"
|
||||
}
|
||||
)
|
||||
|
||||
# Score responses using LLM judgment
|
||||
result = client.scoring.score(
|
||||
input_rows=[{
|
||||
"query": "What is machine learning?",
|
||||
"response": "Machine learning is a subset of AI that enables computers to learn patterns from data..."
|
||||
}],
|
||||
scoring_function_id="quality_judge"
|
||||
)
|
||||
```
|
||||
|
||||
### Braintrust Integration Example
|
||||
|
||||
```python
|
||||
# Register a Braintrust scoring function
|
||||
client.scoring_functions.register(
|
||||
scoring_function_id="braintrust_eval",
|
||||
provider_id="braintrust",
|
||||
provider_scoring_function_id="semantic_similarity"
|
||||
)
|
||||
|
||||
# Run evaluation with Braintrust
|
||||
result = client.scoring.score(
|
||||
input_rows=[{
|
||||
"reference": "The capital of France is Paris",
|
||||
"candidate": "Paris is the capital city of France"
|
||||
}],
|
||||
scoring_function_id="braintrust_eval"
|
||||
)
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
- **Choose appropriate providers**: Use Basic for simple metrics, Braintrust for advanced analytics, LLM-as-Judge for subjective evaluation
|
||||
- **Define clear criteria**: When using LLM-as-Judge, provide specific evaluation criteria and scales
|
||||
- **Validate scoring functions**: Test your scoring functions with known examples before production use
|
||||
- **Monitor performance**: Track scoring performance and adjust thresholds based on results
|
||||
- **Combine multiple metrics**: Use different scoring providers together for comprehensive evaluation
|
||||
|
||||
## Integration with Evaluation
|
||||
|
||||
The Scoring API works closely with the [Evaluation](./evaluation.mdx) API to provide comprehensive evaluation workflows:
|
||||
|
||||
1. **Datasets** are loaded via the DatasetIO API
|
||||
2. **Evaluation** generates model outputs using the Eval API
|
||||
3. **Scoring** evaluates the quality of outputs using various scoring functions
|
||||
4. **Results** are aggregated and reported for analysis
|
||||
|
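Putting these steps together, a minimal sketch of the workflow using the client calls shown earlier in this guide (the dataset URI and input rows are illustrative placeholders, and step 2 is assumed to happen in your application or via the Eval API):

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# 1. Load a dataset via the DatasetIO API
client.datasets.register(
    purpose="evaluation",
    source={"type": "uri", "uri": "huggingface://datasets/llamastack/evaluation_dataset"},
    dataset_id="my_eval_dataset",
)

# 2. Generate outputs with your application (or the Eval API), then
# 3. score them with a registered scoring function (see the Basic Scoring Example above)
result = client.scoring.score(
    input_rows=[
        {"expected": "Paris", "actual": "Paris"},
        {"expected": "London", "actual": "Paris"},
    ],
    scoring_function_id="basic_accuracy",
)

# 4. Aggregate and report the results
print(result.results)
```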
||||
## Next Steps
|
||||
|
||||
- Check out the [Evaluation](./evaluation.mdx) guide for running complete evaluations
|
||||
- See the [Building Applications - Evaluation](../building_applications/evals.mdx) guide for application examples
|
||||
- Review the [Evaluation Reference](../references/evals_reference/) for comprehensive scoring function usage
|
||||
- Explore the [Evaluation Concepts](../concepts/evaluation_concepts) for detailed conceptual information
|
|
@ -1,9 +1,18 @@
|
|||
---
|
||||
title: Agents
|
||||
description: Build powerful AI applications with the Llama Stack agent framework
|
||||
sidebar_label: Agents
|
||||
sidebar_position: 3
|
||||
---
|
||||
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Agents
|
||||
|
||||
An Agent in Llama Stack is a powerful abstraction that allows you to build complex AI applications.
|
||||
|
||||
The Llama Stack agent framework is built on a modular architecture that allows for flexible and powerful AI
|
||||
applications. This document explains the key components and how they work together.
|
||||
The Llama Stack agent framework is built on a modular architecture that allows for flexible and powerful AI applications. This document explains the key components and how they work together.
|
||||
|
||||
## Core Concepts
|
||||
|
||||
|
@ -19,7 +28,6 @@ Agents are configured using the `AgentConfig` class, which includes:
|
|||
```python
|
||||
from llama_stack_client import Agent
|
||||
|
||||
|
||||
# Create the agent
|
||||
agent = Agent(
|
||||
llama_stack_client,
|
||||
|
@ -46,6 +54,9 @@ Each interaction with an agent is called a "turn" and consists of:
|
|||
- **Steps**: The agent's internal processing (inference, tool execution, etc.)
|
||||
- **Output Message**: The agent's response
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="streaming" label="Streaming Response">
|
||||
|
||||
```python
|
||||
from llama_stack_client import AgentEventLogger
|
||||
|
||||
|
@ -57,9 +68,9 @@ turn_response = agent.create_turn(
|
|||
for log in AgentEventLogger().log(turn_response):
|
||||
log.print()
|
||||
```
|
||||
### Non-Streaming
|
||||
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="non-streaming" label="Non-Streaming Response">
|
||||
|
||||
```python
|
||||
from rich.pretty import pprint
|
||||
|
@ -78,6 +89,9 @@ print("Steps:")
|
|||
pprint(response.steps)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
### 4. Steps
|
||||
|
||||
Each turn consists of multiple steps that represent the agent's thought process:
|
||||
|
@ -88,5 +102,11 @@ Each turn consists of multiple steps that represent the agent's thought process:
|
|||
|
||||
## Agent Execution Loop
|
||||
|
||||
Refer to the [Agent Execution Loop](./agent_execution_loop) for more details on what happens within an agent turn.
|
||||
|
||||
Refer to the [Agent Execution Loop](agent_execution_loop) for more details on what happens within an agent turn.
|
||||
## Related Resources
|
||||
|
||||
- **[Agent Execution Loop](./agent_execution_loop)** - Understanding the internal processing flow
|
||||
- **[RAG (Retrieval Augmented Generation)](./rag)** - Building knowledge-enhanced agents
|
||||
- **[Tools Integration](./tools)** - Extending agent capabilities with external tools
|
||||
- **[Safety Guardrails](./safety)** - Implementing responsible AI practices
|
|
@ -1,10 +1,18 @@
|
|||
## Agent Execution Loop
|
||||
---
|
||||
title: Agent Execution Loop
|
||||
description: Understanding the internal processing flow of Llama Stack agents
|
||||
sidebar_label: Agent Execution Loop
|
||||
sidebar_position: 4
|
||||
---
|
||||
|
||||
Agents are the heart of Llama Stack applications. They combine inference, memory, safety, and tool usage into coherent
|
||||
workflows. At its core, an agent follows a sophisticated execution loop that enables multi-step reasoning, tool usage,
|
||||
and safety checks.
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
### Steps in the Agent Workflow
|
||||
# Agent Execution Loop
|
||||
|
||||
Agents are the heart of Llama Stack applications. They combine inference, memory, safety, and tool usage into coherent workflows. At its core, an agent follows a sophisticated execution loop that enables multi-step reasoning, tool usage, and safety checks.
|
||||
|
||||
## Steps in the Agent Workflow
|
||||
|
||||
Each agent turn follows these key steps:
|
||||
|
||||
|
@ -17,7 +25,7 @@ Each agent turn follows these key steps:
|
|||
|
||||
3. **Inference Loop**: The agent enters its main execution loop:
|
||||
- The LLM receives a user prompt (with previous tool outputs)
|
||||
- The LLM generates a response, potentially with [tool calls](tools)
|
||||
- The LLM generates a response, potentially with [tool calls](./tools)
|
||||
- If tool calls are present:
|
||||
- Tool inputs are safety-checked
|
||||
- Tools are executed (e.g., web search, code execution)
|
||||
|
@ -29,7 +37,9 @@ Each agent turn follows these key steps:
|
|||
|
||||
4. **Final Safety Check**: The agent's final response is screened through safety shields
|
||||
|
||||
```{mermaid}
|
||||
## Execution Flow Diagram
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
participant U as User
|
||||
participant E as Executor
|
||||
|
@ -70,12 +80,15 @@ sequenceDiagram
|
|||
|
||||
Each step in this process can be monitored and controlled through configurations.
|
||||
|
||||
### Agent Execution Loop Example
|
||||
## Agent Execution Example
|
||||
|
||||
Here's an example that demonstrates monitoring the agent's execution:
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="streaming" label="Streaming Execution">
|
||||
|
||||
```python
|
||||
from llama_stack_client import LlamaStackClient, Agent, AgentEventLogger
|
||||
from rich.pretty import pprint
|
||||
|
||||
# Replace host and port
|
||||
client = LlamaStackClient(base_url=f"http://{HOST}:{PORT}")
|
||||
|
@ -120,6 +133,13 @@ response = agent.create_turn(
|
|||
# Monitor each step of execution
|
||||
for log in AgentEventLogger().log(response):
|
||||
log.print()
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="non-streaming" label="Non-Streaming Execution">
|
||||
|
||||
```python
|
||||
from rich.pretty import pprint
|
||||
|
||||
# Using non-streaming API, the response contains input, steps, and output.
|
||||
response = agent.create_turn(
|
||||
|
@ -131,9 +151,35 @@ response = agent.create_turn(
|
|||
}
|
||||
],
|
||||
session_id=session_id,
|
||||
stream=False,
|
||||
)
|
||||
|
||||
pprint(f"Input: {response.input_messages}")
|
||||
pprint(f"Output: {response.output_message.content}")
|
||||
pprint(f"Steps: {response.steps}")
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Key Configuration Options
|
||||
|
||||
### Loop Control
|
||||
- **max_infer_iters**: Maximum number of inference iterations (default: 5)
|
||||
- **max_tokens**: Token limit for responses
|
||||
- **temperature**: Controls response randomness
|
||||
|
||||
### Safety Configuration
|
||||
- **input_shields**: Safety checks for user input
|
||||
- **output_shields**: Safety checks for agent responses
|
||||
|
||||
### Tool Integration
|
||||
- **tools**: List of available tools for the agent
|
||||
- **tool_choice**: Control over when tools are used
|
||||
|
||||
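Putting the options above together, here is a minimal sketch of an agent configured with loop control, safety shields, and tools. The shield ID is illustrative, and the keyword arguments and `sampling_params` shape assume the `Agent` constructor used in the examples above; adjust for your client version.

```python
from llama_stack_client import LlamaStackClient, Agent

client = LlamaStackClient(base_url="http://localhost:8321")

agent = Agent(
    client,
    model="meta-llama/Llama-3.3-70B-Instruct",
    instructions="You are a helpful assistant. Use the web search tool when needed.",
    # Loop control
    max_infer_iters=5,  # maximum inference iterations per turn
    sampling_params={
        "max_tokens": 512,  # token limit for responses
        "strategy": {"type": "top_p", "temperature": 0.7, "top_p": 0.9},
    },
    # Safety configuration (shield ID is illustrative)
    input_shields=["llama_guard"],
    output_shields=["llama_guard"],
    # Tool integration
    tools=["builtin::websearch"],
)
```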
## Related Resources
|
||||
|
||||
- **[Agents](./agent)** - Understanding agent fundamentals
|
||||
- **[Tools Integration](./tools)** - Adding capabilities to agents
|
||||
- **[Safety Guardrails](./safety)** - Implementing safety measures
|
||||
- **[RAG (Retrieval Augmented Generation)](./rag)** - Building knowledge-enhanced workflows
|
256 docs/docs/building_applications/evals.mdx Normal file
|
@ -0,0 +1,256 @@
|
|||
---
|
||||
title: Evaluations
|
||||
description: Evaluate LLM applications with Llama Stack's comprehensive evaluation framework
|
||||
sidebar_label: Evaluations
|
||||
sidebar_position: 7
|
||||
---
|
||||
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
This guide walks you through the process of evaluating an LLM application built using Llama Stack. For detailed API reference, check out the [Evaluation Reference](../references/evals_reference/) guide that covers the complete set of APIs and developer experience flow.
|
||||
|
||||
:::tip[Interactive Examples]
|
||||
Check out our [Colab notebook](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing) for working examples with evaluations, or try the [Getting Started notebook](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb).
|
||||
:::
|
||||
|
||||
## Application Evaluation Example
|
||||
|
||||
[](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)
|
||||
|
||||
Llama Stack offers a library of scoring functions and the `/scoring` API, allowing you to run evaluations on your pre-annotated AI application datasets.
|
||||
|
||||
In this example, we will show you how to:
|
||||
1. **Build an Agent** with Llama Stack
|
||||
2. **Query the agent's sessions, turns, and steps** to analyze execution
|
||||
3. **Evaluate the results** using scoring functions
|
||||
|
||||
## Step-by-Step Evaluation Process
|
||||
|
||||
### 1. Building a Search Agent
|
||||
|
||||
First, let's create an agent that can search the web to answer questions:
|
||||
|
||||
```python
|
||||
from llama_stack_client import LlamaStackClient, Agent, AgentEventLogger
|
||||
|
||||
client = LlamaStackClient(base_url=f"http://{HOST}:{PORT}")
|
||||
|
||||
agent = Agent(
|
||||
client,
|
||||
model="meta-llama/Llama-3.3-70B-Instruct",
|
||||
instructions="You are a helpful assistant. Use search tool to answer the questions.",
|
||||
tools=["builtin::websearch"],
|
||||
)
|
||||
|
||||
# Test prompts for evaluation
|
||||
user_prompts = [
|
||||
"Which teams played in the NBA Western Conference Finals of 2024. Search the web for the answer.",
|
||||
"In which episode and season of South Park does Bill Cosby (BSM-471) first appear? Give me the number and title. Search the web for the answer.",
|
||||
"What is the British-American kickboxer Andrew Tate's kickboxing name? Search the web for the answer.",
|
||||
]
|
||||
|
||||
session_id = agent.create_session("test-session")
|
||||
|
||||
# Execute all prompts in the session
|
||||
for prompt in user_prompts:
|
||||
response = agent.create_turn(
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}
|
||||
],
|
||||
session_id=session_id,
|
||||
)
|
||||
|
||||
for log in AgentEventLogger().log(response):
|
||||
log.print()
|
||||
```
|
||||
|
||||
### 2. Query Agent Execution Steps
|
||||
|
||||
Now, let's analyze the agent's execution steps to understand its performance:
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="session-analysis" label="Session Analysis">
|
||||
|
||||
```python
|
||||
from rich.pretty import pprint
|
||||
|
||||
# Query the agent's session to get detailed execution data
|
||||
session_response = client.agents.session.retrieve(
|
||||
session_id=session_id,
|
||||
agent_id=agent.agent_id,
|
||||
)
|
||||
|
||||
pprint(session_response)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="tool-validation" label="Tool Usage Validation">
|
||||
|
||||
```python
|
||||
# Sanity check: Verify that all user prompts are followed by tool calls
|
||||
num_tool_call = 0
|
||||
for turn in session_response.turns:
|
||||
for step in turn.steps:
|
||||
if (
|
||||
step.step_type == "tool_execution"
|
||||
and step.tool_calls[0].tool_name == "brave_search"
|
||||
):
|
||||
num_tool_call += 1
|
||||
|
||||
print(
|
||||
f"{num_tool_call}/{len(session_response.turns)} user prompts are followed by a tool call to `brave_search`"
|
||||
)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
### 3. Evaluate Agent Responses
|
||||
|
||||
Now we'll evaluate the agent's responses using Llama Stack's scoring API:
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="data-preparation" label="Data Preparation">
|
||||
|
||||
```python
|
||||
# Process agent execution history into evaluation rows
|
||||
eval_rows = []
|
||||
|
||||
# Define expected answers for our test prompts
|
||||
expected_answers = [
|
||||
"Dallas Mavericks and the Minnesota Timberwolves",
|
||||
"Season 4, Episode 12",
|
||||
"King Cobra",
|
||||
]
|
||||
|
||||
# Create evaluation dataset from agent responses
|
||||
for i, turn in enumerate(session_response.turns):
|
||||
eval_rows.append(
|
||||
{
|
||||
"input_query": turn.input_messages[0].content,
|
||||
"generated_answer": turn.output_message.content,
|
||||
"expected_answer": expected_answers[i],
|
||||
}
|
||||
)
|
||||
|
||||
pprint(eval_rows)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="scoring" label="Scoring & Evaluation">
|
||||
|
||||
```python
|
||||
# Configure scoring parameters
|
||||
scoring_params = {
|
||||
"basic::subset_of": None, # Check if generated answer contains expected answer
|
||||
}
|
||||
|
||||
# Run evaluation using Llama Stack's scoring API
|
||||
scoring_response = client.scoring.score(
|
||||
input_rows=eval_rows,
|
||||
scoring_functions=scoring_params
|
||||
)
|
||||
|
||||
pprint(scoring_response)
|
||||
|
||||
# Analyze results
|
||||
for i, result in enumerate(scoring_response.results):
|
||||
print(f"Query {i+1}: {result.score}")
|
||||
print(f" Generated: {eval_rows[i]['generated_answer'][:100]}...")
|
||||
print(f" Expected: {expected_answers[i]}")
|
||||
print(f" Score: {result.score}")
|
||||
print()
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Available Scoring Functions
|
||||
|
||||
Llama Stack provides several built-in scoring functions:
|
||||
|
||||
### Basic Scoring Functions
|
||||
- **`basic::subset_of`**: Checks if the expected answer is contained in the generated response
|
||||
- **`basic::exact_match`**: Performs exact string matching between expected and generated answers
|
||||
- **`basic::regex_match`**: Uses regular expressions to match patterns in responses
|
||||
|
||||
### Advanced Scoring Functions
|
||||
- **`llm_as_judge::accuracy`**: Uses an LLM to judge response accuracy
|
||||
- **`llm_as_judge::helpfulness`**: Evaluates how helpful the response is
|
||||
- **`llm_as_judge::safety`**: Assesses response safety and appropriateness
|
||||
|
||||
### Custom Scoring Functions
|
||||
You can also create custom scoring functions for domain-specific evaluation needs.
|
||||
|
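As a sketch of what a custom registration can look like, following the `scoring_functions.register` pattern from the Scoring providers documentation (the function ID, provider ID, and `params` contents are illustrative):

```python
# Register an LLM-as-judge scoring function with domain-specific criteria
client.scoring_functions.register(
    scoring_function_id="domain_quality_judge",       # illustrative ID
    provider_id="llm_judge",
    provider_scoring_function_id="response_quality",
    params={
        "criteria": "Evaluate domain-specific accuracy, relevance, and helpfulness",
        "scale": "1-10",
    },
)
```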
||||
## Evaluation Workflow Best Practices
|
||||
|
||||
### 🎯 **Dataset Preparation**
|
||||
- Use diverse test cases that cover edge cases and common scenarios
|
||||
- Include clear expected answers or success criteria
|
||||
- Balance your dataset across different difficulty levels
|
||||
|
||||
### 📊 **Metrics Selection**
|
||||
- Choose appropriate scoring functions for your use case
|
||||
- Combine multiple metrics for comprehensive evaluation
|
||||
- Consider both automated and human evaluation metrics
|
||||
|
||||
### 🔄 **Iterative Improvement**
|
||||
- Run evaluations regularly during development
|
||||
- Use evaluation results to identify areas for improvement
|
||||
- Track performance changes over time
|
||||
|
||||
### 📈 **Analysis & Reporting**
|
||||
- Analyze failures to understand model limitations
|
||||
- Generate comprehensive evaluation reports
|
||||
- Share results with stakeholders for informed decision-making
|
||||
|
||||
## Advanced Evaluation Scenarios
|
||||
|
||||
### Batch Evaluation
|
||||
For evaluating large datasets efficiently:
|
||||
|
||||
```python
|
||||
# Prepare large evaluation dataset
|
||||
large_eval_dataset = [
|
||||
{"input_query": query, "expected_answer": answer}
|
||||
for query, answer in zip(queries, expected_answers)
|
||||
]
|
||||
|
||||
# Run batch evaluation
|
||||
batch_results = client.scoring.score(
|
||||
input_rows=large_eval_dataset,
|
||||
scoring_functions={
|
||||
"basic::subset_of": None,
|
||||
"llm_as_judge::accuracy": {"judge_model": "meta-llama/Llama-3.3-70B-Instruct"},
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
### Multi-Metric Evaluation
|
||||
Combining different scoring approaches:
|
||||
|
||||
```python
|
||||
comprehensive_scoring = {
|
||||
"exact_match": "basic::exact_match",
|
||||
"subset_match": "basic::subset_of",
|
||||
"llm_judge": "llm_as_judge::accuracy",
|
||||
"safety_check": "llm_as_judge::safety",
|
||||
}
|
||||
|
||||
results = client.scoring.score(
|
||||
input_rows=eval_rows,
|
||||
scoring_functions=comprehensive_scoring
|
||||
)
|
||||
```
|
||||
|
||||
## Related Resources
|
||||
|
||||
- **[Agents](./agent)** - Building agents for evaluation
|
||||
- **[Tools Integration](./tools)** - Using tools in evaluated agents
|
||||
- **[Evaluation Reference](../references/evals_reference/)** - Complete API reference for evaluations
|
||||
- **[Getting Started Notebook](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)** - Interactive examples
|
||||
- **[Evaluation Examples](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing)** - Additional evaluation scenarios
|
83 docs/docs/building_applications/index.mdx Normal file
|
@ -0,0 +1,83 @@
|
|||
---
|
||||
title: Building Applications
|
||||
description: Comprehensive guides for building AI applications with Llama Stack
|
||||
sidebar_label: Overview
|
||||
sidebar_position: 5
|
||||
---
|
||||
|
||||
# AI Application Examples
|
||||
|
||||
Llama Stack provides all the building blocks needed to create sophisticated AI applications.
|
||||
|
||||
## Getting Started
|
||||
|
||||
The best way to get started is to look at this comprehensive notebook which walks through the various APIs (from basic inference, to RAG agents) and how to use them.
|
||||
|
||||
**📓 [Building AI Applications Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)**
|
||||
|
||||
## Core Topics
|
||||
|
||||
Here are the key topics that will help you build effective AI applications:
|
||||
|
||||
### 🤖 **Agent Development**
|
||||
- **[Agent Framework](./agent.mdx)** - Understand the components and design patterns of the Llama Stack agent framework
|
||||
- **[Agent Execution Loop](./agent_execution_loop.mdx)** - How agents process information, make decisions, and execute actions
|
||||
- **[Agents vs Responses API](./responses_vs_agents.mdx)** - Learn when to use each API for different use cases
|
||||
|
||||
### 📚 **Knowledge Integration**
|
||||
- **[RAG (Retrieval-Augmented Generation)](./rag.mdx)** - Enhance your agents with external knowledge through retrieval mechanisms
|
||||
|
||||
### 🛠️ **Capabilities & Extensions**
|
||||
- **[Tools](./tools.mdx)** - Extend your agents' capabilities by integrating with external tools and APIs
|
||||
|
||||
### 📊 **Quality & Monitoring**
|
||||
- **[Evaluations](./evals.mdx)** - Evaluate your agents' effectiveness and identify areas for improvement
|
||||
- **[Telemetry](./telemetry.mdx)** - Monitor and analyze your agents' performance and behavior
|
||||
- **[Safety](./safety.mdx)** - Implement guardrails and safety measures to ensure responsible AI behavior
|
||||
|
||||
### 🎮 **Interactive Development**
|
||||
- **[Playground](./playground.mdx)** - Interactive environment for testing and developing applications
|
||||
|
||||
## Application Patterns
|
||||
|
||||
### 🤖 **Conversational Agents**
|
||||
Build intelligent chatbots and assistants that can:
|
||||
- Maintain context across conversations
|
||||
- Access external knowledge bases
|
||||
- Execute actions through tool integrations
|
||||
- Apply safety filters and guardrails
|
||||
|
||||
### 📖 **RAG Applications**
|
||||
Create knowledge-augmented applications that:
|
||||
- Retrieve relevant information from documents
|
||||
- Generate contextually accurate responses
|
||||
- Handle large knowledge bases efficiently
|
||||
- Provide source attribution
|
||||
|
||||
### 🔧 **Tool-Enhanced Systems**
|
||||
Develop applications that can:
|
||||
- Search the web for real-time information
|
||||
- Interact with databases and APIs
|
||||
- Perform calculations and analysis
|
||||
- Execute complex multi-step workflows
|
||||
|
||||
### 🛡️ **Enterprise Applications**
|
||||
Build production-ready systems with:
|
||||
- Comprehensive safety measures
|
||||
- Performance monitoring and analytics
|
||||
- Scalable deployment configurations
|
||||
- Evaluation and quality assurance
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. **📖 Start with the Notebook** - Work through the complete tutorial
|
||||
2. **🎯 Choose Your Pattern** - Pick the application type that matches your needs
|
||||
3. **🏗️ Build Your Foundation** - Set up your [providers](/docs/providers/) and [distributions](/docs/distributions/)
|
||||
4. **🚀 Deploy & Monitor** - Use our [deployment guides](/docs/deploying/) for production
|
||||
|
||||
## Related Resources
|
||||
|
||||
- **[Getting Started](/docs/getting_started/quickstart)** - Basic setup and concepts
|
||||
- **[Providers](/docs/providers/)** - Available AI service providers
|
||||
- **[Distributions](/docs/distributions/)** - Pre-configured deployment packages
|
||||
- **[API Reference](/docs/api/llama-stack-specification)** - Complete API documentation
|
299
docs/docs/building_applications/playground.mdx
Normal file
299
docs/docs/building_applications/playground.mdx
Normal file
|
@ -0,0 +1,299 @@
|
|||
---
|
||||
title: Llama Stack Playground
|
||||
description: Interactive interface to explore and experiment with Llama Stack capabilities
|
||||
sidebar_label: Playground
|
||||
sidebar_position: 10
|
||||
---
|
||||
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Llama Stack Playground
|
||||
|
||||
:::note[Experimental Feature]
|
||||
The Llama Stack Playground is currently experimental and subject to change. We welcome feedback and contributions to help improve it.
|
||||
:::
|
||||
|
||||
The Llama Stack Playground is a simple interface that aims to:
|
||||
- **Showcase capabilities and concepts** of Llama Stack in an interactive environment
|
||||
- **Demo end-to-end application code** to help users get started building their own applications
|
||||
- **Provide a UI** to help users inspect and understand Llama Stack API providers and resources
|
||||
|
||||
## Key Features
|
||||
|
||||
### Interactive Playground Pages
|
||||
|
||||
The playground provides interactive pages for users to explore Llama Stack API capabilities:
|
||||
|
||||
#### Chatbot Interface
|
||||
|
||||
<video
|
||||
controls
|
||||
autoPlay
|
||||
playsInline
|
||||
muted
|
||||
loop
|
||||
style={{width: '100%'}}
|
||||
>
|
||||
<source src="https://github.com/user-attachments/assets/8d2ef802-5812-4a28-96e1-316038c84cbf" type="video/mp4" />
|
||||
Your browser does not support the video tag.
|
||||
</video>
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="chat" label="Chat">
|
||||
|
||||
**Simple Chat Interface**
|
||||
- Chat directly with Llama models through an intuitive interface
|
||||
- Uses the `/inference/chat-completion` streaming API under the hood (see the sketch below)
|
||||
- Real-time message streaming for responsive interactions
|
||||
- Perfect for testing model capabilities and prompt engineering
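
Outside the UI, you can reproduce what this page does with a few lines of the Python client. A minimal sketch, assuming a locally running stack and that `meta-llama/Llama-3.2-3B-Instruct` is registered as a model (adjust the port and model ID for your setup):

```python
import os

from llama_stack_client import LlamaStackClient
from llama_stack_client.lib.inference.event_logger import EventLogger

# Assumes a locally running Llama Stack server; adjust URL and model ID for your deployment
client = LlamaStackClient(base_url=f"http://localhost:{os.environ.get('LLAMA_STACK_PORT', '8321')}")

response = client.inference.chat_completion(
    model_id="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "Write a haiku about coding"}],
    stream=True,
)

# Print the streamed tokens as they arrive
for log in EventLogger().log(response):
    log.print()
```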
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="rag" label="RAG Chat">
|
||||
|
||||
**Document-Aware Conversations**
|
||||
- Upload documents to create memory banks
|
||||
- Chat with a RAG-enabled agent that can query your documents
|
||||
- Uses Llama Stack's `/agents` API to create and manage RAG sessions
|
||||
- Ideal for exploring knowledge-enhanced AI applications
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
#### Evaluation Interface
|
||||
|
||||
<video
|
||||
controls
|
||||
autoPlay
|
||||
playsInline
|
||||
muted
|
||||
loop
|
||||
style={{width: '100%'}}
|
||||
>
|
||||
<source src="https://github.com/user-attachments/assets/6cc1659f-eba4-49ca-a0a5-7c243557b4f5" type="video/mp4" />
|
||||
Your browser does not support the video tag.
|
||||
</video>
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="scoring" label="Scoring Evaluations">
|
||||
|
||||
**Custom Dataset Evaluation**
|
||||
- Upload your own evaluation datasets
|
||||
- Run evaluations using available scoring functions
|
||||
- Uses Llama Stack's `/scoring` API for flexible evaluation workflows (see the sketch below)
|
||||
- Great for testing application performance on custom metrics
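
The same scoring flow can be driven programmatically. A minimal sketch, assuming the client from the setup tab; the scoring function ID and row fields are illustrative, so use ones registered in your deployment:

```python
# Score a handful of rows with a registered scoring function (illustrative values)
rows = [
    {
        "input_query": "What is the capital of France?",
        "generated_answer": "Paris",
        "expected_answer": "Paris",
    },
]

response = client.scoring.score(
    input_rows=rows,
    scoring_functions={"basic::subset_of": None},
)
print(response.results)
```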
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="benchmarks" label="Benchmark Evaluations">
|
||||
|
||||
<video
|
||||
controls
|
||||
autoPlay
|
||||
playsInline
|
||||
muted
|
||||
loop
|
||||
style={{width: '100%', marginBottom: '1rem'}}
|
||||
>
|
||||
<source src="https://github.com/user-attachments/assets/345845c7-2a2b-4095-960a-9ae40f6a93cf" type="video/mp4" />
|
||||
Your browser does not support the video tag.
|
||||
</video>
|
||||
|
||||
**Pre-registered Evaluation Tasks**
|
||||
- Evaluate models or agents on pre-defined tasks
|
||||
- Uses Llama Stack's `/eval` API for comprehensive evaluation
|
||||
- Combines datasets and scoring functions for standardized testing
|
||||
|
||||
**Setup Requirements:**
|
||||
Register evaluation datasets and benchmarks first:
|
||||
|
||||
```bash
|
||||
# Register evaluation dataset
|
||||
llama-stack-client datasets register \
|
||||
--dataset-id "mmlu" \
|
||||
--provider-id "huggingface" \
|
||||
--url "https://huggingface.co/datasets/llamastack/evals" \
|
||||
--metadata '{"path": "llamastack/evals", "name": "evals__mmlu__details", "split": "train"}' \
|
||||
--schema '{"input_query": {"type": "string"}, "expected_answer": {"type": "string"}, "chat_completion_input": {"type": "string"}}'
|
||||
|
||||
# Register benchmark task
|
||||
llama-stack-client benchmarks register \
|
||||
--eval-task-id meta-reference-mmlu \
|
||||
--provider-id meta-reference \
|
||||
--dataset-id mmlu \
|
||||
--scoring-functions basic::regex_parser_multiple_choice_answer
|
||||
```
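
Once registered, an evaluation run can also be started programmatically. A rough sketch with the Python client; the `benchmark_config` fields and model ID are placeholders and may differ in your stack version:

```python
# Kick off an evaluation job for the registered benchmark (illustrative values)
job = client.eval.run_eval(
    benchmark_id="meta-reference-mmlu",
    benchmark_config={
        "eval_candidate": {
            "type": "model",
            "model": "meta-llama/Llama-3.2-3B-Instruct",
            "sampling_params": {"max_tokens": 100},
        },
    },
)

# Check on the job and print its status
status = client.eval.jobs.status(job_id=job.job_id, benchmark_id="meta-reference-mmlu")
print(status)
```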
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
#### Inspection Interface
|
||||
|
||||
<video
|
||||
controls
|
||||
autoPlay
|
||||
playsInline
|
||||
muted
|
||||
loop
|
||||
style={{width: '100%'}}
|
||||
>
|
||||
<source src="https://github.com/user-attachments/assets/01d52b2d-92af-4e3a-b623-a9b8ba22ba99" type="video/mp4" />
|
||||
Your browser does not support the video tag.
|
||||
</video>
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="providers" label="API Providers">
|
||||
|
||||
**Provider Management**
|
||||
- Inspect available Llama Stack API providers
|
||||
- View provider configurations and capabilities
|
||||
- Uses the `/providers` API for real-time provider information
|
||||
- Essential for understanding your deployment's capabilities
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="resources" label="API Resources">
|
||||
|
||||
**Resource Exploration**
|
||||
- Inspect Llama Stack API resources including:
|
||||
- **Models**: Available language models
|
||||
- **Datasets**: Registered evaluation datasets
|
||||
- **Memory Banks**: Vector databases and knowledge stores
|
||||
- **Benchmarks**: Evaluation tasks and scoring functions
|
||||
- **Shields**: Safety and content moderation tools
|
||||
- Uses `/<resources>/list` APIs for comprehensive resource visibility (see the sketch below)
|
||||
- For detailed information about resources, see [Core Concepts](/docs/concepts)
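
The same information is available programmatically through the list endpoints. A small sketch, assuming the client from the setup tab:

```python
# Inspect providers and registered resources on the running stack
print("Providers:", client.providers.list())
print("Models:", client.models.list())
print("Shields:", client.shields.list())
print("Vector DBs:", client.vector_dbs.list())
```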
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Getting Started
|
||||
|
||||
### Quick Start Guide
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="setup" label="Setup">
|
||||
|
||||
**1. Start the Llama Stack API Server**
|
||||
|
||||
```bash
|
||||
# Build and run a distribution (example: together)
|
||||
llama stack build --distro together --image-type venv
|
||||
llama stack run together
|
||||
```
|
||||
|
||||
**2. Start the Streamlit UI**
|
||||
|
||||
```bash
|
||||
# Launch the playground interface
|
||||
uv run --with ".[ui]" streamlit run llama_stack/core/ui/app.py
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="usage" label="Usage Tips">
|
||||
|
||||
**Making the Most of the Playground:**
|
||||
|
||||
- **Start with Chat**: Test basic model interactions and prompt engineering
|
||||
- **Explore RAG**: Upload sample documents to see knowledge-enhanced responses
|
||||
- **Try Evaluations**: Use the scoring interface to understand evaluation metrics
|
||||
- **Inspect Resources**: Check what providers and resources are available
|
||||
- **Experiment with Settings**: Adjust parameters to see how they affect results
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
### Available Distributions
|
||||
|
||||
The playground works with any Llama Stack distribution. Popular options include:
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="together" label="Together AI">
|
||||
|
||||
```bash
|
||||
llama stack build --distro together --image-type venv
|
||||
llama stack run together
|
||||
```
|
||||
|
||||
**Features:**
|
||||
- Cloud-hosted models
|
||||
- Fast inference
|
||||
- Multiple model options
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="ollama" label="Ollama (Local)">
|
||||
|
||||
```bash
|
||||
llama stack build --distro ollama --image-type venv
|
||||
llama stack run ollama
|
||||
```
|
||||
|
||||
**Features:**
|
||||
- Local model execution
|
||||
- Privacy-focused
|
||||
- No internet required
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="meta-reference" label="Meta Reference">
|
||||
|
||||
```bash
|
||||
llama stack build --distro meta-reference --image-type venv
|
||||
llama stack run meta-reference
|
||||
```
|
||||
|
||||
**Features:**
|
||||
- Reference implementation
|
||||
- All API features available
|
||||
- Best for development
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Use Cases & Examples
|
||||
|
||||
### Educational Use Cases
|
||||
- **Learning Llama Stack**: Hands-on exploration of API capabilities
|
||||
- **Prompt Engineering**: Interactive testing of different prompting strategies
|
||||
- **RAG Experimentation**: Understanding how document retrieval affects responses
|
||||
- **Evaluation Understanding**: See how different metrics evaluate model performance
|
||||
|
||||
### Development Use Cases
|
||||
- **Prototype Testing**: Quick validation of application concepts
|
||||
- **API Exploration**: Understanding available endpoints and parameters
|
||||
- **Integration Planning**: Seeing how different components work together
|
||||
- **Demo Creation**: Showcasing Llama Stack capabilities to stakeholders
|
||||
|
||||
### Research Use Cases
|
||||
- **Model Comparison**: Side-by-side testing of different models
|
||||
- **Evaluation Design**: Understanding how scoring functions work
|
||||
- **Safety Testing**: Exploring shield effectiveness with different inputs
|
||||
- **Performance Analysis**: Measuring model behavior across different scenarios
|
||||
|
||||
## Best Practices
|
||||
|
||||
### 🚀 **Getting Started**
|
||||
- Begin with simple chat interactions to understand basic functionality
|
||||
- Gradually explore more advanced features like RAG and evaluations
|
||||
- Use the inspection tools to understand your deployment's capabilities
|
||||
|
||||
### 🔧 **Development Workflow**
|
||||
- Use the playground to prototype before writing application code
|
||||
- Test different parameter settings interactively
|
||||
- Validate evaluation approaches before implementing them programmatically
|
||||
|
||||
### 📊 **Evaluation & Testing**
|
||||
- Start with simple scoring functions before trying complex evaluations
|
||||
- Use the playground to understand evaluation results before automation
|
||||
- Test safety features with various input types
|
||||
|
||||
### 🎯 **Production Preparation**
|
||||
- Use playground insights to inform your production API usage
|
||||
- Test edge cases and error conditions interactively
|
||||
- Validate resource configurations before deployment
|
||||
|
||||
## Related Resources
|
||||
|
||||
- **[Getting Started Guide](../getting_started/quickstart)** - Complete setup and introduction
|
||||
- **[Core Concepts](/docs/concepts)** - Understanding Llama Stack fundamentals
|
||||
- **[Agents](./agent)** - Building intelligent agents
|
||||
- **[RAG (Retrieval Augmented Generation)](./rag)** - Knowledge-enhanced applications
|
||||
- **[Evaluations](./evals)** - Comprehensive evaluation framework
|
||||
- **[API Reference](/docs/api/llama-stack-specification)** - Complete API documentation
|
|
@ -1,36 +1,49 @@
|
|||
## Retrieval Augmented Generation (RAG)
|
||||
---
|
||||
title: Retrieval Augmented Generation (RAG)
|
||||
description: Build knowledge-enhanced AI applications with external document retrieval
|
||||
sidebar_label: RAG (Retrieval Augmented Generation)
|
||||
sidebar_position: 2
|
||||
---
|
||||
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Retrieval Augmented Generation (RAG)
|
||||
|
||||
RAG enables your applications to reference and recall information from previous interactions or external documents.
|
||||
|
||||
Llama Stack organizes the APIs that enable RAG into three layers:
|
||||
1. The lowermost APIs deal with raw storage and retrieval. These include Vector IO, KeyValue IO (coming soon) and Relational IO (also coming soon.).
|
||||
2. The next is the "Rag Tool", a first-class tool as part of the [Tools API](tools.md) that allows you to ingest documents (from URLs, files, etc) with various chunking strategies and query them smartly.
|
||||
3. Finally, it all comes together with the top-level ["Agents" API](agent.md) that allows you to create agents that can use the tools to answer questions, perform tasks, and more.
|
||||
## Architecture Overview
|
||||
|
||||
<img src="rag.png" alt="RAG System" width="50%">
|
||||
Llama Stack organizes the APIs that enable RAG into three layers:
|
||||
|
||||
1. **Lower-Level APIs**: Deal with raw storage and retrieval. These include Vector IO, KeyValue IO (coming soon) and Relational IO (also coming soon)
|
||||
2. **RAG Tool**: A first-class tool as part of the [Tools API](./tools) that allows you to ingest documents (from URLs, files, etc) with various chunking strategies and query them smartly
|
||||
3. **Agents API**: The top-level [Agents API](./agent) that allows you to create agents that can use the tools to answer questions, perform tasks, and more
|
||||
|
||||

|
||||
|
||||
The RAG system uses lower-level storage for different types of data:
|
||||
* **Vector IO**: For semantic search and retrieval
|
||||
* **Key-Value and Relational IO**: For structured data storage
|
||||
- **Vector IO**: For semantic search and retrieval
|
||||
- **Key-Value and Relational IO**: For structured data storage
|
||||
|
||||
:::info[Future Storage Types]
|
||||
We may add more storage types like Graph IO in the future.
|
||||
:::
|
||||
|
||||
### Setting up Vector DBs
|
||||
## Setting up Vector Databases
|
||||
|
||||
For this guide, we will use [Ollama](https://ollama.com/) as the inference provider.
|
||||
Ollama is an LLM runtime that allows you to run Llama models locally.
|
||||
For this guide, we will use [Ollama](https://ollama.com/) as the inference provider. Ollama is an LLM runtime that allows you to run Llama models locally.
|
||||
|
||||
Here's how to set up a vector database for RAG:
|
||||
|
||||
```python
|
||||
# Create http client
|
||||
# Create HTTP client
|
||||
import os
|
||||
from llama_stack_client import LlamaStackClient
|
||||
|
||||
client = LlamaStackClient(base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}")
|
||||
|
||||
|
||||
# Register a vector db
|
||||
# Register a vector database
|
||||
vector_db_id = "my_documents"
|
||||
response = client.vector_dbs.register(
|
||||
vector_db_id=vector_db_id,
|
||||
|
@ -40,9 +53,15 @@ response = client.vector_dbs.register(
|
|||
)
|
||||
```
|
||||
|
||||
### Ingesting Documents
|
||||
You can ingest documents into the vector database using two methods: directly inserting pre-chunked
|
||||
documents or using the RAG Tool.
|
||||
## Document Ingestion
|
||||
|
||||
You can ingest documents into the vector database using two methods: directly inserting pre-chunked documents or using the RAG Tool.
|
||||
|
||||
### Direct Document Insertion
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="basic" label="Basic Insertion">
|
||||
|
||||
```python
|
||||
# You can insert a pre-chunked document directly into the vector db
|
||||
chunks = [
|
||||
|
@ -58,10 +77,11 @@ chunks = [
|
|||
client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks)
|
||||
```
|
||||
|
||||
#### Using Precomputed Embeddings
|
||||
If you decide to precompute embeddings for your documents, you can insert them directly into the vector database by
|
||||
including the embedding vectors in the chunk data. This is useful if you have a separate embedding service or if you
|
||||
want to customize the ingestion process.
|
||||
</TabItem>
|
||||
<TabItem value="embeddings" label="With Precomputed Embeddings">
|
||||
|
||||
If you decide to precompute embeddings for your documents, you can insert them directly into the vector database by including the embedding vectors in the chunk data. This is useful if you have a separate embedding service or if you want to customize the ingestion process.
|
||||
|
||||
```python
|
||||
chunks_with_embeddings = [
|
||||
{
|
||||
|
@ -79,44 +99,53 @@ chunks_with_embeddings = [
|
|||
]
|
||||
client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks_with_embeddings)
|
||||
```
|
||||
When providing precomputed embeddings, ensure the embedding dimension matches the embedding_dimension specified when
|
||||
registering the vector database.
|
||||
|
||||
### Retrieval
|
||||
:::warning[Embedding Dimensions]
|
||||
When providing precomputed embeddings, ensure the embedding dimension matches the `embedding_dimension` specified when registering the vector database.
|
||||
:::
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
### Document Retrieval
|
||||
|
||||
You can query the vector database to retrieve documents based on their embeddings.
|
||||
|
||||
```python
|
||||
# You can then query for these chunks
|
||||
chunks_response = client.vector_io.query(
|
||||
vector_db_id=vector_db_id, query="What do you know about..."
|
||||
vector_db_id=vector_db_id,
|
||||
query="What do you know about..."
|
||||
)
|
||||
```
|
||||
|
||||
### Using the RAG Tool
|
||||
## Using the RAG Tool
|
||||
|
||||
> **⚠️ DEPRECATION NOTICE**: The RAG Tool is being deprecated in favor of directly using the OpenAI-compatible Search
|
||||
> API. We recommend migrating to the OpenAI APIs for better compatibility and future support.
|
||||
:::danger[Deprecation Notice]
|
||||
The RAG Tool is being deprecated in favor of directly using the OpenAI-compatible Search API. We recommend migrating to the OpenAI APIs for better compatibility and future support.
|
||||
:::
|
||||
|
||||
A better way to ingest documents is to use the RAG Tool. This tool allows you to ingest documents from URLs, files, etc.
|
||||
and automatically chunks them into smaller pieces. More examples for how to format a RAGDocument can be found in the
|
||||
[appendix](#more-ragdocument-examples).
|
||||
A better way to ingest documents is to use the RAG Tool. This tool allows you to ingest documents from URLs, files, etc. and automatically chunks them into smaller pieces. More examples for how to format a RAGDocument can be found in the [appendix](#more-ragdocument-examples).
|
||||
|
||||
#### OpenAI API Integration & Migration
|
||||
### OpenAI API Integration & Migration
|
||||
|
||||
The RAG tool has been updated to use OpenAI-compatible APIs. This provides several benefits:
|
||||
|
||||
- **Files API Integration**: Documents are now uploaded using OpenAI's file upload endpoints
|
||||
- **Vector Stores API**: Vector storage operations use OpenAI's vector store format with configurable chunking strategies
|
||||
- **Error Resilience:** When processing multiple documents, individual failures are logged but don't crash the operation. Failed documents are skipped while successful ones continue processing.
|
||||
- **Error Resilience**: When processing multiple documents, individual failures are logged but don't crash the operation. Failed documents are skipped while successful ones continue processing.
|
||||
|
||||
### Migration Path
|
||||
|
||||
**Migration Path:**
|
||||
We recommend migrating to the OpenAI-compatible Search API for:
|
||||
1. **Better OpenAI Ecosystem Integration**: Direct compatibility with OpenAI tools and workflows including the Responses API
|
||||
2**Future-Proof**: Continued support and feature development
|
||||
3**Full OpenAI Compatibility**: Vector Stores, Files, and Search APIs are fully compatible with OpenAI's Responses API
|
||||
|
||||
The OpenAI APIs are used under the hood, so you can continue to use your existing RAG Tool code with minimal changes.
|
||||
However, we recommend updating your code to use the new OpenAI-compatible APIs for better long-term support. If any
|
||||
documents fail to process, they will be logged in the response but will not cause the entire operation to fail.
|
||||
1. **Better OpenAI Ecosystem Integration**: Direct compatibility with OpenAI tools and workflows including the Responses API
|
||||
2. **Future-Proof**: Continued support and feature development
|
||||
3. **Full OpenAI Compatibility**: Vector Stores, Files, and Search APIs are fully compatible with OpenAI's Responses API
|
||||
|
||||
The OpenAI APIs are used under the hood, so you can continue to use your existing RAG Tool code with minimal changes. However, we recommend updating your code to use the new OpenAI-compatible APIs for better long-term support. If any documents fail to process, they will be logged in the response but will not cause the entire operation to fail.
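
As a rough sketch of the recommended direction, documents can be uploaded through the OpenAI-compatible Files API and searched through a Vector Store, without the RAG Tool. The method names follow the OpenAI-style surface exposed by the client; treat the exact signatures and the file name as illustrative and check your client version:

```python
# Upload a document and attach it to an OpenAI-compatible vector store (illustrative)
uploaded_file = client.files.create(
    file=open("pytorch_memory_optimizations.md", "rb"),
    purpose="assistants",
)

vector_store = client.vector_stores.create(name="docs_store")
client.vector_stores.files.create(
    vector_store_id=vector_store.id,
    file_id=uploaded_file.id,
)

# Query the store via the OpenAI-compatible search endpoint
results = client.vector_stores.search(
    vector_store_id=vector_store.id,
    query="How do I optimize memory in PyTorch?",
)
```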
|
||||
|
||||
### RAG Tool Example
|
||||
|
||||
```python
|
||||
from llama_stack_client import RAGDocument
|
||||
|
@ -145,9 +174,12 @@ results = client.tool_runtime.rag_tool.query(
|
|||
)
|
||||
```
|
||||
|
||||
You can configure how the RAG tool adds metadata to the context if you find it useful for your application. Simply add:
|
||||
### Custom Context Configuration
|
||||
|
||||
You can configure how the RAG tool adds metadata to the context if you find it useful for your application:
|
||||
|
||||
```python
|
||||
# Query documents
|
||||
# Query documents with custom template
|
||||
results = client.tool_runtime.rag_tool.query(
|
||||
vector_db_ids=[vector_db_id],
|
||||
content="What do you know about...",
|
||||
|
@ -156,10 +188,13 @@ results = client.tool_runtime.rag_tool.query(
|
|||
},
|
||||
)
|
||||
```
|
||||
### Building RAG-Enhanced Agents
|
||||
|
||||
## Building RAG-Enhanced Agents
|
||||
|
||||
One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example:
|
||||
|
||||
### Agent with Knowledge Search
|
||||
|
||||
```python
|
||||
from llama_stack_client import Agent
|
||||
|
||||
|
@ -185,7 +220,6 @@ agent = Agent(
|
|||
)
|
||||
session_id = agent.create_session("rag_session")
|
||||
|
||||
|
||||
# Ask questions about documents in the vector db, and the agent will query the db to answer the question.
|
||||
response = agent.create_turn(
|
||||
messages=[{"role": "user", "content": "How to optimize memory in PyTorch?"}],
|
||||
|
@ -193,10 +227,14 @@ response = agent.create_turn(
|
|||
)
|
||||
```
|
||||
|
||||
> **NOTE:** the `instructions` field in the `AgentConfig` can be used to guide the agent's behavior. It is important to experiment with different instructions to see what works best for your use case.
|
||||
:::tip[Agent Instructions]
|
||||
The `instructions` field in the `AgentConfig` can be used to guide the agent's behavior. It is important to experiment with different instructions to see what works best for your use case.
|
||||
:::
|
||||
|
||||
### Document-Aware Conversations
|
||||
|
||||
You can also pass documents along with the user's message and ask questions about them:
|
||||
|
||||
You can also pass documents along with the user's message and ask questions about them.
|
||||
```python
|
||||
# Initial document ingestion
|
||||
response = agent.create_turn(
|
||||
|
@ -219,7 +257,10 @@ response = agent.create_turn(
|
|||
)
|
||||
```
|
||||
|
||||
You can print the response with below.
|
||||
### Viewing Agent Responses
|
||||
|
||||
You can print the response with the following:
|
||||
|
||||
```python
|
||||
from llama_stack_client import AgentEventLogger
|
||||
|
||||
|
@ -227,32 +268,74 @@ for log in AgentEventLogger().log(response):
|
|||
log.print()
|
||||
```
|
||||
|
||||
## Vector Database Management
|
||||
|
||||
### Unregistering Vector DBs
|
||||
|
||||
If you need to clean up and unregister vector databases, you can do so as follows:
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="single" label="Single Database">
|
||||
|
||||
```python
|
||||
# Unregister a specified vector database
|
||||
vector_db_id = "my_vector_db_id"
|
||||
print(f"Unregistering vector database: {vector_db_id}")
|
||||
client.vector_dbs.unregister(vector_db_id=vector_db_id)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="all" label="All Databases">
|
||||
|
||||
```python
|
||||
# Unregister all vector databases
|
||||
for vector_db_id in client.vector_dbs.list():
|
||||
print(f"Unregistering vector database: {vector_db_id.identifier}")
|
||||
client.vector_dbs.unregister(vector_db_id=vector_db_id.identifier)
|
||||
```
|
||||
|
||||
### Appendix
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Best Practices
|
||||
|
||||
### 🎯 **Document Chunking**
|
||||
- Use appropriate chunk sizes (512 tokens is often a good starting point)
|
||||
- Consider overlap between chunks for better context preservation
|
||||
- Experiment with different chunking strategies for your content type
|
||||
|
||||
### 🔍 **Embedding Strategy**
|
||||
- Choose embedding models that match your domain
|
||||
- Consider the trade-off between embedding dimension and performance
|
||||
- Test different embedding models for your specific use case
|
||||
|
||||
### 📊 **Query Optimization**
|
||||
- Use specific, well-formed queries for better retrieval
|
||||
- Experiment with different search strategies
|
||||
- Consider hybrid approaches (keyword + semantic search)
|
||||
|
||||
### 🛡️ **Error Handling**
|
||||
- Implement proper error handling for failed document processing (see the sketch below)
|
||||
- Monitor ingestion success rates
|
||||
- Have fallback strategies for retrieval failures
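
For example, a defensive ingestion loop that follows these practices might look like this. A minimal sketch, assuming `documents` is a list of `RAGDocument` objects you have already built and that logging is configured:

```python
import logging

logger = logging.getLogger(__name__)

# Ingest documents one at a time so a single bad document doesn't abort the whole batch
for idx, document in enumerate(documents):
    try:
        client.tool_runtime.rag_tool.insert(
            documents=[document],
            vector_db_id=vector_db_id,
            chunk_size_in_tokens=512,
        )
    except Exception as exc:
        logger.warning("Failed to ingest document %d: %s", idx, exc)
```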
|
||||
|
||||
## Appendix
|
||||
|
||||
### More RAGDocument Examples
|
||||
|
||||
Here are various ways to create RAGDocument objects for different content types:
|
||||
|
||||
#### More RAGDocument Examples
|
||||
```python
|
||||
from llama_stack_client import RAGDocument
|
||||
import base64
import requests
|
||||
|
||||
# File URI
|
||||
RAGDocument(document_id="num-0", content={"uri": "file://path/to/file"})
|
||||
|
||||
# Plain text
|
||||
RAGDocument(document_id="num-1", content="plain text")
|
||||
|
||||
# Explicit text input
|
||||
RAGDocument(
|
||||
document_id="num-2",
|
||||
content={
|
||||
|
@ -260,6 +343,8 @@ RAGDocument(
|
|||
"text": "plain text input",
|
||||
}, # for inputs that should be treated as text explicitly
|
||||
)
|
||||
|
||||
# Image from URL
|
||||
RAGDocument(
|
||||
document_id="num-3",
|
||||
content={
|
||||
|
@ -267,14 +352,16 @@ RAGDocument(
|
|||
"image": {"url": {"uri": "https://mywebsite.com/image.jpg"}},
|
||||
},
|
||||
)
|
||||
|
||||
# Base64 encoded image
|
||||
B64_ENCODED_IMAGE = base64.b64encode(
|
||||
requests.get(
|
||||
"https://raw.githubusercontent.com/meta-llama/llama-stack/refs/heads/main/docs/_static/llama-stack.png"
|
||||
).content
|
||||
)
|
||||
RAGDocuemnt(
|
||||
RAGDocument(
|
||||
document_id="num-4",
|
||||
content={"type": "image", "image": {"data": B64_ENCODED_IMAGE}},
|
||||
)
|
||||
```
|
||||
for more strongly typed interaction use the typed dicts found [here](https://github.com/meta-llama/llama-stack-client-python/blob/38cd91c9e396f2be0bec1ee96a19771582ba6f17/src/llama_stack_client/types/shared_params/document.py).
|
||||
For more strongly typed interaction use the typed dicts found [here](https://github.com/meta-llama/llama-stack-client-python/blob/38cd91c9e396f2be0bec1ee96a19771582ba6f17/src/llama_stack_client/types/shared_params/document.py).
|
|
@ -1,10 +1,20 @@
|
|||
---
|
||||
title: Agents vs OpenAI Responses API
|
||||
description: Compare the Agents API and OpenAI Responses API for building AI applications with tool calling capabilities
|
||||
sidebar_label: Agents vs Responses API
|
||||
sidebar_position: 5
|
||||
---
|
||||
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Agents vs OpenAI Responses API
|
||||
|
||||
Llama Stack (LLS) provides two different APIs for building AI applications with tool calling capabilities: the **Agents API** and the **OpenAI Responses API**. While both enable AI systems to use tools and maintain full conversation history, they serve different use cases and have distinct characteristics.
|
||||
|
||||
```{note}
|
||||
**Note:** For simple and basic inferencing, you may want to use the [Chat Completions API](../providers/openai.md#chat-completions) directly, before progressing to Agents or Responses API.
|
||||
```
|
||||
:::note
|
||||
**Note:** For simple and basic inferencing, you may want to use the [Chat Completions API](../providers/openai#chat-completions) directly, before progressing to Agents or Responses API.
|
||||
:::
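
For reference, a basic chat completion is a single stateless request: no sessions, shields, branching, or tool calls are involved. A minimal sketch with the Python client and an illustrative model ID:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Plain single-turn inference, the simplest starting point before Agents or Responses
response = client.inference.chat_completion(
    model_id="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "Summarize what a vector database is."}],
)
print(response.completion_message.content)
```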
|
||||
|
||||
## Overview
|
||||
|
||||
|
@ -21,6 +31,8 @@ Additionally, Agents let you specify input/output shields whereas Responses do n
|
|||
|
||||
Today the Agents and Responses APIs can be used independently depending on the use case. But it is also productive to treat the APIs as complementary. It is not currently supported, but it is planned for the LLS Agents API to alternatively use the Responses API as its backend instead of the default Chat Completions API, i.e., enabling a combination of the safety features of Agents with the dynamic configuration and branching capabilities of Responses.
|
||||
|
||||
## Feature Comparison
|
||||
|
||||
| Feature | LLS Agents API | OpenAI Responses API |
|
||||
|---------|------------|---------------------|
|
||||
| **Conversation Management** | Linear persistent sessions | Can branch from any previous response ID |
|
||||
|
@ -34,7 +46,10 @@ Let's compare how both APIs handle a research task where we need to:
|
|||
2. Access different information sources dynamically
|
||||
3. Continue the conversation based on search results
|
||||
|
||||
### Agents API: Session-based configuration with safety shields
|
||||
<Tabs>
|
||||
<TabItem value="agents" label="Agents API">
|
||||
|
||||
### Session-based Configuration with Safety Shields
|
||||
|
||||
```python
|
||||
# Create agent with static session configuration
|
||||
|
@ -85,7 +100,10 @@ print(f"First result: {response1.output_message.content}")
|
|||
print(f"Optimization: {response2.output_message.content}")
|
||||
```
|
||||
|
||||
### Responses API: Dynamic per-call configuration with branching
|
||||
</TabItem>
|
||||
<TabItem value="responses" label="Responses API">
|
||||
|
||||
### Dynamic Per-call Configuration with Branching
|
||||
|
||||
```python
|
||||
# First response: Use web search for latest algorithms
|
||||
|
@ -130,50 +148,74 @@ print(f"File search results: {response2.output_message.content}")
|
|||
print(f"Alternative web search: {response3.output_message.content}")
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
Both APIs demonstrate distinct strengths that make them valuable on their own for different scenarios. The Agents API excels in providing structured, safety-conscious workflows with persistent session management, while the Responses API offers flexibility through dynamic configuration and OpenAI compatible tool patterns.
|
||||
|
||||
## Use Case Examples
|
||||
|
||||
### 1. **Research and Analysis with Safety Controls**
|
||||
### 1. Research and Analysis with Safety Controls
|
||||
**Best Choice: Agents API**
|
||||
|
||||
**Scenario:** You're building a research assistant for a financial institution that needs to analyze market data, execute code to process financial models, and search through internal compliance documents. The system must ensure all interactions are logged for regulatory compliance and protected by safety shields to prevent malicious code execution or data leaks.
|
||||
|
||||
**Why Agents API?** The Agents API provides persistent session management for iterative research workflows, built-in safety shields to protect against malicious code in financial models, and structured execution logs (session/turn/step) required for regulatory compliance. The static tool configuration ensures consistent access to your knowledge base and code interpreter throughout the entire research session.
|
||||
|
||||
### 2. **Dynamic Information Gathering with Branching Exploration**
|
||||
### 2. Dynamic Information Gathering with Branching Exploration
|
||||
**Best Choice: Responses API**
|
||||
|
||||
**Scenario:** You're building a competitive intelligence tool that helps businesses research market trends. Users need to dynamically switch between web search for current market data and file search through uploaded industry reports. They also want to branch conversations to explore different market segments simultaneously and experiment with different models for various analysis types.
|
||||
|
||||
**Why Responses API?** The Responses API's branching capability lets users explore multiple market segments from any research point. Dynamic per-call configuration allows switching between web search and file search as needed, while experimenting with different models (faster models for quick searches, more powerful models for deep analysis). The OpenAI-compatible tool patterns make integration straightforward.
|
||||
|
||||
### 3. **OpenAI Migration with Advanced Tool Capabilities**
|
||||
### 3. OpenAI Migration with Advanced Tool Capabilities
|
||||
**Best Choice: Responses API**
|
||||
|
||||
**Scenario:** You have an existing application built with OpenAI's Assistants API that uses file search and web search capabilities. You want to migrate to Llama Stack for better performance and cost control while maintaining the same tool calling patterns and adding new capabilities like dynamic vector store selection.
|
||||
|
||||
**Why Responses API?** The Responses API provides full OpenAI tool compatibility (`web_search`, `file_search`) with identical syntax, making migration seamless. The dynamic per-call configuration enables advanced features like switching vector stores per query or changing models based on query complexity - capabilities that extend beyond basic OpenAI functionality while maintaining compatibility.
|
||||
|
||||
### 4. **Educational Programming Tutor**
|
||||
### 4. Educational Programming Tutor
|
||||
**Best Choice: Agents API**
|
||||
|
||||
**Scenario:** You're building a programming tutor that maintains student context across multiple sessions, safely executes code exercises, and tracks learning progress with audit trails for educators.
|
||||
|
||||
**Why Agents API?** Persistent sessions remember student progress across multiple interactions, safety shields prevent malicious code execution while allowing legitimate programming exercises, and structured execution logs help educators track learning patterns.
|
||||
|
||||
### 5. **Advanced Software Debugging Assistant**
|
||||
### 5. Advanced Software Debugging Assistant
|
||||
**Best Choice: Agents API with Responses Backend**
|
||||
|
||||
**Scenario:** You're building a debugging assistant that helps developers troubleshoot complex issues. It needs to maintain context throughout a debugging session, safely execute diagnostic code, switch between different analysis tools dynamically, and branch conversations to explore multiple potential causes simultaneously.
|
||||
|
||||
**Why Agents + Responses?** The Agent provides safety shields for code execution and session management for the overall debugging workflow. The underlying Responses API enables dynamic model selection and flexible tool configuration per query, while branching lets you explore different theories (memory leak vs. concurrency issue) from the same debugging point and compare results.
|
||||
|
||||
> **Note:** The ability to use Responses API as the backend for Agents is not yet implemented but is planned for a future release. Currently, Agents use Chat Completions API as their backend by default.
|
||||
:::info[Future Enhancement]
|
||||
The ability to use Responses API as the backend for Agents is not yet implemented but is planned for a future release. Currently, Agents use Chat Completions API as their backend by default.
|
||||
:::
|
||||
|
||||
## For More Information
|
||||
## Decision Framework
|
||||
|
||||
- **LLS Agents API**: For detailed information on creating and managing agents, see the [Agents documentation](agent.md)
|
||||
- **OpenAI Responses API**: For information on using the OpenAI-compatible responses API, see the [OpenAI API documentation](https://platform.openai.com/docs/api-reference/responses)
|
||||
- **Chat Completions API**: For the default backend API used by Agents, see the [Chat Completions providers documentation](../providers/openai.md#chat-completions)
|
||||
- **Agent Execution Loop**: For understanding how agents process turns and steps in their execution, see the [Agent Execution Loop documentation](agent_execution_loop.md)
|
||||
Use this framework to choose the right API for your use case:
|
||||
|
||||
### Choose Agents API when:
|
||||
- ✅ You need **safety shields** for input/output validation
|
||||
- ✅ Your application requires **linear conversation flow** with persistent context
|
||||
- ✅ You need **audit trails** and structured execution logs
|
||||
- ✅ Your tool configuration is **static** throughout the session
|
||||
- ✅ You're building **educational, financial, or enterprise** applications with compliance requirements
|
||||
|
||||
### Choose Responses API when:
|
||||
- ✅ You need **conversation branching** to explore multiple paths
|
||||
- ✅ You want **dynamic per-call configuration** (models, tools, vector stores)
|
||||
- ✅ You're **migrating from OpenAI** and want familiar tool patterns
|
||||
- ✅ You need **OpenAI compatibility** for existing workflows
|
||||
- ✅ Your application benefits from **flexible, experimental** interactions
|
||||
|
||||
## Related Resources
|
||||
|
||||
- **[Agents](./agent)** - Understanding the Agents API fundamentals
|
||||
- **[Agent Execution Loop](./agent_execution_loop)** - How agents process turns and steps
|
||||
- **[Tools Integration](./tools)** - Adding capabilities to both APIs
|
||||
- **[OpenAI Compatibility](../providers/openai)** - Using OpenAI-compatible endpoints
|
||||
- **[Safety Guardrails](./safety)** - Implementing safety measures in agents
|
395
docs/docs/building_applications/safety.mdx
Normal file
395
docs/docs/building_applications/safety.mdx
Normal file
|
@ -0,0 +1,395 @@
|
|||
---
|
||||
title: Safety Guardrails
|
||||
description: Implement safety measures and content moderation in Llama Stack applications
|
||||
sidebar_label: Safety
|
||||
sidebar_position: 9
|
||||
---
|
||||
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Safety Guardrails
|
||||
|
||||
Safety is a critical component of any AI application. Llama Stack provides a comprehensive Shield system that can be applied at multiple touchpoints to ensure responsible AI behavior and content moderation.
|
||||
|
||||
## Shield System Overview
|
||||
|
||||
The Shield system in Llama Stack provides:
|
||||
- **Content filtering** for both input and output messages
|
||||
- **Multi-touchpoint protection** across your application flow
|
||||
- **Configurable safety policies** tailored to your use case
|
||||
- **Integration with agents** for automated safety enforcement
|
||||
|
||||
## Basic Shield Usage
|
||||
|
||||
### Registering a Safety Shield
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="registration" label="Shield Registration">
|
||||
|
||||
```python
|
||||
# Register a safety shield
|
||||
shield_id = "content_safety"
|
||||
client.shields.register(
|
||||
shield_id=shield_id,
|
||||
provider_shield_id="llama-guard-basic"
|
||||
)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="manual-check" label="Manual Safety Check">
|
||||
|
||||
```python
|
||||
# Run content through shield manually
|
||||
response = client.safety.run_shield(
|
||||
shield_id=shield_id,
|
||||
messages=[{"role": "user", "content": "User message here"}]
|
||||
)
|
||||
|
||||
if response.violation:
|
||||
print(f"Safety violation detected: {response.violation.user_message}")
|
||||
# Handle violation appropriately
|
||||
else:
|
||||
print("Content passed safety checks")
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Agent Integration
|
||||
|
||||
Shields can be automatically applied to agent interactions for seamless safety enforcement:
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="input-shields" label="Input Shields">
|
||||
|
||||
```python
|
||||
from llama_stack_client import Agent
|
||||
|
||||
# Create agent with input safety shields
|
||||
agent = Agent(
|
||||
client,
|
||||
model="meta-llama/Llama-3.2-3B-Instruct",
|
||||
instructions="You are a helpful assistant",
|
||||
input_shields=["content_safety"], # Shield user inputs
|
||||
tools=["builtin::websearch"],
|
||||
)
|
||||
|
||||
session_id = agent.create_session("safe_session")
|
||||
|
||||
# All user inputs will be automatically screened
|
||||
response = agent.create_turn(
|
||||
messages=[{"role": "user", "content": "Tell me about AI safety"}],
|
||||
session_id=session_id,
|
||||
)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="output-shields" label="Output Shields">
|
||||
|
||||
```python
|
||||
# Create agent with output safety shields
|
||||
agent = Agent(
|
||||
client,
|
||||
model="meta-llama/Llama-3.2-3B-Instruct",
|
||||
instructions="You are a helpful assistant",
|
||||
output_shields=["content_safety"], # Shield agent outputs
|
||||
tools=["builtin::websearch"],
|
||||
)
|
||||
|
||||
session_id = agent.create_session("safe_session")
|
||||
|
||||
# All agent responses will be automatically screened
|
||||
response = agent.create_turn(
|
||||
messages=[{"role": "user", "content": "Help me with my research"}],
|
||||
session_id=session_id,
|
||||
)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="both-shields" label="Input & Output Shields">
|
||||
|
||||
```python
|
||||
# Create agent with comprehensive safety coverage
|
||||
agent = Agent(
|
||||
client,
|
||||
model="meta-llama/Llama-3.2-3B-Instruct",
|
||||
instructions="You are a helpful assistant",
|
||||
input_shields=["content_safety"], # Screen user inputs
|
||||
output_shields=["content_safety"], # Screen agent outputs
|
||||
tools=["builtin::websearch"],
|
||||
)
|
||||
|
||||
session_id = agent.create_session("fully_protected_session")
|
||||
|
||||
# Both input and output are automatically protected
|
||||
response = agent.create_turn(
|
||||
messages=[{"role": "user", "content": "Research question here"}],
|
||||
session_id=session_id,
|
||||
)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Available Shield Types
|
||||
|
||||
### Llama Guard Shields
|
||||
|
||||
Llama Guard provides state-of-the-art content safety classification:
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="basic" label="Basic Llama Guard">
|
||||
|
||||
```python
|
||||
# Basic Llama Guard for general content safety
|
||||
client.shields.register(
|
||||
shield_id="llama_guard_basic",
|
||||
provider_shield_id="llama-guard-basic"
|
||||
)
|
||||
```
|
||||
|
||||
**Use Cases:**
|
||||
- General content moderation
|
||||
- Harmful content detection
|
||||
- Basic safety compliance
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="advanced" label="Advanced Llama Guard">
|
||||
|
||||
```python
|
||||
# Advanced Llama Guard with custom categories
|
||||
client.shields.register(
|
||||
shield_id="llama_guard_advanced",
|
||||
provider_shield_id="llama-guard-advanced",
|
||||
config={
|
||||
"categories": [
|
||||
"violence", "hate_speech", "sexual_content",
|
||||
"self_harm", "illegal_activity"
|
||||
],
|
||||
"threshold": 0.8
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
**Use Cases:**
|
||||
- Fine-tuned safety policies
|
||||
- Domain-specific content filtering
|
||||
- Enterprise compliance requirements
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
### Custom Safety Shields
|
||||
|
||||
Create domain-specific safety shields for specialized use cases:
|
||||
|
||||
```python
|
||||
# Register custom safety shield
|
||||
client.shields.register(
|
||||
shield_id="financial_compliance",
|
||||
provider_shield_id="custom-financial-shield",
|
||||
config={
|
||||
"detect_pii": True,
|
||||
"financial_advice_warning": True,
|
||||
"regulatory_compliance": "FINRA"
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
## Safety Response Handling
|
||||
|
||||
When safety violations are detected, handle them appropriately:
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="basic-handling" label="Basic Handling">
|
||||
|
||||
```python
|
||||
response = client.safety.run_shield(
|
||||
shield_id="content_safety",
|
||||
messages=[{"role": "user", "content": "Potentially harmful content"}]
|
||||
)
|
||||
|
||||
if response.violation:
|
||||
violation = response.violation
|
||||
print(f"Violation Type: {violation.violation_type}")
|
||||
print(f"User Message: {violation.user_message}")
|
||||
print(f"Metadata: {violation.metadata}")
|
||||
|
||||
# Log the violation for audit purposes
|
||||
logger.warning(f"Safety violation detected: {violation.violation_type}")
|
||||
|
||||
# Provide appropriate user feedback
|
||||
return "I can't help with that request. Please try asking something else."
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="advanced-handling" label="Advanced Handling">
|
||||
|
||||
```python
|
||||
import logging
from datetime import datetime

logger = logging.getLogger(__name__)


def handle_safety_response(safety_response, user_message):
|
||||
"""Advanced safety response handling with logging and user feedback"""
|
||||
|
||||
if not safety_response.violation:
|
||||
return {"safe": True, "message": "Content passed safety checks"}
|
||||
|
||||
violation = safety_response.violation
|
||||
|
||||
# Log violation details
|
||||
audit_log = {
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
"violation_type": violation.violation_type,
|
||||
"original_message": user_message,
|
||||
"shield_response": violation.user_message,
|
||||
"metadata": violation.metadata
|
||||
}
|
||||
logger.warning(f"Safety violation: {audit_log}")
|
||||
|
||||
# Determine appropriate response based on violation type
|
||||
if violation.violation_type == "hate_speech":
|
||||
user_feedback = "I can't engage with content that contains hate speech. Let's keep our conversation respectful."
|
||||
elif violation.violation_type == "violence":
|
||||
user_feedback = "I can't provide information that could promote violence. How else can I help you today?"
|
||||
else:
|
||||
user_feedback = "I can't help with that request. Please try asking something else."
|
||||
|
||||
return {
|
||||
"safe": False,
|
||||
"user_feedback": user_feedback,
|
||||
"violation_details": audit_log
|
||||
}
|
||||
|
||||
# Usage
|
||||
safety_result = handle_safety_response(response, user_input)
|
||||
if not safety_result["safe"]:
|
||||
return safety_result["user_feedback"]
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Safety Configuration Best Practices
|
||||
|
||||
### 🛡️ **Multi-Layer Protection**
|
||||
- Use both input and output shields for comprehensive coverage
|
||||
- Combine multiple shield types for different threat categories
|
||||
- Implement fallback mechanisms when shields fail
|
||||
|
||||
### 📊 **Monitoring & Auditing**
|
||||
- Log all safety violations for compliance and analysis
|
||||
- Monitor false positive rates to tune shield sensitivity
|
||||
- Track safety metrics across different use cases
|
||||
|
||||
### ⚙️ **Configuration Management**
|
||||
- Use environment-specific safety configurations
|
||||
- Implement A/B testing for shield effectiveness
|
||||
- Regularly update shield models and policies
|
||||
|
||||
### 🔧 **Integration Patterns**
|
||||
- Integrate shields early in the development process
|
||||
- Test safety measures with adversarial inputs
|
||||
- Provide clear user feedback for violations
|
||||
|
||||
## Advanced Safety Scenarios
|
||||
|
||||
### Context-Aware Safety
|
||||
|
||||
```python
|
||||
# Safety shields that consider conversation context
|
||||
agent = Agent(
|
||||
client,
|
||||
model="meta-llama/Llama-3.2-3B-Instruct",
|
||||
instructions="You are a healthcare assistant",
|
||||
input_shields=["medical_safety"],
|
||||
output_shields=["medical_safety"],
|
||||
# Context helps shields make better decisions
|
||||
safety_context={
|
||||
"domain": "healthcare",
|
||||
"user_type": "patient",
|
||||
"compliance_level": "HIPAA"
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
### Dynamic Shield Selection
|
||||
|
||||
```python
|
||||
def select_shield_for_user(user_profile):
|
||||
"""Select appropriate safety shield based on user context"""
|
||||
if user_profile.age < 18:
|
||||
return "child_safety_shield"
|
||||
elif user_profile.context == "enterprise":
|
||||
return "enterprise_compliance_shield"
|
||||
else:
|
||||
return "general_safety_shield"
|
||||
|
||||
# Use dynamic shield selection
|
||||
shield_id = select_shield_for_user(current_user)
|
||||
response = client.safety.run_shield(
|
||||
shield_id=shield_id,
|
||||
messages=messages
|
||||
)
|
||||
```
|
||||
|
||||
## Compliance and Regulations
|
||||
|
||||
### Industry-Specific Safety
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="healthcare" label="Healthcare (HIPAA)">
|
||||
|
||||
```python
|
||||
# Healthcare-specific safety configuration
|
||||
client.shields.register(
|
||||
shield_id="hipaa_compliance",
|
||||
provider_shield_id="healthcare-safety-shield",
|
||||
config={
|
||||
"detect_phi": True, # Protected Health Information
|
||||
"medical_advice_warning": True,
|
||||
"regulatory_framework": "HIPAA"
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="financial" label="Financial (FINRA)">
|
||||
|
||||
```python
|
||||
# Financial services safety configuration
|
||||
client.shields.register(
|
||||
shield_id="finra_compliance",
|
||||
provider_shield_id="financial-safety-shield",
|
||||
config={
|
||||
"detect_financial_advice": True,
|
||||
"investment_disclaimers": True,
|
||||
"regulatory_framework": "FINRA"
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="education" label="Education (COPPA)">
|
||||
|
||||
```python
|
||||
# Educational platform safety for minors
|
||||
client.shields.register(
|
||||
shield_id="coppa_compliance",
|
||||
provider_shield_id="educational-safety-shield",
|
||||
config={
|
||||
"child_protection": True,
|
||||
"educational_content_only": True,
|
||||
"regulatory_framework": "COPPA"
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Related Resources
|
||||
|
||||
- **[Agents](./agent)** - Integrating safety shields with intelligent agents
|
||||
- **[Agent Execution Loop](./agent_execution_loop)** - Understanding safety in the execution flow
|
||||
- **[Evaluations](./evals)** - Evaluating safety shield effectiveness
|
||||
- **[Telemetry](./telemetry)** - Monitoring safety violations and metrics
|
||||
- **[Llama Guard Documentation](https://github.com/meta-llama/PurpleLlama/tree/main/Llama-Guard3)** - Advanced safety model details
|
342
docs/docs/building_applications/telemetry.mdx
Normal file
342
docs/docs/building_applications/telemetry.mdx
Normal file
|
@ -0,0 +1,342 @@
|
|||
---
|
||||
title: Telemetry
|
||||
description: Monitor and observe Llama Stack applications with comprehensive telemetry capabilities
|
||||
sidebar_label: Telemetry
|
||||
sidebar_position: 8
|
||||
---
|
||||
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Telemetry
|
||||
|
||||
The Llama Stack telemetry system provides comprehensive tracing, metrics, and logging capabilities. It supports multiple sink types including OpenTelemetry, SQLite, and Console output for complete observability of your AI applications.
|
||||
|
||||
## Event Types
|
||||
|
||||
The telemetry system supports three main types of events:
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="unstructured" label="Unstructured Logs">
|
||||
|
||||
Free-form log messages with severity levels for general application logging:
|
||||
|
||||
```python
|
||||
unstructured_log_event = UnstructuredLogEvent(
|
||||
message="This is a log message",
|
||||
severity=LogSeverity.INFO
|
||||
)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="metrics" label="Metric Events">
|
||||
|
||||
Numerical measurements with units for tracking performance and usage:
|
||||
|
||||
```python
|
||||
metric_event = MetricEvent(
|
||||
metric="my_metric",
|
||||
value=10,
|
||||
unit="count"
|
||||
)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="structured" label="Structured Logs">
|
||||
|
||||
System events like span start/end that provide structured operation tracking:
|
||||
|
||||
```python
|
||||
structured_log_event = SpanStartPayload(
|
||||
name="my_span",
|
||||
parent_span_id="parent_span_id"
|
||||
)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Spans and Traces
|
||||
|
||||
- **Spans**: Represent individual operations with timing information and hierarchical relationships
|
||||
- **Traces**: Collections of related spans that form a complete request flow across your application
|
||||
|
||||
This hierarchical structure allows you to understand the complete execution path of requests through your Llama Stack application.
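
Recorded traces can be queried back through the telemetry API to reconstruct that execution path. A small sketch; the attribute filter assumes an existing agent `session_id`, and the exact query parameters may vary between versions:

```python
# Fetch recent traces for a session, then drill into the span tree of the first one
traces = client.telemetry.query_traces(
    attribute_filters=[
        {"key": "session_id", "op": "eq", "value": session_id},  # assumes an existing session ID
    ],
    limit=10,
)

span_tree = client.telemetry.get_span_tree(span_id=traces[0].root_span_id)
print(span_tree)
```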
|
||||
|
||||
## Automatic Metrics Generation
|
||||
|
||||
Llama Stack automatically generates metrics during inference operations. These metrics are aggregated at the **inference request level** and provide insights into token usage and model performance.
|
||||
|
||||
### Available Metrics
|
||||
|
||||
The following metrics are automatically generated for each inference request:
|
||||
|
||||
| Metric Name | Type | Unit | Description | Labels |
|
||||
|-------------|------|------|-------------|--------|
|
||||
| `llama_stack_prompt_tokens_total` | Counter | `tokens` | Number of tokens in the input prompt | `model_id`, `provider_id` |
|
||||
| `llama_stack_completion_tokens_total` | Counter | `tokens` | Number of tokens in the generated response | `model_id`, `provider_id` |
|
||||
| `llama_stack_tokens_total` | Counter | `tokens` | Total tokens used (prompt + completion) | `model_id`, `provider_id` |
|
||||
|
||||
### Metric Generation Flow
|
||||
|
||||
1. **Token Counting**: During inference operations (chat completion, completion, etc.), the system counts tokens in both input prompts and generated responses
|
||||
2. **Metric Construction**: For each request, `MetricEvent` objects are created with the token counts
|
||||
3. **Telemetry Logging**: Metrics are sent to the configured telemetry sinks
|
||||
4. **OpenTelemetry Export**: When OpenTelemetry is enabled, metrics are exposed as standard OpenTelemetry counters
|
||||
|
||||
### Metric Aggregation Level
|
||||
|
||||
All metrics are generated and aggregated at the **inference request level**. This means:
|
||||
|
||||
- Each individual inference request generates its own set of metrics
|
||||
- Metrics are not pre-aggregated across multiple requests
|
||||
- Aggregation (sums, averages, etc.) can be performed by your observability tools (Prometheus, Grafana, etc.)
|
||||
- Each metric includes labels for `model_id` and `provider_id` to enable filtering and grouping
|
||||
|
||||
### Example Metric Event
|
||||
|
||||
```python
|
||||
MetricEvent(
|
||||
trace_id="1234567890abcdef",
|
||||
span_id="abcdef1234567890",
|
||||
metric="total_tokens",
|
||||
value=150,
|
||||
timestamp=1703123456.789,
|
||||
unit="tokens",
|
||||
attributes={
|
||||
"model_id": "meta-llama/Llama-3.2-3B-Instruct",
|
||||
"provider_id": "tgi"
|
||||
},
|
||||
)
|
||||
```
|
||||
|
||||
## Telemetry Sinks
|
||||
|
||||
Choose from multiple sink types based on your observability needs:
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="opentelemetry" label="OpenTelemetry">
|
||||
|
||||
Send events to an OpenTelemetry Collector for integration with observability platforms:
|
||||
|
||||
**Use Cases:**
|
||||
- Visualizing traces in tools like Jaeger
|
||||
- Collecting metrics for Prometheus
|
||||
- Integration with enterprise observability stacks
|
||||
|
||||
**Features:**
|
||||
- Standard OpenTelemetry format
|
||||
- Compatible with all OpenTelemetry collectors
|
||||
- Supports both traces and metrics
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="sqlite" label="SQLite">
|
||||
|
||||
Store events in a local SQLite database for direct querying:
|
||||
|
||||
**Use Cases:**
|
||||
- Local development and debugging
|
||||
- Custom analytics and reporting
|
||||
- Offline analysis of application behavior
|
||||
|
||||
**Features:**
|
||||
- Direct SQL querying capabilities
|
||||
- Persistent local storage
|
||||
- No external dependencies
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="console" label="Console">
|
||||
|
||||
Print events to the console for immediate debugging:
|
||||
|
||||
**Use Cases:**
|
||||
- Development and testing
|
||||
- Quick debugging sessions
|
||||
- Simple logging without external tools
|
||||
|
||||
**Features:**
|
||||
- Immediate output visibility
|
||||
- No setup required
|
||||
- Human-readable format
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Configuration
|
||||
|
||||
### Meta-Reference Provider
|
||||
|
||||
Currently, only the meta-reference provider is implemented. It can be configured to send events to multiple sink types:
|
||||
|
||||
```yaml
telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "llama-stack-service"
      sinks: ['console', 'sqlite', 'otel_trace', 'otel_metric']
      otel_exporter_otlp_endpoint: "http://localhost:4318"
      sqlite_db_path: "/path/to/telemetry.db"
```
|
||||
|
||||
### Environment Variables
|
||||
|
||||
Configure telemetry behavior using environment variables:
|
||||
|
||||
- **`OTEL_EXPORTER_OTLP_ENDPOINT`**: OpenTelemetry Collector endpoint (default: `http://localhost:4318`)
|
||||
- **`OTEL_SERVICE_NAME`**: Service name for telemetry (default: empty string)
|
||||
- **`TELEMETRY_SINKS`**: Comma-separated list of sinks (default: `console,sqlite`)
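For example, a minimal sketch of setting these in the server's environment from Python before launch (the values shown are illustrative):

```python
import os

# Illustrative values; adjust for your collector endpoint and sink choices
os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = "http://localhost:4318"
os.environ["OTEL_SERVICE_NAME"] = "my-llama-stack-service"
os.environ["TELEMETRY_SINKS"] = "console,sqlite,otel_trace,otel_metric"
```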
|
||||
|
||||
## Visualization with Jaeger
|
||||
|
||||
The `otel_trace` sink works with any service compatible with the OpenTelemetry collector. Traces and metrics use separate endpoints but can share the same collector.
|
||||
|
||||
### Starting Jaeger
|
||||
|
||||
Start a Jaeger instance with the OTLP HTTP endpoint on port 4318 and the Jaeger UI on port 16686:
|
||||
|
||||
```bash
docker run --pull always --rm --name jaeger \
  -p 16686:16686 -p 4318:4318 \
  jaegertracing/jaeger:2.1.0
```
|
||||
|
||||
Once running, you can visualize traces by navigating to [http://localhost:16686/](http://localhost:16686/).
|
||||
|
||||
## Querying Metrics
|
||||
|
||||
When using the OpenTelemetry sink, metrics are exposed in standard format and can be queried through various tools:
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="prometheus" label="Prometheus Queries">
|
||||
|
||||
Example Prometheus queries for analyzing token usage:
|
||||
|
||||
```promql
|
||||
# Total tokens used across all models
|
||||
sum(llama_stack_tokens_total)
|
||||
|
||||
# Tokens per model
|
||||
sum by (model_id) (llama_stack_tokens_total)
|
||||
|
||||
# Average tokens per request over 5 minutes
|
||||
rate(llama_stack_tokens_total[5m])
|
||||
|
||||
# Token usage by provider
|
||||
sum by (provider_id) (llama_stack_tokens_total)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="grafana" label="Grafana Dashboards">
|
||||
|
||||
Create dashboards using Prometheus as a data source:
|
||||
|
||||
- **Token Usage Over Time**: Line charts showing token consumption trends
|
||||
- **Model Performance**: Comparison of different models by token efficiency
|
||||
- **Provider Analysis**: Breakdown of usage across different providers
|
||||
- **Request Patterns**: Understanding peak usage times and patterns
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="otlp" label="OpenTelemetry Collector">
|
||||
|
||||
Forward metrics to other observability systems:
|
||||
|
||||
- Export to multiple backends simultaneously
|
||||
- Apply transformations and filtering
|
||||
- Integrate with existing monitoring infrastructure
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## SQLite Querying
|
||||
|
||||
The `sqlite` sink allows you to query traces without an external system. This is particularly useful for development and custom analytics.
|
||||
|
||||
### Example Queries
|
||||
|
||||
```sql
|
||||
-- Query recent traces
|
||||
SELECT * FROM traces WHERE timestamp > datetime('now', '-1 hour');
|
||||
|
||||
-- Analyze span durations
|
||||
SELECT name, AVG(duration_ms) as avg_duration
|
||||
FROM spans
|
||||
GROUP BY name
|
||||
ORDER BY avg_duration DESC;
|
||||
|
||||
-- Find slow operations
|
||||
SELECT * FROM spans
|
||||
WHERE duration_ms > 1000
|
||||
ORDER BY duration_ms DESC;
|
||||
```
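The same queries can also be run programmatically. Here is a minimal sketch using Python's built-in `sqlite3` module, assuming the `spans` table shown above and the `sqlite_db_path` from your telemetry configuration:

```python
import sqlite3

# Path must match the sqlite_db_path in your telemetry configuration
conn = sqlite3.connect("/path/to/telemetry.db")
conn.row_factory = sqlite3.Row

# Average span duration per operation, slowest first
rows = conn.execute(
    "SELECT name, AVG(duration_ms) AS avg_duration "
    "FROM spans GROUP BY name ORDER BY avg_duration DESC"
).fetchall()

for row in rows:
    print(f"{row['name']}: {row['avg_duration']:.1f} ms")

conn.close()
```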
|
||||
|
||||
:::tip[Advanced Analytics]
|
||||
Refer to the [Getting Started notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb) for more examples on querying traces and spans programmatically.
|
||||
:::
|
||||
|
||||
## Best Practices
|
||||
|
||||
### 🔍 **Monitoring Strategy**
|
||||
- Use OpenTelemetry for production environments
|
||||
- Combine multiple sinks for development (console + SQLite)
|
||||
- Set up alerts on key metrics like token usage and error rates
|
||||
|
||||
### 📊 **Metrics Analysis**
|
||||
- Track token usage trends to optimize costs
|
||||
- Monitor response times across different models
|
||||
- Analyze usage patterns to improve resource allocation
|
||||
|
||||
### 🚨 **Alerting & Debugging**
|
||||
- Set up alerts for unusual token consumption spikes
|
||||
- Use trace data to debug performance issues
|
||||
- Monitor error rates and failure patterns
|
||||
|
||||
### 🔧 **Configuration Management**
|
||||
- Use environment variables for flexible deployment
|
||||
- Configure appropriate retention policies for SQLite
|
||||
- Ensure proper network access to OpenTelemetry collectors
|
||||
|
||||
## Integration Examples
|
||||
|
||||
### Basic Telemetry Setup
|
||||
|
||||
```python
from llama_stack_client import LlamaStackClient

# Client with telemetry headers
client = LlamaStackClient(
    base_url="http://localhost:8000",
    extra_headers={
        "X-Telemetry-Service": "my-ai-app",
        "X-Telemetry-Version": "1.0.0",
    },
)

# All API calls will be automatically traced
response = client.inference.chat_completion(
    model="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "Hello!"}],
)
```
|
||||
|
||||
### Custom Telemetry Context
|
||||
|
||||
```python
from opentelemetry import trace

# Assumes an OpenTelemetry tracer provider has been configured
tracer = trace.get_tracer(__name__)

# Add custom span attributes for better tracking
with tracer.start_as_current_span("custom_operation") as span:
    span.set_attribute("user_id", "user123")
    span.set_attribute("operation_type", "chat_completion")

    response = client.inference.chat_completion(
        model="meta-llama/Llama-3.2-3B-Instruct",
        messages=[{"role": "user", "content": "Hello!"}],
    )
```
|
||||
|
||||
## Related Resources
|
||||
|
||||
- **[Agents](./agent)** - Monitoring agent execution with telemetry
|
||||
- **[Evaluations](./evals)** - Using telemetry data for performance evaluation
|
||||
- **[Getting Started Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)** - Telemetry examples and queries
|
||||
- **[OpenTelemetry Documentation](https://opentelemetry.io/)** - Comprehensive observability framework
|
||||
- **[Jaeger Documentation](https://www.jaegertracing.io/)** - Distributed tracing visualization
|
|
@ -1,6 +1,17 @@
|
|||
---
|
||||
title: Tools
|
||||
description: Extend agent capabilities with external tools and function calling
|
||||
sidebar_label: Tools
|
||||
sidebar_position: 6
|
||||
---
|
||||
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Tools
|
||||
|
||||
Tools are functions that can be invoked by an agent to perform tasks. They are organized into tool groups and registered with specific providers. Each tool group represents a collection of related tools from a single provider. Tools are grouped so that state can be externalized: the tools in a collection typically operate on the same shared state.
|
||||
|
||||
An example of this would be a "db_access" tool group that contains tools for interacting with a database. "list_tables", "query_table", "insert_row" could be examples of tools in this group.
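As an illustration only, registering such a group could look like the following; the toolgroup and provider identifiers are made up, and the `client.toolgroups.register` call is the same one used elsewhere on this page:

```python
# Hypothetical "db_access" tool group backed by a hypothetical provider
client.toolgroups.register(
    toolgroup_id="db_access",
    provider_id="my-database-tools",
)
```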
|
||||
|
||||
Tools are treated like any other resource in Llama Stack, such as models. You can register them, have providers for them, and so on.
|
||||
|
@ -9,18 +20,15 @@ When instantiating an agent, you can provide it a list of tool groups that it ha
|
|||
|
||||
Refer to the [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb) notebook for more examples on how to use tools.
|
||||
|
||||
## Server-side vs. client-side tool execution
|
||||
## Server-side vs. Client-side Tool Execution
|
||||
|
||||
Llama Stack allows you to use both server-side and client-side tools. With server-side tools, `agent.create_turn` can perform execution of the tool calls emitted by the model
|
||||
transparently giving the user the final answer desired. If client-side tools are provided, the tool call is sent back to the user for execution
|
||||
and optional continuation using the `agent.resume_turn` method.
|
||||
Llama Stack allows you to use both server-side and client-side tools. With server-side tools, `agent.create_turn` can perform execution of the tool calls emitted by the model transparently giving the user the final answer desired. If client-side tools are provided, the tool call is sent back to the user for execution and optional continuation using the `agent.resume_turn` method.
|
||||
|
||||
|
||||
### Server-side tools
|
||||
## Server-side Tools
|
||||
|
||||
Llama Stack provides built-in providers for some common tools. These include web search, math, and RAG capabilities.
|
||||
|
||||
#### Web Search
|
||||
### Web Search
|
||||
|
||||
You have three providers to execute the web search tool calls generated by a model: Brave Search, Bing Search, and Tavily Search.
|
||||
|
||||
|
@ -39,25 +47,26 @@ The tool requires an API key which can be provided either in the configuration o
|
|||
{"<provider_name>_api_key": <your api key>}
|
||||
```
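For example, a minimal sketch of supplying the key per request via the client's provider data, following the `<provider_name>_api_key` pattern above (the Tavily key name and server address are assumptions):

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(
    base_url="http://localhost:8321",  # assumed server address
    provider_data={"tavily_search_api_key": "your-tavily-api-key"},
)
```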
|
||||
|
||||
|
||||
#### Math
|
||||
### Math
|
||||
|
||||
The WolframAlpha tool provides access to computational knowledge through the WolframAlpha API.
|
||||
|
||||
```python
|
||||
client.toolgroups.register(
|
||||
toolgroup_id="builtin::wolfram_alpha", provider_id="wolfram-alpha"
|
||||
toolgroup_id="builtin::wolfram_alpha",
|
||||
provider_id="wolfram-alpha"
|
||||
)
|
||||
```
|
||||
|
||||
Example usage:
|
||||
```python
|
||||
result = client.tool_runtime.invoke_tool(
|
||||
tool_name="wolfram_alpha", args={"query": "solve x^2 + 2x + 1 = 0"}
|
||||
tool_name="wolfram_alpha",
|
||||
args={"query": "solve x^2 + 2x + 1 = 0"}
|
||||
)
|
||||
```
|
||||
|
||||
#### RAG
|
||||
### RAG
|
||||
|
||||
The RAG tool enables retrieval of context from various types of memory banks (vector, key-value, keyword, and graph).
|
||||
|
||||
|
@ -75,16 +84,13 @@ Features:
|
|||
- Configurable query generation
|
||||
- Context retrieval with token limits
|
||||
|
||||
|
||||
```{note}
|
||||
:::note[Default Configuration]
|
||||
By default, llama stack run.yaml defines toolgroups for web search, wolfram alpha and rag, that are provided by tavily-search, wolfram-alpha and rag providers.
|
||||
```
|
||||
:::
|
||||
|
||||
## Model Context Protocol (MCP)
|
||||
|
||||
[MCP](https://github.com/modelcontextprotocol) is an upcoming, popular standard for tool discovery and execution. It is a protocol that allows tools to be dynamically discovered
|
||||
from an MCP endpoint and can be used to extend the agent's capabilities.
|
||||
|
||||
[MCP](https://github.com/modelcontextprotocol) is an upcoming, popular standard for tool discovery and execution. It is a protocol that allows tools to be dynamically discovered from an MCP endpoint and can be used to extend the agent's capabilities.
|
||||
|
||||
### Using Remote MCP Servers
|
||||
|
||||
|
@ -98,8 +104,7 @@ client.toolgroups.register(
|
|||
)
|
||||
```
|
||||
|
||||
Note that most of the more useful MCP servers need you to authenticate with them. Many of them use OAuth2.0 for authentication. You can provide authorization headers to send to the MCP server
|
||||
using the "Provider Data" abstraction provided by Llama Stack. When making an agent call,
|
||||
Note that most of the more useful MCP servers need you to authenticate with them. Many of them use OAuth2.0 for authentication. You can provide authorization headers to send to the MCP server using the "Provider Data" abstraction provided by Llama Stack. When making an agent call,
|
||||
|
||||
```python
|
||||
agent = Agent(
|
||||
|
@ -120,20 +125,26 @@ agent = Agent(
|
|||
agent.create_turn(...)
|
||||
```
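As an illustrative sketch only, the authorization headers can be attached as provider data when constructing the agent. The payload shape, in particular the `mcp_headers` key and the `extra_headers` argument, is an assumption and not taken verbatim from this page:

```python
import json

auth_headers = {"Authorization": "Bearer <your-oauth-token>"}

agent = Agent(
    client,
    model="meta-llama/Llama-3.2-3B-Instruct",
    instructions="You are a helpful assistant.",
    tools=["mcp::my_mcp_server"],  # hypothetical MCP tool group id
    extra_headers={
        "X-LlamaStack-Provider-Data": json.dumps(
            {"mcp_headers": {"http://localhost:8000/sse": auth_headers}}
        ),
    },
)
```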
|
||||
|
||||
### Running your own MCP server
|
||||
### Running Your Own MCP Server
|
||||
|
||||
Here's an example of how to run a simple MCP server that exposes a File System as a set of tools to the Llama Stack agent.
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="setup" label="Server Setup">
|
||||
|
||||
```shell
|
||||
# start your MCP server
|
||||
# Start your MCP server
|
||||
mkdir /tmp/content
|
||||
touch /tmp/content/foo
|
||||
touch /tmp/content/bar
|
||||
npx -y supergateway --port 8000 --stdio 'npx -y @modelcontextprotocol/server-filesystem /tmp/content'
|
||||
```
|
||||
|
||||
Then register the MCP server as a tool group,
|
||||
</TabItem>
|
||||
<TabItem value="register" label="Registration">
|
||||
|
||||
```python
|
||||
# Register the MCP server as a tool group
|
||||
client.toolgroups.register(
|
||||
toolgroup_id="mcp::filesystem",
|
||||
provider_id="model-context-protocol",
|
||||
|
@ -141,12 +152,12 @@ client.toolgroups.register(
|
|||
)
|
||||
```
|
||||
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
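Once registered, the MCP tool group can be handed to an agent like any other tool group. A short sketch (the model id and prompt are illustrative):

```python
# Use the MCP-provided filesystem tools from an agent
agent = Agent(
    client,
    model="meta-llama/Llama-3.2-3B-Instruct",
    instructions="You are a helpful assistant that can inspect files.",
    tools=["mcp::filesystem"],
)

session_id = agent.create_session("filesystem-session")
response = agent.create_turn(
    messages=[{"role": "user", "content": "List the files you can see."}],
    session_id=session_id,
)
```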
|
||||
|
||||
## Adding Custom (Client-side) Tools
|
||||
|
||||
When you want to use tools other than the built-in tools, you just need to implement a python function with a docstring. The content of the docstring will be used to describe the tool and the parameters and passed
|
||||
along to the generative model.
|
||||
When you want to use tools other than the built-in tools, you just need to implement a python function with a docstring. The content of the docstring will be used to describe the tool and the parameters and passed along to the generative model.
|
||||
|
||||
```python
|
||||
# Example tool definition
|
||||
|
@ -158,9 +169,13 @@ def my_tool(input: int) -> int:
|
|||
"""
|
||||
return input * 2
|
||||
```
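For reference, a complete, self-contained version of such a tool might look like this; the name, docstring, and behavior are illustrative rather than the exact snippet abbreviated above:

```python
def my_tool(input: int) -> int:
    """
    Doubles the given number.

    :param input: the number to double
    :returns: the doubled number
    """
    return input * 2
```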
|
||||
> **NOTE:** We employ python docstrings to describe the tool and the parameters. It is important to document the tool and the parameters so that the model can use the tool correctly. It is recommended to experiment with different docstrings to see how they affect the model's behavior.
|
||||
|
||||
:::tip[Documentation Best Practices]
|
||||
We employ python docstrings to describe the tool and the parameters. It is important to document the tool and the parameters so that the model can use the tool correctly. It is recommended to experiment with different docstrings to see how they affect the model's behavior.
|
||||
:::
|
||||
|
||||
Once defined, simply pass the tool to the agent config. `Agent` will take care of the rest (calling the model with the tool definition, executing the tool, and returning the result to the model for the next iteration).
|
||||
|
||||
```python
|
||||
# Example agent config with client provided tools
|
||||
agent = Agent(client, ..., tools=[my_tool])
|
||||
|
@ -168,14 +183,14 @@ agent = Agent(client, ..., tools=[my_tool])
|
|||
|
||||
Refer to [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/blob/main/examples/agents/e2e_loop_with_client_tools.py) for an example of how to use client provided tools.
|
||||
|
||||
|
||||
## Tool Invocation
|
||||
|
||||
Tools can be invoked using the `invoke_tool` method:
|
||||
|
||||
```python
|
||||
result = client.tool_runtime.invoke_tool(
|
||||
tool_name="web_search", kwargs={"query": "What is the capital of France?"}
|
||||
tool_name="web_search",
|
||||
kwargs={"query": "What is the capital of France?"}
|
||||
)
|
||||
```
|
||||
|
||||
|
@ -196,7 +211,13 @@ all_tools = client.tools.list_tools()
|
|||
group_tools = client.tools.list_tools(toolgroup_id="search_tools")
|
||||
```
|
||||
|
||||
## Simple Example 2: Using an Agent with the Web Search Tool
|
||||
## Complete Examples
|
||||
|
||||
### Web Search Agent
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="setup" label="Setup & Configuration">
|
||||
|
||||
1. Start by registering a Tavily API key at [Tavily](https://tavily.com/).
|
||||
2. [Optional] Provide the API key directly to the Llama Stack server
|
||||
```bash
|
||||
|
@ -205,7 +226,10 @@ export TAVILY_SEARCH_API_KEY="your key"
|
|||
```bash
|
||||
--env TAVILY_SEARCH_API_KEY=${TAVILY_SEARCH_API_KEY}
|
||||
```
|
||||
3. Run the following script.
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="implementation" label="Implementation">
|
||||
|
||||
```python
|
||||
from llama_stack_client.lib.agents.agent import Agent
|
||||
from llama_stack_client.types.agent_create_params import AgentConfig
|
||||
|
@ -240,7 +264,14 @@ for log in EventLogger().log(response):
|
|||
log.print()
|
||||
```
|
||||
|
||||
## Simple Example3: Using an Agent with the WolframAlpha Tool
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
### WolframAlpha Math Agent
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="setup" label="Setup & Configuration">
|
||||
|
||||
1. Start by registering for a WolframAlpha API key at [WolframAlpha Developer Portal](https://developer.wolframalpha.com/access).
|
||||
2. Provide the API key either when starting the Llama Stack server:
|
||||
```bash
|
||||
|
@ -253,12 +284,57 @@ for log in EventLogger().log(response):
|
|||
provider_data={"wolfram_alpha_api_key": wolfram_api_key},
|
||||
)
|
||||
```
|
||||
3. Configure the tools in the Agent by setting `tools=["builtin::wolfram_alpha"]`.
|
||||
4. Example user query:
|
||||
```python
|
||||
response = agent.create_turn(
|
||||
messages=[{"role": "user", "content": "Solve x^2 + 2x + 1 = 0 using WolframAlpha"}],
|
||||
session_id=session_id,
|
||||
)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="implementation" label="Implementation">
|
||||
|
||||
```python
|
||||
# Configure the tools in the Agent by setting tools=["builtin::wolfram_alpha"]
|
||||
agent = Agent(
|
||||
client,
|
||||
model="meta-llama/Llama-3.2-3B-Instruct",
|
||||
instructions="You are a mathematical assistant that can solve complex equations.",
|
||||
tools=["builtin::wolfram_alpha"],
|
||||
)
|
||||
|
||||
session_id = agent.create_session("math-session")
|
||||
|
||||
# Example user query
|
||||
response = agent.create_turn(
|
||||
messages=[{"role": "user", "content": "Solve x^2 + 2x + 1 = 0 using WolframAlpha"}],
|
||||
session_id=session_id,
|
||||
)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Best Practices
|
||||
|
||||
### 🛠️ **Tool Selection**
|
||||
- Use **server-side tools** for production applications requiring reliability and security
|
||||
- Use **client-side tools** for development, prototyping, or specialized integrations
|
||||
- Combine multiple tool types for comprehensive functionality
|
||||
|
||||
### 📝 **Documentation**
|
||||
- Write clear, detailed docstrings for custom tools
|
||||
- Include parameter descriptions and expected return types
|
||||
- Test tool descriptions with the model to ensure proper usage
|
||||
|
||||
### 🔐 **Security**
|
||||
- Store API keys securely using environment variables or secure configuration
|
||||
- Use the `X-LlamaStack-Provider-Data` header for dynamic authentication
|
||||
- Validate tool inputs and outputs for security
|
||||
|
||||
### 🔄 **Error Handling**
|
||||
- Implement proper error handling in custom tools
|
||||
- Use structured error responses with meaningful messages
|
||||
- Monitor tool performance and reliability
|
||||
|
||||
## Related Resources
|
||||
|
||||
- **[Agents](./agent)** - Building intelligent agents with tools
|
||||
- **[RAG (Retrieval Augmented Generation)](./rag)** - Using knowledge retrieval tools
|
||||
- **[Agent Execution Loop](./agent_execution_loop)** - Understanding tool execution flow
|
||||
- **[Building AI Applications Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)** - Comprehensive examples
|
||||
- **[Llama Stack Apps Examples](https://github.com/meta-llama/llama-stack-apps)** - Real-world tool implementations
|
101
docs/docs/concepts/apis/api_leveling.mdx
Normal file
|
@ -0,0 +1,101 @@
|
|||
---
|
||||
title: API Stability Leveling
|
||||
description: Understanding API stability levels and versioning in Llama Stack
|
||||
sidebar_label: API Stability
|
||||
sidebar_position: 4
|
||||
---
|
||||
|
||||
# Llama Stack API Stability Leveling
|
||||
|
||||
In order to provide a stable experience in Llama Stack, the various APIs need different stability levels indicating the level of support, backwards compatibility, and overall production readiness.
|
||||
|
||||
## Different Levels
|
||||
|
||||
### v1alpha
|
||||
|
||||
- Little to no expectation of support between versions
|
||||
- Breaking changes are permitted
|
||||
- Datatypes and parameters can break
|
||||
- Routes can be added and removed
|
||||
|
||||
#### Graduation Criteria
|
||||
|
||||
- an API can graduate from `v1alpha` to `v1beta` if the team has identified the extent of the non-optional routes and the shape of their parameters/return types for the API, e.g. `/v1/openai/chat/completions`. Optional types can change.
|
||||
- CRUD must stay stable once in `v1beta`. This is a commitment to backward compatibility, guaranteeing that most code you write against the v1beta version will not break during future updates. We may make additive changes (like adding a new, optional field to a response), but we will not make breaking changes (like renaming an existing "modelName" field to "name", changing an ID's data type from an integer to a string, or altering an endpoint URL).
|
||||
- for OpenAI APIs, a comparison to the OpenAI spec for the specific API can be done to ensure completeness.
|
||||
|
||||
### v1beta
|
||||
|
||||
- API routes remain consistent between versions
|
||||
- Parameters and return types are not ensured between versions
|
||||
- API, besides minor fixes and adjustments, should be _almost_ v1. Changes should not be drastic.
|
||||
|
||||
#### Graduation Criteria
|
||||
|
||||
- an API can graduate from `v1beta` to `v1` if the API surface and datatypes are complete as identified by the team. The parameters and return types that are mandatory for each route are stable. All aspects of graduating from `v1alpha1` to `v1beta` apply as well.
|
||||
- Optional parameters, routes, or parts of the return type can be added after graduating to `v1`
|
||||
|
||||
### v1 (stable)
|
||||
|
||||
- Considered stable
|
||||
- Backwards compatible between Z-streams
|
||||
- Y-stream breaking changes must go through the proper approval and announcement process.
|
||||
- Datatypes for a route and its return types cannot change between Z-streams
|
||||
- Y-stream datatype changes should be sparing, unless the changes are additional net-new parameters
|
||||
- Must have proper conformance testing as outlined in https://github.com/llamastack/llama-stack/issues/3237
|
||||
|
||||
### v2+ (Major Versions)
|
||||
|
||||
Introducing a new major version like `/v2` is a significant and disruptive event that should be treated as a last resort. It is reserved for essential changes to a stable `/v1` API that are fundamentally backward-incompatible and cannot be implemented through additive, non-breaking changes or breaking changes across X/Y-Stream releases (x.y.z).
|
||||
|
||||
If a `/v2` version is deemed absolutely necessary, it must adhere to the following protocol to ensure a sane and predictable transition for users:
|
||||
|
||||
#### Lifecycle Progression
|
||||
|
||||
A new major version must follow the same stability lifecycle as `/v1`. It will be introduced as `/v2alpha`, mature to `/v2beta`, and finally become stable as `/v2`.
|
||||
|
||||
#### Coexistence:
|
||||
|
||||
The new `/v2` API must be introduced alongside the existing `/v1` API and run in parallel. It must not replace the `/v1` API immediately.
|
||||
|
||||
#### Deprecation Policy:
|
||||
|
||||
When a `/v2` API is introduced, a clear and generous deprecation policy for the `/v1` API must be published simultaneously. This policy must outline the timeline for the eventual removal of the `/v1` API, giving users ample time to migrate.
|
||||
|
||||
### API Stability vs. Provider Stability
|
||||
|
||||
The leveling introduced in this document relates to the stability of the API and not specifically the providers within the API.
|
||||
|
||||
Providers can iterate as much as they want on functionality as long as they work within the bounds of an API. If they need to change the API, then the API should not be `/v1`, or those breaking changes can only happen on a y-stream release basis.
|
||||
|
||||
### Approval and Announcement Process for Breaking Changes
|
||||
|
||||
- **PR Labeling**: Any pull request that introduces a breaking API change must be clearly labeled with `breaking-change`.
|
||||
- **PR Title/Commit**: Any pull request that introduces a breaking API change must contain `BREAKING CHANGE` in the title and commit footer. Alternatively, the commit can include `!`, e.g. `feat(api)!: title goes here`. This is outlined in the [conventional commits documentation](https://www.conventionalcommits.org/en/v1.0.0/#specification)
|
||||
- **Maintainer Review**: At least one maintainer must explicitly acknowledge the breaking change during review by applying the `breaking-change` label. An approval must come with this label or the acknowledgement this label has already been applied.
|
||||
- **Announcement**: Breaking changes require inclusion in release notes and, if applicable, a separate communication (e.g., Discord, Github Issues, or GitHub Discussions) prior to release.
|
||||
|
||||
If a PR has proper approvals, labels, and commit/title hygiene, the failing API conformance tests will be bypassed.
|
||||
|
||||
|
||||
## Enforcement
|
||||
|
||||
### Migration of API routes under `/v1alpha`, `/v1beta`, and `/v1`
|
||||
|
||||
Instead of placing every API under `/v1`, any API that is not fully stable or complete should go under `/v1alpha` or `/v1beta`. For example, at the time of this writing, `post_training` belongs here, as well as any OpenAI-compatible API whose surface does not exactly match the upstream OpenAI API it mimics.
|
||||
|
||||
This migration is crucial as we get Llama Stack in the hands of users who intend to productize various APIs. A clear view of what is stable and what is actively being developed will enable users to pick and choose various APIs to build their products on.
|
||||
|
||||
This migration will be a breaking change for any API moving out of `/v1`. Ideally, this should happen before 0.3.0 and especially 1.0.0.
|
||||
|
||||
### `x-stability` tags in the OpenAPI spec for oasdiff
|
||||
|
||||
`x-stability` tags allow tools like oasdiff to enforce different rules for different stability levels; these tags should match the routes: [oasdiff stability](https://github.com/oasdiff/oasdiff/blob/main/docs/STABILITY.md)
|
||||
|
||||
### Testing
|
||||
|
||||
The testing of each stable API is already outlined in [issue #3237](https://github.com/llamastack/llama-stack/issues/3237) and is being worked on. These sorts of conformance tests should apply primarily to `/v1` APIs only, with `/v1alpha` and `/v1beta` having any tests the maintainers see fit as well as basic testing to ensure the routing works properly.
|
||||
|
||||
### New APIs going forward
|
||||
|
||||
Any subsequently introduced APIs should be introduced as `/v1alpha`.
|
|
@ -1,4 +1,11 @@
|
|||
## API Providers
|
||||
---
|
||||
title: API Providers
|
||||
description: Understanding remote vs inline provider implementations
|
||||
sidebar_label: API Providers
|
||||
sidebar_position: 2
|
||||
---
|
||||
|
||||
# API Providers
|
||||
|
||||
The goal of Llama Stack is to build an ecosystem where users can easily swap out different implementations for the same API. Examples for these include:
|
||||
- LLM inference providers (e.g., Fireworks, Together, AWS Bedrock, Groq, Cerebras, SambaNova, vLLM, etc.),
|
|
@ -1,3 +1,9 @@
|
|||
---
|
||||
title: External APIs
|
||||
description: Understanding external APIs in Llama Stack
|
||||
sidebar_label: External APIs
|
||||
sidebar_position: 3
|
||||
---
|
||||
# External APIs
|
||||
|
||||
Llama Stack supports external APIs that live outside of the main codebase. This allows you to:
|
|
@ -1,4 +1,11 @@
|
|||
## APIs
|
||||
---
|
||||
title: APIs
|
||||
description: Available REST APIs and planned capabilities in Llama Stack
|
||||
sidebar_label: APIs
|
||||
sidebar_position: 1
|
||||
---
|
||||
|
||||
# APIs
|
||||
|
||||
A Llama Stack API is described as a collection of REST endpoints. We currently support the following APIs:
|
||||
|
|
@ -1,15 +1,19 @@
|
|||
## Llama Stack architecture
|
||||
---
|
||||
title: Llama Stack Architecture
|
||||
description: Understanding Llama Stack's service-oriented design and benefits
|
||||
sidebar_label: Architecture
|
||||
sidebar_position: 2
|
||||
---
|
||||
|
||||
# Llama Stack architecture
|
||||
|
||||
Llama Stack allows you to build different layers of distributions for your AI workloads using various SDKs and API providers.
|
||||
|
||||
```{image} ../../_static/llama-stack.png
|
||||
:alt: Llama Stack
|
||||
:width: 400px
|
||||
```
|
||||
<img src="/img/llama-stack.png" alt="Llama Stack" width="400" />
|
||||
|
||||
### Benefits of Llama stack
|
||||
## Benefits of Llama stack
|
||||
|
||||
#### Current challenges in custom AI applications
|
||||
### Current challenges in custom AI applications
|
||||
|
||||
Building production AI applications today requires solving multiple challenges:
|
||||
|
||||
|
@ -32,7 +36,7 @@ Building production AI applications today requires solving multiple challenges:
|
|||
- Different providers have different APIs and abstractions.
|
||||
- Changing providers requires significant code changes.
|
||||
|
||||
#### Our Solution: A Universal Stack
|
||||
### Our Solution: A Universal Stack
|
||||
|
||||
Llama Stack addresses these challenges through a service-oriented, API-first approach:
|
||||
|
||||
|
@ -59,7 +63,7 @@ Llama Stack addresses these challenges through a service-oriented, API-first app
|
|||
- Ecosystem offers tailored infrastructure, software, and services for deploying a variety of models.
|
||||
|
||||
|
||||
### Our Philosophy
|
||||
## Our Philosophy
|
||||
|
||||
- **Service-Oriented**: REST APIs enforce clean interfaces and enable seamless transitions across different environments.
|
||||
- **Composability**: Every component is independent but works together seamlessly
|
||||
|
@ -67,4 +71,4 @@ Llama Stack addresses these challenges through a service-oriented, API-first app
|
|||
- **Turnkey Solutions**: Easy to deploy built in solutions for popular deployment scenarios
|
||||
|
||||
|
||||
With Llama Stack, you can focus on building your application while we handle the infrastructure complexity, essential capabilities, and provider integrations.
|
||||
With Llama Stack, you can focus on building your application while we handle the infrastructure complexity, essential capabilities, and provider integrations.
|
|
@ -1,4 +1,11 @@
|
|||
## Distributions
|
||||
---
|
||||
title: Distributions
|
||||
description: Pre-packaged provider configurations for different deployment scenarios
|
||||
sidebar_label: Distributions
|
||||
sidebar_position: 3
|
||||
---
|
||||
|
||||
# Distributions
|
||||
|
||||
While there is a lot of flexibility to mix-and-match providers, often users will work with a specific set of providers (hardware support, contractual obligations, etc.) We therefore need to provide a _convenient shorthand_ for such collections. We call this shorthand a **Llama Stack Distribution** or a **Distro**. One can think of it as specific pre-packaged versions of the Llama Stack. Here are some examples:
|
||||
|
||||
|
@ -6,4 +13,4 @@ While there is a lot of flexibility to mix-and-match providers, often users will
|
|||
|
||||
**Locally Hosted Distro**: You may want to run Llama Stack on your own hardware. Typically though, you still need to use Inference via an external service. You can use providers like HuggingFace TGI, Fireworks, Together, etc. for this purpose. Or you may have access to GPUs and can run a [vLLM](https://github.com/vllm-project/vllm) or [NVIDIA NIM](https://build.nvidia.com/nim?filters=nimType%3Anim_type_run_anywhere&q=llama) instance. If you "just" have a regular desktop machine, you can use [Ollama](https://ollama.com/) for inference. To provide convenient quick access to these options, we provide a number of such pre-configured locally-hosted Distros.
|
||||
|
||||
**On-device Distro**: To run Llama Stack directly on an edge device (mobile phone or a tablet), we provide Distros for [iOS](../distributions/ondevice_distro/ios_sdk.md) and [Android](../distributions/ondevice_distro/android_sdk.md)
|
||||
**On-device Distro**: To run Llama Stack directly on an edge device (mobile phone or a tablet), we provide Distros for [iOS](/docs/distributions/ondevice_distro/ios_sdk) and [Android](/docs/distributions/ondevice_distro/android_sdk)
|
|
@ -1,16 +1,22 @@
|
|||
## Evaluation Concepts
|
||||
---
|
||||
title: Evaluation Concepts
|
||||
description: Running evaluations on Llama Stack
|
||||
sidebar_label: Evaluation Concepts
|
||||
sidebar_position: 5
|
||||
---
|
||||
|
||||
# Evaluation Concepts
|
||||
|
||||
The Llama Stack Evaluation flow allows you to run evaluations on your GenAI application datasets or pre-registered benchmarks.
|
||||
|
||||
We introduce a set of APIs in Llama Stack for supporting running evaluations of LLM applications.
|
||||
We introduce a set of APIs in Llama Stack for supporting running evaluations of LLM applications:
|
||||
- `/datasetio` + `/datasets` API
|
||||
- `/scoring` + `/scoring_functions` API
|
||||
- `/eval` + `/benchmarks` API
|
||||
|
||||
This guide goes over the sets of APIs and developer experience flow of using Llama Stack to run evaluations for different use cases. Checkout our Colab notebook on working examples with evaluations [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing).
|
||||
|
||||
|
||||
The Evaluation APIs are associated with a set of Resources. Please visit the Resources section in our [Core Concepts](../concepts/index.md) guide for better high-level understanding.
|
||||
The Evaluation APIs are associated with a set of Resources. Please visit the Resources section in our [Core Concepts](./index.mdx) guide for better high-level understanding.
|
||||
|
||||
- **DatasetIO**: defines interface with datasets and data loaders.
|
||||
- Associated with `Dataset` resource.
|
||||
|
@ -19,10 +25,9 @@ The Evaluation APIs are associated with a set of Resources. Please visit the Res
|
|||
- **Eval**: generate outputs (via Inference or Agents) and perform scoring.
|
||||
- Associated with `Benchmark` resource.
|
||||
|
||||
## Open-benchmark Eval
|
||||
|
||||
### Open-benchmark Eval
|
||||
|
||||
#### List of open-benchmarks Llama Stack support
|
||||
### List of open-benchmarks Llama Stack support
|
||||
|
||||
Llama Stack pre-registers several popular open-benchmarks so you can easily evaluate model performance via the CLI.
|
||||
|
||||
|
@ -32,19 +37,17 @@ The list of open-benchmarks we currently support:
|
|||
- [SimpleQA](https://openai.com/index/introducing-simpleqa/): Benchmark designed to assess models' ability to answer short, fact-seeking questions.
|
||||
- [MMMU](https://arxiv.org/abs/2311.16502) (A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI): Benchmark designed to evaluate multimodal models.
|
||||
|
||||
You can follow this [contributing guide](../references/evals_reference/#open-benchmark-contributing-guide) to add more open-benchmarks to Llama Stack
|
||||
|
||||
You can follow this [contributing guide](../references/evals_reference/index.md#open-benchmark-contributing-guide) to add more open-benchmarks to Llama Stack
|
||||
|
||||
#### Run evaluation on open-benchmarks via CLI
|
||||
### Run evaluation on open-benchmarks via CLI
|
||||
|
||||
We have built-in functionality to run the supported open-benchmarks using the llama-stack-client CLI.
|
||||
|
||||
#### Spin up Llama Stack server
|
||||
|
||||
Spin up the Llama Stack server with the 'open-benchmark' template:
|
||||
```
|
||||
```bash
|
||||
llama stack run llama_stack/distributions/open-benchmark/run.yaml
|
||||
|
||||
```
|
||||
|
||||
#### Run eval CLI
|
||||
|
@ -52,26 +55,24 @@ There are 3 necessary inputs to run a benchmark eval
|
|||
- `list of benchmark_ids`: The list of benchmark ids to run evaluation on
|
||||
- `model-id`: The model id to evaluate on
|
||||
- `output_dir`: Path to store the evaluate results
|
||||
```
|
||||
|
||||
```bash
|
||||
llama-stack-client eval run-benchmark <benchmark_id_1> <benchmark_id_2> ... \
|
||||
--model_id <model id to evaluate on> \
|
||||
--output_dir <directory to store the evaluate results> \
|
||||
--output_dir <directory to store the evaluate results>
|
||||
```
|
||||
|
||||
You can run
|
||||
```
|
||||
```bash
|
||||
llama-stack-client eval run-benchmark help
|
||||
```
|
||||
to see the description of all the flags that eval run-benchmark has
|
||||
|
||||
|
||||
In the output log, you can find the file path that has your evaluation results. Open that file and you can see your aggregate evaluation results there.
|
||||
|
||||
|
||||
|
||||
#### What's Next?
|
||||
## What's Next?
|
||||
|
||||
- Check out our Colab notebook on working examples with running benchmark evaluations [here](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb#scrollTo=mxLCsP4MvFqP).
|
||||
- Check out our [Building Applications - Evaluation](../building_applications/evals.md) guide for more details on how to use the Evaluation APIs to evaluate your applications.
|
||||
- Check out our [Evaluation Reference](../references/evals_reference/index.md) for more details on the APIs.
|
||||
- Check out our [Building Applications - Evaluation](../building_applications/evals.mdx) guide for more details on how to use the Evaluation APIs to evaluate your applications.
|
||||
- Check out our [Evaluation Reference](../references/evals_reference/) for more details on the APIs.
|
31
docs/docs/concepts/index.mdx
Normal file
|
@ -0,0 +1,31 @@
|
|||
---
|
||||
title: Core Concepts
|
||||
description: Understanding Llama Stack's service-oriented philosophy and key concepts
|
||||
sidebar_label: Overview
|
||||
sidebar_position: 1
|
||||
---
|
||||
|
||||
Given Llama Stack's service-oriented philosophy, a few concepts and workflows arise which may not feel completely natural in the LLM landscape, especially if you are coming with a background in other frameworks.
|
||||
|
||||
## Documentation Structure
|
||||
|
||||
This section covers the fundamental concepts of Llama Stack:
|
||||
|
||||
- **[Architecture](architecture.mdx)** - Learn about Llama Stack's architectural design and principles
|
||||
- **[APIs](/docs/concepts/apis/)** - Understanding the core APIs and their stability levels
|
||||
- [API Overview](apis/index.mdx) - Core APIs available in Llama Stack
|
||||
- [API Providers](apis/api_providers.mdx) - How providers implement APIs
|
||||
- [External APIs](apis/external.mdx) - External APIs available in Llama Stack
|
||||
- [API Stability Leveling](apis/api_leveling.mdx) - API stability and versioning
|
||||
- **[Distributions](distributions.mdx)** - Pre-configured deployment packages
|
||||
- **[Resources](resources.mdx)** - Understanding Llama Stack resources and their lifecycle
|
||||
|
||||
## Getting Started
|
||||
|
||||
If you're new to Llama Stack, we recommend starting with:
|
||||
|
||||
1. **[Architecture](architecture.mdx)** - Understand the overall system design
|
||||
2. **[APIs](apis/index.mdx)** - Learn about the available APIs and their purpose
|
||||
3. **[Distributions](distributions.mdx)** - Choose a pre-configured setup for your use case
|
||||
|
||||
Each concept builds upon the previous ones to give you a comprehensive understanding of how Llama Stack works and how to use it effectively.
|
|
@ -1,4 +1,11 @@
|
|||
## Resources
|
||||
---
|
||||
title: Resources
|
||||
description: Resource federation and registration in Llama Stack
|
||||
sidebar_label: Resources
|
||||
sidebar_position: 4
|
||||
---
|
||||
|
||||
# Resources
|
||||
|
||||
Some of these APIs are associated with a set of **Resources**. Here is the mapping of APIs to resources:
|
||||
|
||||
|
@ -12,8 +19,8 @@ Some of these APIs are associated with a set of **Resources**. Here is the mappi
|
|||
|
||||
Furthermore, we allow these resources to be **federated** across multiple providers. For example, you may have some Llama models served by Fireworks while others are served by AWS Bedrock. Regardless, they will all work seamlessly with the same uniform Inference API provided by Llama Stack.
|
||||
|
||||
```{admonition} Registering Resources
|
||||
:class: tip
|
||||
:::tip Registering Resources
|
||||
|
||||
Given this architecture, it is necessary for the Stack to know which provider to use for a given resource. This means you need to explicitly _register_ resources (including models) before you can use them with the associated APIs.
|
||||
```
|
||||
|
||||
:::
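For example, a minimal sketch of registering a model before using it with the Inference API (the identifiers are illustrative, and the exact keyword names may differ slightly in your client version):

```python
# Register a model with the provider that serves it before calling inference APIs
client.models.register(
    model_id="meta-llama/Llama-3.2-3B-Instruct",
    provider_id="fireworks",
)
```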
|
233
docs/docs/contributing/index.mdx
Normal file
|
@ -0,0 +1,233 @@
|
|||
# Contributing to Llama Stack
|
||||
We want to make contributing to this project as easy and transparent as
|
||||
possible.
|
||||
|
||||
## Set up your development environment
|
||||
|
||||
We use [uv](https://github.com/astral-sh/uv) to manage python dependencies and virtual environments.
|
||||
You can install `uv` by following this [guide](https://docs.astral.sh/uv/getting-started/installation/).
|
||||
|
||||
You can install the dependencies by running:
|
||||
|
||||
```bash
|
||||
cd llama-stack
|
||||
uv sync --group dev
|
||||
uv pip install -e .
|
||||
source .venv/bin/activate
|
||||
```
|
||||
|
||||
```{note}
|
||||
You can use a specific version of Python with `uv` by adding the `--python <version>` flag (e.g. `--python 3.12`).
|
||||
Otherwise, `uv` will automatically select a Python version according to the `requires-python` section of the `pyproject.toml`.
|
||||
For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/).
|
||||
```
|
||||
|
||||
Note that you can create a dotenv file `.env` that includes necessary environment variables:
|
||||
```
|
||||
LLAMA_STACK_BASE_URL=http://localhost:8321
|
||||
LLAMA_STACK_CLIENT_LOG=debug
|
||||
LLAMA_STACK_PORT=8321
|
||||
LLAMA_STACK_CONFIG=<provider-name>
|
||||
TAVILY_SEARCH_API_KEY=
|
||||
BRAVE_SEARCH_API_KEY=
|
||||
```
|
||||
|
||||
And then use this dotenv file when running client SDK tests via the following:
|
||||
```bash
|
||||
uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py --text-model=meta-llama/Llama-3.1-8B-Instruct
|
||||
```
|
||||
|
||||
### Pre-commit Hooks
|
||||
|
||||
We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running:
|
||||
|
||||
```bash
|
||||
uv run pre-commit install
|
||||
```
|
||||
|
||||
After that, pre-commit hooks will run automatically before each commit.
|
||||
|
||||
Alternatively, if you don't want to install the pre-commit hooks, you can run the checks manually by running:
|
||||
|
||||
```bash
|
||||
uv run pre-commit run --all-files
|
||||
```
|
||||
|
||||
```{caution}
|
||||
Before pushing your changes, make sure that the pre-commit hooks have passed successfully.
|
||||
```
|
||||
|
||||
## Discussions -> Issues -> Pull Requests
|
||||
|
||||
We actively welcome your pull requests. However, please read the following. This is heavily inspired by [Ghostty](https://github.com/ghostty-org/ghostty/blob/main/CONTRIBUTING.md).
|
||||
|
||||
If in doubt, please open a [discussion](https://github.com/meta-llama/llama-stack/discussions); we can always convert that to an issue later.
|
||||
|
||||
### Issues
|
||||
We use GitHub issues to track public bugs. Please ensure your description is
|
||||
clear and has sufficient instructions to be able to reproduce the issue.
|
||||
|
||||
Meta has a [bounty program](http://facebook.com/whitehat/info) for the safe
|
||||
disclosure of security bugs. In those cases, please go through the process
|
||||
outlined on that page and do not file a public issue.
|
||||
|
||||
### Contributor License Agreement ("CLA")
|
||||
In order to accept your pull request, we need you to submit a CLA. You only need
|
||||
to do this once to work on any of Meta's open source projects.
|
||||
|
||||
Complete your CLA here: [https://code.facebook.com/cla](https://code.facebook.com/cla)
|
||||
|
||||
**I'd like to contribute!**
|
||||
|
||||
If you are new to the project, start by looking at the issues tagged with "good first issue". If you're interested, leave a comment on the issue and a triager will assign it to you.
|
||||
|
||||
Please avoid picking up too many issues at once. This helps you stay focused and ensures that others in the community also have opportunities to contribute.
|
||||
- Try to work on only 1–2 issues at a time, especially if you’re still getting familiar with the codebase.
|
||||
- Before taking an issue, check if it’s already assigned or being actively discussed.
|
||||
- If you’re blocked or can’t continue with an issue, feel free to unassign yourself or leave a comment so others can step in.
|
||||
|
||||
**I have a bug!**
|
||||
|
||||
1. Search the issue tracker and discussions for similar issues.
|
||||
2. If you don't have steps to reproduce, open a discussion.
|
||||
3. If you have steps to reproduce, open an issue.
|
||||
|
||||
**I have an idea for a feature!**
|
||||
|
||||
1. Open a discussion.
|
||||
|
||||
**I've implemented a feature!**
|
||||
|
||||
1. If there is an issue for the feature, open a pull request.
|
||||
2. If there is no issue, open a discussion and link to your branch.
|
||||
|
||||
**I have a question!**
|
||||
|
||||
1. Open a discussion or use [Discord](https://discord.gg/llama-stack).
|
||||
|
||||
|
||||
**Opening a Pull Request**
|
||||
|
||||
1. Fork the repo and create your branch from `main`.
|
||||
2. If you've changed APIs, update the documentation.
|
||||
3. Ensure the test suite passes.
|
||||
4. Make sure your code lints using `pre-commit`.
|
||||
5. If you haven't already, complete the Contributor License Agreement ("CLA").
|
||||
6. Ensure your pull request follows the [conventional commits format](https://www.conventionalcommits.org/en/v1.0.0/).
|
||||
7. Ensure your pull request follows the [coding style](#coding-style).
|
||||
|
||||
|
||||
Please keep pull requests (PRs) small and focused. If you have a large set of changes, consider splitting them into logically grouped, smaller PRs to facilitate review and testing.
|
||||
|
||||
```{tip}
|
||||
As a general guideline:
|
||||
- Experienced contributors should try to keep no more than 5 open PRs at a time.
|
||||
- New contributors are encouraged to have only one open PR at a time until they’re familiar with the codebase and process.
|
||||
```
|
||||
|
||||
## Repository guidelines
|
||||
|
||||
### Coding Style
|
||||
|
||||
* Comments should provide meaningful insights into the code. Avoid filler comments that simply describe the next step, as they create unnecessary clutter; the same goes for docstrings.
|
||||
* Prefer comments to clarify surprising behavior and/or relationships between parts of the code
|
||||
rather than explain what the next line of code does.
|
||||
* When catching exceptions, prefer using a specific exception type rather than a broad catch-all like `Exception`.
|
||||
* Error messages should be prefixed with "Failed to ..."
|
||||
* 4 spaces for indentation rather than tab
|
||||
* When using `# noqa` to suppress a style or linter warning, include a comment explaining the
|
||||
justification for bypassing the check.
|
||||
* When using `# type: ignore` to suppress a mypy warning, include a comment explaining the
|
||||
justification for bypassing the check.
|
||||
* Don't use unicode characters in the codebase. ASCII-only is preferred for compatibility or
|
||||
readability reasons.
|
||||
* Provider configuration classes should be Pydantic models whose fields use `Field` with a `description` describing each setting (see the sketch after this list). These descriptions will be used to generate the provider documentation.
|
||||
* When possible, use keyword arguments only when calling functions.
|
||||
* Llama Stack utilizes custom Exception classes for certain Resources that should be used where applicable.
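A short sketch of a provider configuration class following the guideline above; the class and field names are illustrative only:

```python
from pydantic import BaseModel, Field


class MyRemoteProviderConfig(BaseModel):
    url: str = Field(description="Base URL of the remote service.")
    api_key: str | None = Field(default=None, description="Optional API key used to authenticate requests.")
    timeout: int = Field(default=30, description="Request timeout in seconds.")
```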
|
||||
|
||||
### License
|
||||
By contributing to Llama, you agree that your contributions will be licensed
|
||||
under the LICENSE file in the root directory of this source tree.
|
||||
|
||||
## Common Tasks
|
||||
|
||||
Some tips about common tasks you work on while contributing to Llama Stack:
|
||||
|
||||
### Using `llama stack build`
|
||||
|
||||
Building a stack image will use the production version of the `llama-stack` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_STACK_CLIENT_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands.
|
||||
|
||||
Example:
|
||||
```bash
|
||||
cd work/
|
||||
git clone https://github.com/meta-llama/llama-stack.git
|
||||
git clone https://github.com/meta-llama/llama-stack-client-python.git
|
||||
cd llama-stack
|
||||
LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama stack build --distro <...>
|
||||
```
|
||||
|
||||
### Updating distribution configurations
|
||||
|
||||
If you have made changes to a provider's configuration in any form (introducing a new config key, or
|
||||
changing models, etc.), you should run `./scripts/distro_codegen.py` to re-generate various YAML
|
||||
files as well as the documentation. You should not change `docs/source/.../distributions/` files
|
||||
manually as they are auto-generated.
|
||||
|
||||
### Updating the provider documentation
|
||||
|
||||
If you have made changes to a provider's configuration, you should run `./scripts/provider_codegen.py`
|
||||
to re-generate the documentation. You should not change `docs/source/.../providers/` files manually
|
||||
as they are auto-generated.
|
||||
Note that the provider "description" field will be used to generate the provider documentation.
|
||||
|
||||
### Building the Documentation
|
||||
|
||||
If you are making changes to the documentation at [https://llamastack.github.io/](https://llamastack.github.io/), you can use the following command to build the documentation and preview your changes.
|
||||
|
||||
```bash
|
||||
# This rebuilds the documentation pages and the OpenAPI spec.
|
||||
npm install
|
||||
npm run gen-api-docs all
|
||||
npm run build
|
||||
|
||||
# This will start a local server (usually at http://127.0.0.1:3000).
|
||||
npm run serve
|
||||
```
|
||||
|
||||
### Update API Documentation
|
||||
|
||||
If you modify or add new API endpoints, update the API documentation accordingly. You can do this by running the following command:
|
||||
|
||||
```bash
|
||||
uv run ./docs/openapi_generator/run_openapi_generator.sh
|
||||
```
|
||||
|
||||
The generated API schema will be available in `docs/static/`. Make sure to review the changes before committing.
|
||||
|
||||
## Adding a New Provider
|
||||
|
||||
See:
|
||||
- [Adding a New API Provider Page](./new_api_provider.mdx) which describes how to add new API providers to the Stack.
|
||||
- [Vector Database Page](./new_vector_database.mdx) which describes how to add a new vector databases with Llama Stack.
|
||||
- [External Provider Page](/docs/providers/external/) which describes how to add external providers to the Stack.
|
||||
|
||||
|
||||
## Testing
|
||||
|
||||
|
||||
See the [Testing README](https://github.com/meta-llama/llama-stack/blob/main/tests/README.md) for detailed testing information.
|
||||
|
||||
## Advanced Topics
|
||||
|
||||
For developers who need deeper understanding of the testing system internals:
|
||||
|
||||
- [Record-Replay Testing](./testing/record-replay.mdx)
|
||||
|
||||
### Benchmarking
|
||||
|
||||
See the [Benchmarking README](https://github.com/meta-llama/llama-stack/blob/main/benchmarking/k8s-benchmark/README.md) for benchmarking information.
|
|
@ -1,12 +1,20 @@
|
|||
# Adding a New API Provider
|
||||
---
|
||||
title: Adding a New API Provider
|
||||
description: Guide for adding new API providers to Llama Stack
|
||||
sidebar_label: New API Provider
|
||||
sidebar_position: 2
|
||||
---
|
||||
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
This guide will walk you through the process of adding a new API provider to Llama Stack.
|
||||
|
||||
|
||||
- Begin by reviewing the [core concepts](../concepts/index.md) of Llama Stack and choose the API your provider belongs to (Inference, Safety, VectorIO, etc.)
|
||||
- Determine the provider type ({repopath}`Remote::llama_stack/providers/remote` or {repopath}`Inline::llama_stack/providers/inline`). Remote providers make requests to external services, while inline providers execute implementation locally.
|
||||
- Add your provider to the appropriate {repopath}`Registry::llama_stack/providers/registry/`. Specify pip dependencies necessary.
|
||||
- Update any distribution {repopath}`Templates::llama_stack/distributions/` `build.yaml` and `run.yaml` files if they should include your provider by default. Run {repopath}`./scripts/distro_codegen.py` if necessary. Note that `distro_codegen.py` will fail if the new provider causes any distribution template to attempt to import provider-specific dependencies. This usually means the distribution's `get_distribution_template()` code path should only import any necessary Config or model alias definitions from each provider and not the provider's actual implementation.
|
||||
- Begin by reviewing the [core concepts](../concepts/) of Llama Stack and choose the API your provider belongs to (Inference, Safety, VectorIO, etc.)
|
||||
- Determine the provider type ([Remote](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/remote) or [Inline](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/inline)). Remote providers make requests to external services, while inline providers execute implementation locally.
|
||||
- Add your provider to the appropriate [Registry](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/registry/). Specify pip dependencies necessary.
|
||||
- Update any distribution [Templates](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/distributions/) `build.yaml` and `run.yaml` files if they should include your provider by default. Run [./scripts/distro_codegen.py](https://github.com/meta-llama/llama-stack/blob/main/scripts/distro_codegen.py) if necessary. Note that `distro_codegen.py` will fail if the new provider causes any distribution template to attempt to import provider-specific dependencies. This usually means the distribution's `get_distribution_template()` code path should only import any necessary Config or model alias definitions from each provider and not the provider's actual implementation.
|
||||
|
||||
|
||||
Here are some example PRs to help you get started:
|
||||
|
@ -63,9 +71,9 @@ Before running tests, you must have required dependencies installed. This depend
|
|||
|
||||
### 1. Integration Testing
|
||||
|
||||
Integration tests are located in {repopath}`tests/integration`. These tests use the python client-SDK APIs (from the `llama_stack_client` package) to test functionality. Since these tests use client APIs, they can be run either by pointing to an instance of the Llama Stack server or "inline" by using `LlamaStackAsLibraryClient`.
|
||||
Integration tests are located in [tests/integration](https://github.com/meta-llama/llama-stack/tree/main/tests/integration). These tests use the python client-SDK APIs (from the `llama_stack_client` package) to test functionality. Since these tests use client APIs, they can be run either by pointing to an instance of the Llama Stack server or "inline" by using `LlamaStackAsLibraryClient`.
|
||||
|
||||
Consult {repopath}`tests/integration/README.md` for more details on how to run the tests.
|
||||
Consult [tests/integration/README.md](https://github.com/meta-llama/llama-stack/blob/main/tests/integration/README.md) for more details on how to run the tests.
|
||||
|
||||
Note that each provider's `sample_run_config()` method (in the configuration class for that provider)
|
||||
typically references some environment variables for specifying API keys and the like. You can set these in the environment or pass these via the `--env` flag to the test command.
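For illustration, here is a minimal sketch of the two modes described above: pointing the client SDK at a running server, or running the stack "inline" with `LlamaStackAsLibraryClient`. The library-client import path and the `initialize()` call are assumptions based on the "Using Llama Stack as a Library" guide and may differ between releases.

```python
# Minimal sketch of the two ways integration tests can reach a stack.
# Assumption: the library-client import path and initialize() call follow the
# "Using Llama Stack as a Library" docs and may differ between releases.
from llama_stack_client import LlamaStackClient

# 1) Point at a running server (e.g. `llama stack run starter` on port 8321).
server_client = LlamaStackClient(base_url="http://localhost:8321")
print([m.identifier for m in server_client.models.list()])

# 2) Run "inline", without a server, via the library client.
from llama_stack.core.library_client import LlamaStackAsLibraryClient  # path may vary by version

inline_client = LlamaStackAsLibraryClient("starter")  # template name or path to a run.yaml
inline_client.initialize()
print([m.identifier for m in inline_client.models.list()])
```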
|
||||
|
@ -73,9 +81,9 @@ Note that each provider's `sample_run_config()` method (in the configuration cla
|
|||
|
||||
### 2. Unit Testing
|
||||
|
||||
Unit tests are located in {repopath}`tests/unit`. Provider-specific unit tests are located in {repopath}`tests/unit/providers`. These tests are all run automatically as part of the CI process.
|
||||
Unit tests are located in [tests/unit](https://github.com/meta-llama/llama-stack/tree/main/tests/unit). Provider-specific unit tests are located in [tests/unit/providers](https://github.com/meta-llama/llama-stack/tree/main/tests/unit/providers). These tests are all run automatically as part of the CI process.
|
||||
|
||||
Consult {repopath}`tests/unit/README.md` for more details on how to run the tests manually.
|
||||
Consult [tests/unit/README.md](https://github.com/meta-llama/llama-stack/blob/main/tests/unit/README.md) for more details on how to run the tests manually.
|
||||
|
||||
### 3. Additional end-to-end testing
|
||||
|
|
@ -1,4 +1,12 @@
|
|||
# Adding a New Vector Database
|
||||
---
|
||||
title: Adding a New Vector Database
|
||||
description: Guide for adding new vector database providers to Llama Stack
|
||||
sidebar_label: New Vector Database
|
||||
sidebar_position: 3
|
||||
---
|
||||
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
This guide will walk you through the process of adding a new vector database to Llama Stack.
|
||||
|
||||
|
@ -31,7 +39,7 @@ filtering, sorting, and aggregating vectors.
|
|||
- `YourVectorIOAdapter.query_chunks()`
|
||||
- `YourVectorIOAdapter.delete_chunks()`
|
||||
3. **Add to Registry**: Register your provider in the appropriate registry file.
|
||||
- Update {repopath}`llama_stack/providers/registry/vector_io.py` to include your new provider.
|
||||
- Update [llama_stack/providers/registry/vector_io.py](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/registry/vector_io.py) to include your new provider.
|
||||
```python
|
||||
from llama_stack.providers.registry.specs import InlineProviderSpec
|
||||
from llama_stack.providers.registry.api import Api
|
||||
|
@ -57,7 +65,7 @@ InlineProviderSpec(
|
|||
5. Add your provider to the `vector_io_providers` fixture dictionary.
|
||||
- Please follow the naming convention of `your_vectorprovider_index` and `your_vectorprovider_adapter` as the tests require this to execute properly.
|
||||
- Integration Tests
|
||||
- Integration tests are located in {repopath}`tests/integration`. These tests use the python client-SDK APIs (from the `llama_stack_client` package) to test functionality.
|
||||
- Integration tests are located in [tests/integration](https://github.com/meta-llama/llama-stack/tree/main/tests/integration). These tests use the python client-SDK APIs (from the `llama_stack_client` package) to test functionality.
|
||||
- The two set of integration tests are:
|
||||
- `tests/integration/vector_io/test_vector_io.py`: This file tests registration, insertion, and retrieval.
|
||||
- `tests/integration/vector_io/test_openai_vector_stores.py`: These tests are for OpenAI-compatible vector stores and test the OpenAI API compatibility.
|
||||
|
@ -71,5 +79,5 @@ InlineProviderSpec(
|
|||
- If you are adding tests for the `remote` provider you will have to update the `test` group, which is used in the GitHub CI for integration tests.
|
||||
- `uv add new_pip_package --group test`
|
||||
5. **Update Documentation**: Please update the documentation for end users
|
||||
- Generate the provider documentation by running {repopath}`./scripts/provider_codegen.py`.
|
||||
- Update the autogenerated content in the registry/vector_io.py file with information about your provider. Please see other providers for examples.
|
||||
- Generate the provider documentation by running [./scripts/provider_codegen.py](https://github.com/meta-llama/llama-stack/blob/main/scripts/provider_codegen.py).
|
||||
- Update the autogenerated content in the registry/vector_io.py file with information about your provider. Please see other providers for examples.
|
|
@ -1,3 +1,13 @@
|
|||
---
|
||||
title: Record-Replay Testing System
|
||||
description: Understanding how Llama Stack captures and replays API interactions for testing
|
||||
sidebar_label: Record-Replay System
|
||||
sidebar_position: 4
|
||||
---
|
||||
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Record-Replay System
|
||||
|
||||
Understanding how Llama Stack captures and replays API interactions for testing.
|
||||
|
@ -228,4 +238,4 @@ Loose hashing (normalizing whitespace, rounding floats) seems convenient but hid
|
|||
- **SQLite** - Fast indexed lookups without loading response bodies
|
||||
- **Hybrid** - Best of both worlds for different use cases
|
||||
|
||||
This system provides reliable, fast testing against real AI APIs while maintaining the ability to debug issues when they arise.
|
||||
This system provides reliable, fast testing against real AI APIs while maintaining the ability to debug issues when they arise.
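To make the "exact matching" point above concrete, a recording key can be derived by hashing a canonicalized request so that identical requests always hit the same stored response. The sketch below is a generic illustration of that idea, not the project's actual implementation; the real field names and normalization rules may differ.

```python
# Illustrative only: derive a deterministic key for a recorded API interaction.
# The actual record-replay code may canonicalize and hash differently.
import hashlib
import json


def request_key(method: str, url: str, body: dict) -> str:
    # Canonical JSON (sorted keys, fixed separators) so byte-identical requests
    # always map to the same recording -- no "loose" normalization that hides drift.
    canonical = json.dumps(
        {"method": method.upper(), "url": url, "body": body},
        sort_keys=True,
        separators=(",", ":"),
    )
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()


key = request_key(
    "POST",
    "http://localhost:8321/v1/openai/v1/chat/completions",
    {"model": "llama3.2:3b", "messages": [{"role": "user", "content": "hi"}]},
)
print(key[:16])  # a short prefix is enough for a filename or SQLite index
```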
|
30
docs/docs/deploying/aws_eks_deployment.mdx
Normal file
|
@ -0,0 +1,30 @@
|
|||
---
|
||||
title: AWS EKS Deployment Guide
|
||||
description: Deploy Llama Stack on AWS EKS
|
||||
sidebar_label: AWS EKS Deployment
|
||||
sidebar_position: 3
|
||||
---
|
||||
|
||||
## AWS EKS Deployment
|
||||
|
||||
### Prerequisites
|
||||
|
||||
- Set up an [EKS cluster](https://docs.aws.amazon.com/eks/latest/userguide/getting-started.html)
|
||||
- Create a [GitHub OAuth app](https://docs.github.com/en/apps/oauth-apps/building-oauth-apps/creating-an-oauth-app)
|
||||
- Set authorization callback URL to `http://<your-llama-stack-ui-url>/api/auth/callback/`
|
||||
|
||||
### Automated Deployment
|
||||
|
||||
```bash
|
||||
export HF_TOKEN=<your-huggingface-token>
|
||||
export GITHUB_CLIENT_ID=<your-github-client-id>
|
||||
export GITHUB_CLIENT_SECRET=<your-github-client-secret>
|
||||
export LLAMA_STACK_UI_URL=<your-llama-stack-ui-url>
|
||||
|
||||
cd docs/source/distributions/eks
|
||||
./apply.sh
|
||||
```
|
||||
|
||||
This script will:
|
||||
- Set up default storage class for AWS EKS
|
||||
- Deploy Llama Stack server in Kubernetes pods and services
|
14
docs/docs/deploying/index.mdx
Normal file
|
@ -0,0 +1,14 @@
|
|||
---
|
||||
title: Deploying Llama Stack
|
||||
description: Production deployment guides for Llama Stack in various environments
|
||||
sidebar_label: Overview
|
||||
sidebar_position: 1
|
||||
---
|
||||
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Deploying Llama Stack
|
||||
|
||||
[**→ Kubernetes Deployment Guide**](./kubernetes_deployment.mdx)
|
||||
[**→ AWS EKS Deployment Guide**](./aws_eks_deployment.mdx)
|
|
@ -1,27 +1,39 @@
|
|||
## Kubernetes Deployment Guide
|
||||
---
|
||||
title: Kubernetes Deployment Guide
|
||||
description: Deploy Llama Stack on Kubernetes clusters with vLLM inference service
|
||||
sidebar_label: Kubernetes
|
||||
sidebar_position: 2
|
||||
---
|
||||
|
||||
Instead of starting the Llama Stack and vLLM servers locally, we can deploy them in a Kubernetes cluster.
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
### Prerequisites
|
||||
In this guide, we'll use a local [Kind](https://kind.sigs.k8s.io/) cluster and a vLLM inference service in the same cluster for demonstration purposes.
|
||||
# Kubernetes Deployment Guide
|
||||
|
||||
Note: You can also deploy the Llama Stack server in an AWS EKS cluster. See [Deploying Llama Stack Server in AWS EKS](#deploying-llama-stack-server-in-aws-eks) for more details.
|
||||
Deploy Llama Stack and vLLM servers in a Kubernetes cluster instead of running them locally. This guide covers both local development with Kind and production deployment on AWS EKS.
|
||||
|
||||
First, create a local Kubernetes cluster via Kind:
|
||||
## Prerequisites
|
||||
|
||||
```
|
||||
### Local Kubernetes Setup
|
||||
|
||||
Create a local Kubernetes cluster via Kind:
|
||||
|
||||
```bash
|
||||
kind create cluster --image kindest/node:v1.32.0 --name llama-stack-test
|
||||
```
|
||||
|
||||
First, set your Hugging Face token as an environment variable.
|
||||
```
|
||||
Set your Hugging Face token:
|
||||
|
||||
```bash
|
||||
export HF_TOKEN=$(echo -n "your-hf-token" | base64)
|
||||
```
|
||||
|
||||
Now create a Kubernetes PVC and Secret for downloading and storing Hugging Face model:
|
||||
## Quick Deployment
|
||||
|
||||
```
|
||||
cat <<EOF |kubectl apply -f -
|
||||
### Step 1: Create Storage and Secrets
|
||||
|
||||
```yaml
|
||||
cat <<EOF | kubectl apply -f -
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
|
@ -44,11 +56,10 @@ data:
|
|||
EOF
|
||||
```
|
||||
|
||||
### Step 2: Deploy vLLM Server
|
||||
|
||||
Next, start the vLLM server as a Kubernetes Deployment and Service:
|
||||
|
||||
```
|
||||
cat <<EOF |kubectl apply -f -
|
||||
```yaml
|
||||
cat <<EOF | kubectl apply -f -
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
|
@ -67,9 +78,7 @@ spec:
|
|||
- name: vllm
|
||||
image: vllm/vllm-openai:latest
|
||||
command: ["/bin/sh", "-c"]
|
||||
args: [
|
||||
"vllm serve meta-llama/Llama-3.2-1B-Instruct"
|
||||
]
|
||||
args: ["vllm serve meta-llama/Llama-3.2-1B-Instruct"]
|
||||
env:
|
||||
- name: HUGGING_FACE_HUB_TOKEN
|
||||
valueFrom:
|
||||
|
@ -101,18 +110,9 @@ spec:
|
|||
EOF
|
||||
```
|
||||
|
||||
We can verify that the vLLM server has started successfully via the logs (this might take a couple of minutes to download the model):
|
||||
### Step 3: Configure Llama Stack
|
||||
|
||||
```
|
||||
$ kubectl logs -l app.kubernetes.io/name=vllm
|
||||
...
|
||||
INFO: Started server process [1]
|
||||
INFO: Waiting for application startup.
|
||||
INFO: Application startup complete.
|
||||
INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
|
||||
```
|
||||
|
||||
Then we can modify the Llama Stack run configuration YAML with the following inference provider:
|
||||
Update your run configuration:
|
||||
|
||||
```yaml
|
||||
providers:
|
||||
|
@ -125,26 +125,22 @@ providers:
|
|||
api_token: fake
|
||||
```
|
||||
|
||||
Once we have defined the run configuration for Llama Stack, we can build an image with that configuration and the server source code:
|
||||
Build container image:
|
||||
|
||||
```
|
||||
```bash
|
||||
tmp_dir=$(mktemp -d) && cat >$tmp_dir/Containerfile.llama-stack-run-k8s <<EOF
|
||||
FROM distribution-myenv:dev
|
||||
|
||||
RUN apt-get update && apt-get install -y git
|
||||
RUN git clone https://github.com/meta-llama/llama-stack.git /app/llama-stack-source
|
||||
|
||||
ADD ./vllm-llama-stack-run-k8s.yaml /app/config.yaml
|
||||
EOF
|
||||
podman build -f $tmp_dir/Containerfile.llama-stack-run-k8s -t llama-stack-run-k8s $tmp_dir
|
||||
```
|
||||
|
||||
### Deploying Llama Stack Server in Kubernetes
|
||||
### Step 4: Deploy Llama Stack Server
|
||||
|
||||
We can then start the Llama Stack server by deploying a Kubernetes Pod and Service:
|
||||
|
||||
```
|
||||
cat <<EOF |kubectl apply -f -
|
||||
```yaml
|
||||
cat <<EOF | kubectl apply -f -
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
|
@ -200,48 +196,29 @@ spec:
|
|||
EOF
|
||||
```
|
||||
|
||||
### Verifying the Deployment
|
||||
We can check that the LlamaStack server has started:
|
||||
### Step 5: Test Deployment
|
||||
|
||||
```
|
||||
$ kubectl logs -l app.kubernetes.io/name=llama-stack
|
||||
...
|
||||
INFO: Started server process [1]
|
||||
INFO: Waiting for application startup.
|
||||
INFO: ASGI 'lifespan' protocol appears unsupported.
|
||||
INFO: Application startup complete.
|
||||
INFO: Uvicorn running on http://['::', '0.0.0.0']:5000 (Press CTRL+C to quit)
|
||||
```
|
||||
|
||||
Finally, we forward the Kubernetes service to a local port and test some inference requests against it via the Llama Stack Client:
|
||||
|
||||
```
|
||||
```bash
|
||||
# Port forward and test
|
||||
kubectl port-forward service/llama-stack-service 5000:5000
|
||||
llama-stack-client --endpoint http://localhost:5000 inference chat-completion --message "hello, what model are you?"
|
||||
```
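The same check can be made from Python once the port-forward is in place. The sketch below uses the client SDK's OpenAI-compatible chat completions surface, mirroring the CLI command above; treat the exact method names as an assumption to verify against your installed `llama_stack_client` version.

```python
# Sketch: exercise the port-forwarded Llama Stack service from Python.
# Assumes the OpenAI-compatible chat completions surface of llama_stack_client;
# verify the method names against your installed SDK version.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5000")

# Pick any served LLM and send the same "hello" prompt as the CLI example.
model_id = next(m for m in client.models.list() if m.model_type == "llm").identifier
response = client.chat.completions.create(
    model=model_id,
    messages=[{"role": "user", "content": "hello, what model are you?"}],
)
print(response.choices[0].message.content)
```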
|
||||
|
||||
## Deploying Llama Stack Server in AWS EKS
|
||||
## Troubleshooting
|
||||
|
||||
We've also provided a script to deploy the Llama Stack server in an AWS EKS cluster.
|
||||
|
||||
Prerequisites:
|
||||
- Set up an [EKS cluster](https://docs.aws.amazon.com/eks/latest/userguide/getting-started.html).
|
||||
- Create a [GitHub OAuth app](https://docs.github.com/en/apps/oauth-apps/building-oauth-apps/creating-an-oauth-app) and get the client ID and client secret.
|
||||
- Set the `Authorization callback URL` to `http://<your-llama-stack-ui-url>/api/auth/callback/`
|
||||
|
||||
|
||||
Run the following script to deploy the Llama Stack server:
|
||||
```
|
||||
export HF_TOKEN=<your-huggingface-token>
|
||||
export GITHUB_CLIENT_ID=<your-github-client-id>
|
||||
export GITHUB_CLIENT_SECRET=<your-github-client-secret>
|
||||
export LLAMA_STACK_UI_URL=<your-llama-stack-ui-url>
|
||||
|
||||
cd docs/source/distributions/eks
|
||||
./apply.sh
|
||||
**Check pod status:**
|
||||
```bash
|
||||
kubectl get pods -l app.kubernetes.io/name=vllm
|
||||
kubectl logs -l app.kubernetes.io/name=vllm
|
||||
```
|
||||
|
||||
This script will:
|
||||
**Test service connectivity:**
|
||||
```bash
|
||||
kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- curl http://vllm-server:8000/v1/models
|
||||
```
|
||||
|
||||
- Set up a default storage class for AWS EKS
|
||||
- Deploy the Llama Stack server in a Kubernetes Pod and Service
|
||||
## Related Resources
|
||||
|
||||
- **[Deployment Overview](/docs/deploying/)** - Overview of deployment options
|
||||
- **[Distributions](/docs/distributions)** - Understanding Llama Stack distributions
|
||||
- **[Configuration](/docs/distributions/configuration)** - Detailed configuration options
|
|
@ -1,5 +1,9 @@
|
|||
# Build your own Distribution
|
||||
|
||||
---
|
||||
title: Building Custom Distributions
|
||||
description: Building a Llama Stack distribution from scratch
|
||||
sidebar_label: Build your own Distribution
|
||||
sidebar_position: 3
|
||||
---
|
||||
|
||||
This guide will walk you through the steps to get started with building a Llama Stack distribution from scratch with your choice of API providers.
|
||||
|
||||
|
@ -82,8 +86,11 @@ options:
|
|||
|
||||
After this step is complete, a file named `<name>-build.yaml` and template file `<name>-run.yaml` will be generated and saved at the output file path specified at the end of the command.
|
||||
|
||||
::::{tab-set}
|
||||
:::{tab-item} Building from a template
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="template" label="Building from a template">
|
||||
We provide distribution templates so you can get started building a distribution backed by different API providers.
|
||||
|
||||
The following command will allow you to see the available templates and their corresponding providers.
|
||||
|
@ -156,8 +163,8 @@ You can now edit ~/.llama/distributions/llamastack-starter/starter-run.yaml and
|
|||
```{tip}
|
||||
The generated `run.yaml` file is a starting point for your configuration. For comprehensive guidance on customizing it for your specific needs, infrastructure, and deployment scenarios, see [Customizing Your run.yaml Configuration](customizing_run_yaml.md).
|
||||
```
|
||||
:::
|
||||
:::{tab-item} Building from Scratch
|
||||
</TabItem>
|
||||
<TabItem value="scratch" label="Building from Scratch">
|
||||
|
||||
If the provided templates do not fit your use case, you can start by running `llama stack build`, which launches an interactive wizard that prompts you for the build configuration.
|
||||
|
||||
|
@ -186,9 +193,8 @@ Tip: use <TAB> to see options for the providers.
|
|||
|
||||
You can now edit ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml`
|
||||
```
|
||||
:::
|
||||
|
||||
:::{tab-item} Building from a pre-existing build config file
|
||||
</TabItem>
|
||||
<TabItem value="config" label="Building from a pre-existing build config file">
|
||||
- In addition to templates, you may customize the build to your liking by editing config files and building from them with the following command.
|
||||
|
||||
- The config file has the same format as the ones in `llama_stack/distributions/*build.yaml`.
|
||||
|
@ -196,9 +202,8 @@ You can now edit ~/.llama/distributions/llamastack-my-local-stack/my-local-stack
|
|||
```
|
||||
llama stack build --config llama_stack/distributions/starter/build.yaml
|
||||
```
|
||||
:::
|
||||
|
||||
:::{tab-item} Building with External Providers
|
||||
</TabItem>
|
||||
<TabItem value="external" label="Building with External Providers">
|
||||
|
||||
Llama Stack supports external providers that live outside of the main codebase. This allows you to create and maintain your own providers independently or use community-provided providers.
|
||||
|
||||
|
@ -246,16 +251,13 @@ directory or a git repository (git must be installed on the build environment).
|
|||
llama stack build --config my-external-stack.yaml
|
||||
```
|
||||
|
||||
For more information on external providers, including directory structure, provider types, and implementation requirements, see the [External Providers documentation](../providers/external.md).
|
||||
:::
|
||||
|
||||
:::{tab-item} Building Container
|
||||
|
||||
```{admonition} Podman Alternative
|
||||
:class: tip
|
||||
For more information on external providers, including directory structure, provider types, and implementation requirements, see the [External Providers documentation](../providers/external/).
|
||||
</TabItem>
|
||||
<TabItem value="container" label="Building Container">
|
||||
|
||||
:::tip Podman Alternative
|
||||
Podman is supported as an alternative to Docker. Set `CONTAINER_BINARY` to `podman` in your environment to use Podman.
|
||||
```
|
||||
:::
|
||||
|
||||
To build a container image, you may start off from a template and use the `--image-type container` flag to specify `container` as the build image type.
|
||||
|
||||
|
@ -274,7 +276,8 @@ You can now edit ~/meta-llama/llama-stack/tmp/configs/ollama-run.yaml and run `l
|
|||
```
|
||||
|
||||
Now set some environment variables for the inference model ID and the Llama Stack port, and create a local directory to mount into the container's file system.
|
||||
```
|
||||
|
||||
```bash
|
||||
export INFERENCE_MODEL="llama3.2:3b"
|
||||
export LLAMA_STACK_PORT=8321
|
||||
mkdir -p ~/.llama
|
||||
|
@ -308,9 +311,8 @@ Here are the docker flags and their uses:
|
|||
|
||||
* `--env OLLAMA_URL=http://host.docker.internal:11434`: Configures the URL for the Ollama service
|
||||
|
||||
:::
|
||||
|
||||
::::
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
### Running your Stack server
|
|
@ -1,3 +1,9 @@
|
|||
---
|
||||
title: Configuring a "Stack"
|
||||
description: Configuring a "Stack"
|
||||
sidebar_label: Configuring a "Stack"
|
||||
sidebar_position: 6
|
||||
---
|
||||
# Configuring a "Stack"
|
||||
|
||||
The Llama Stack runtime configuration is specified as a YAML file. Here is a simplified version of an example configuration file for the Ollama distribution:
|
||||
|
@ -200,7 +206,7 @@ models:
|
|||
provider_model_id: null
|
||||
model_type: llm
|
||||
```
|
||||
A Model is an instance of a "Resource" (see [Concepts](../concepts/index)) and is associated with a specific inference provider (in this case, the provider with identifier `ollama`). This is an instance of a "pre-registered" model. While we always encourage clients to register models before using them, some Stack servers may come up with a list of "already known and available" models.
|
||||
A Model is an instance of a "Resource" (see [Concepts](../concepts/)) and is associated with a specific inference provider (in this case, the provider with identifier `ollama`). This is an instance of a "pre-registered" model. While we always encourage clients to register models before using them, some Stack servers may come up with a list of "already known and available" models.
|
||||
|
||||
What's with the `provider_model_id` field? This is an identifier for the model inside the provider's model catalog. Contrast it with `model_id` which is the identifier for the same model for Llama Stack's purposes. For example, you may want to name "llama3.2:vision-11b" as "image_captioning_model" when you use it in your Stack interactions. When omitted, the server will set `provider_model_id` to be the same as `model_id`.
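For example, the renaming described above might be registered like the sketch below. The keyword arguments are assumed to follow the client's `models.register` API; check them against your installed SDK version.

```python
# Sketch: register a provider model under a friendlier Llama Stack model_id.
# Assumption: keyword names follow models.register; verify against your SDK version.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

client.models.register(
    model_id="image_captioning_model",        # name used in your Stack interactions
    provider_model_id="llama3.2:vision-11b",  # name in the provider's own catalog
    provider_id="ollama",
    model_type="llm",
)
```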
|
||||
|
||||
|
@ -472,12 +478,12 @@ A rule may also specify a condition, either a 'when' or an 'unless',
|
|||
with additional constraints as to where the rule applies. The
|
||||
constraints supported at present are:
|
||||
|
||||
- 'user with <attr-value> in <attr-name>'
|
||||
- 'user with <attr-value> not in <attr-name>'
|
||||
- 'user with `<attr-value>` in `<attr-name>`'
|
||||
- 'user with `<attr-value>` not in `<attr-name>`'
|
||||
- 'user is owner'
|
||||
- 'user is not owner'
|
||||
- 'user in owners <attr-name>'
|
||||
- 'user not in owners <attr-name>'
|
||||
- 'user in owners `<attr-name>`'
|
||||
- 'user not in owners `<attr-name>`'
|
||||
|
||||
The attributes defined for a user will depend on how the auth
|
||||
configuration is defined.
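To make the condition semantics above concrete, here is an illustrative sketch of how a 'user with `<attr-value>` in `<attr-name>`' check could be evaluated against a user's attributes. It only mirrors the idea; it is not the server's actual access-control implementation.

```python
# Illustrative only: semantics of "user with <attr-value> in <attr-name>".
# The real access-control engine lives in the server; this just mirrors the idea.
def user_with_value_in_attr(
    user_attributes: dict[str, list[str]], attr_name: str, attr_value: str
) -> bool:
    return attr_value in user_attributes.get(attr_name, [])


user = {"roles": ["admin", "ml-engineer"], "teams": ["platform"]}

# "user with admin in roles"     -> True
# "user with guest not in roles" -> True (negation of the membership check)
print(user_with_value_in_attr(user, "roles", "admin"))
print(not user_with_value_in_attr(user, "roles", "guest"))
```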
|
|
@ -1,3 +1,9 @@
|
|||
---
|
||||
title: Customizing run.yaml
|
||||
description: Customizing run.yaml files for Llama Stack templates
|
||||
sidebar_label: Customizing run.yaml
|
||||
sidebar_position: 4
|
||||
---
|
||||
# Customizing run.yaml Files
|
||||
|
||||
The `run.yaml` files generated by Llama Stack templates are **starting points** designed to be customized for your specific needs. They are not meant to be used as-is in production environments.
|
||||
|
@ -37,4 +43,4 @@ your-project/
|
|||
└── README.md
|
||||
```
|
||||
|
||||
The goal is to take the generated template and adapt it to your specific infrastructure and operational needs.
|
||||
The goal is to take the generated template and adapt it to your specific infrastructure and operational needs.
|
|
@ -1,3 +1,9 @@
|
|||
---
|
||||
title: Using Llama Stack as a Library
|
||||
description: How to use Llama Stack as a Python library instead of running a server
|
||||
sidebar_label: Importing as Library
|
||||
sidebar_position: 5
|
||||
---
|
||||
# Using Llama Stack as a Library
|
||||
|
||||
## Setup Llama Stack without a Server
|
||||
|
@ -27,7 +33,7 @@ Then, you can access the APIs like `models` and `inference` on the client and ca
|
|||
response = client.models.list()
|
||||
```
|
||||
|
||||
If you've created a [custom distribution](building_distro.md), you can also use the run.yaml configuration file directly:
|
||||
If you've created a [custom distribution](./building_distro), you can also use the run.yaml configuration file directly:
|
||||
|
||||
```python
|
||||
client = LlamaStackAsLibraryClient(config_path)
|
21
docs/docs/distributions/index.mdx
Normal file
|
@ -0,0 +1,21 @@
|
|||
---
|
||||
title: Distributions Overview
|
||||
description: Pre-packaged sets of Llama Stack components for different deployment scenarios
|
||||
sidebar_label: Overview
|
||||
sidebar_position: 1
|
||||
---
|
||||
|
||||
# Distributions Overview
|
||||
|
||||
A distribution is a pre-packaged set of Llama Stack components that can be deployed together.
|
||||
|
||||
This section provides an overview of the distributions available in Llama Stack.
|
||||
|
||||
## Distribution Guides
|
||||
|
||||
- **[Available Distributions](./list_of_distributions.mdx)** - Complete list and comparison of all distributions
|
||||
- **[Building Custom Distributions](./building_distro.mdx)** - Create your own distribution from scratch
|
||||
- **[Customizing Configuration](./customizing_run_yaml.mdx)** - Customize run.yaml for your needs
|
||||
- **[Starting Llama Stack Server](./starting_llama_stack_server.mdx)** - How to run distributions
|
||||
- **[Importing as Library](./importing_as_library.mdx)** - Use distributions in your code
|
||||
- **[Configuration Reference](./configuration.mdx)** - Configuration file format details
|
|
@ -1,3 +1,10 @@
|
|||
---
|
||||
title: Available Distributions
|
||||
description: List of available distributions for Llama Stack
|
||||
sidebar_label: Available Distributions
|
||||
sidebar_position: 2
|
||||
---
|
||||
|
||||
# Available Distributions
|
||||
|
||||
Llama Stack provides several pre-configured distributions to help you get started quickly. Choose the distribution that best fits your hardware and use case.
|
||||
|
@ -55,7 +62,7 @@ docker pull llama-stack/distribution-meta-reference-gpu
|
|||
|
||||
**Partners:** [Fireworks.ai](https://fireworks.ai) and [Together.xyz](https://together.xyz)
|
||||
|
||||
**Guides:** [Remote-Hosted Endpoints](remote_hosted_distro/index)
|
||||
**Guides:** [Remote-Hosted Endpoints](./remote_hosted_distro/)
|
||||
|
||||
### 📱 Mobile Development
|
||||
|
||||
|
@ -74,7 +81,7 @@ docker pull llama-stack/distribution-meta-reference-gpu
|
|||
- You need custom configurations
|
||||
- You want to optimize for your specific use case
|
||||
|
||||
**Guides:** [Building Custom Distributions](building_distro.md)
|
||||
**Guides:** [Building Custom Distributions](./building_distro)
|
||||
|
||||
## Detailed Documentation
|
||||
|
||||
|
@ -124,4 +131,4 @@ graph TD
|
|||
3. **Configure your providers** with API keys or local models
|
||||
4. **Start building** with Llama Stack!
|
||||
|
||||
For help choosing or troubleshooting, check our [Getting Started Guide](../getting_started/index.md) or [Community Support](https://github.com/llama-stack/llama-stack/discussions).
|
||||
For help choosing or troubleshooting, check our [Getting Started Guide](/docs/getting_started/quickstart) or [Community Support](https://github.com/llama-stack/llama-stack/discussions).
|
|
@ -66,7 +66,7 @@ llama stack run starter --port 5050
|
|||
|
||||
Ensure the Llama Stack server version is the same as the Kotlin SDK Library for maximum compatibility.
|
||||
|
||||
Other inference providers: [Table](../../index.md#supported-llama-stack-implementations)
|
||||
Other inference providers: [Table](/docs/)
|
||||
|
||||
How to set remote localhost in Demo App: [Settings](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release/examples/android_app#settings)
|
||||
|
|
@ -11,6 +11,7 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
|
|||
| agents | `inline::meta-reference` |
|
||||
| datasetio | `inline::localfs`, `remote::nvidia` |
|
||||
| eval | `remote::nvidia` |
|
||||
| files | `inline::localfs` |
|
||||
| inference | `remote::nvidia` |
|
||||
| post_training | `remote::nvidia` |
|
||||
| safety | `remote::nvidia` |
|
||||
|
@ -36,25 +37,6 @@ The following environment variables can be configured:
|
|||
- `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`)
|
||||
- `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`)
|
||||
|
||||
### Models
|
||||
|
||||
The following models are available by default:
|
||||
|
||||
- `meta/llama3-8b-instruct `
|
||||
- `meta/llama3-70b-instruct `
|
||||
- `meta/llama-3.1-8b-instruct `
|
||||
- `meta/llama-3.1-70b-instruct `
|
||||
- `meta/llama-3.1-405b-instruct `
|
||||
- `meta/llama-3.2-1b-instruct `
|
||||
- `meta/llama-3.2-3b-instruct `
|
||||
- `meta/llama-3.2-11b-vision-instruct `
|
||||
- `meta/llama-3.2-90b-vision-instruct `
|
||||
- `meta/llama-3.3-70b-instruct `
|
||||
- `nvidia/vila `
|
||||
- `nvidia/llama-3.2-nv-embedqa-1b-v2 `
|
||||
- `nvidia/nv-embedqa-e5-v5 `
|
||||
- `nvidia/nv-embedqa-mistral-7b-v2 `
|
||||
- `snowflake/arctic-embed-l `
|
||||
|
||||
|
||||
## Prerequisites
|
||||
|
@ -78,22 +60,22 @@ The deployed platform includes the NIM Proxy microservice, which is the service
|
|||
### Datasetio API: NeMo Data Store
|
||||
The NeMo Data Store microservice serves as the default file storage solution for the NeMo microservices platform. It exposes APIs compatible with the Hugging Face Hub client (`HfApi`), so you can use the client to interact with the Data Store. The `NVIDIA_DATASETS_URL` environment variable should point to your NeMo Data Store endpoint.
|
||||
|
||||
See the {repopath}`NVIDIA Datasetio docs::llama_stack/providers/remote/datasetio/nvidia/README.md` for supported features and example usage.
|
||||
See the [NVIDIA Datasetio docs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/datasetio/nvidia/README.md) for supported features and example usage.
|
||||
|
||||
### Eval API: NeMo Evaluator
|
||||
The NeMo Evaluator microservice supports evaluation of LLMs. Launching an Evaluation job with NeMo Evaluator requires an Evaluation Config (an object that contains metadata needed by the job). A Llama Stack Benchmark maps to an Evaluation Config, so registering a Benchmark creates an Evaluation Config in NeMo Evaluator. The `NVIDIA_EVALUATOR_URL` environment variable should point to your NeMo Microservices endpoint.
|
||||
|
||||
See the {repopath}`NVIDIA Eval docs::llama_stack/providers/remote/eval/nvidia/README.md` for supported features and example usage.
|
||||
See the [NVIDIA Eval docs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/eval/nvidia/README.md) for supported features and example usage.
|
||||
|
||||
### Post-Training API: NeMo Customizer
|
||||
The NeMo Customizer microservice supports fine-tuning models. You can reference {repopath}`this list of supported models::llama_stack/providers/remote/post_training/nvidia/models.py` that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint.
|
||||
The NeMo Customizer microservice supports fine-tuning models. You can reference [this list of supported models](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/post_training/nvidia/models.py) that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint.
|
||||
|
||||
See the {repopath}`NVIDIA Post-Training docs::llama_stack/providers/remote/post_training/nvidia/README.md` for supported features and example usage.
|
||||
See the [NVIDIA Post-Training docs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/post_training/nvidia/README.md) for supported features and example usage.
|
||||
|
||||
### Safety API: NeMo Guardrails
|
||||
The NeMo Guardrails microservice sits between your application and the LLM, and adds checks and content moderation to a model. The `GUARDRAILS_SERVICE_URL` environment variable should point to your NeMo Microservices endpoint.
|
||||
|
||||
See the {repopath}`NVIDIA Safety docs::llama_stack/providers/remote/safety/nvidia/README.md` for supported features and example usage.
|
||||
See the [NVIDIA Safety docs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/safety/nvidia/README.md) for supported features and example usage.
|
||||
|
||||
## Deploying models
|
||||
In order to use a registered model with the Llama Stack APIs, ensure the corresponding NIM is deployed to your environment. For example, you can use the NIM Proxy microservice to deploy `meta/llama-3.2-1b-instruct`.
|
||||
|
@ -167,4 +149,4 @@ llama stack run ./run.yaml \
|
|||
```
|
||||
|
||||
## Example Notebooks
|
||||
For examples of how to use the NVIDIA Distribution to run inference, fine-tune, evaluate, and run safety checks on your LLMs, you can reference the example notebooks in {repopath}`docs/notebooks/nvidia`.
|
||||
For examples of how to use the NVIDIA Distribution to run inference, fine-tune, evaluate, and run safety checks on your LLMs, you can reference the example notebooks in [docs/notebooks/nvidia](https://github.com/meta-llama/llama-stack/tree/main/docs/notebooks/nvidia).
|
|
@ -36,25 +36,25 @@ The starter distribution includes a comprehensive set of inference providers:
|
|||
|
||||
### Hosted Providers
|
||||
- **[OpenAI](https://openai.com/api/)**: GPT-4, GPT-3.5, O1, O3, O4 models and text embeddings -
|
||||
provider ID: `openai` - reference documentation: [openai](../../providers/inference/remote_openai.md)
|
||||
provider ID: `openai` - reference documentation: [openai](../../providers/inference/remote_openai)
|
||||
- **[Fireworks](https://fireworks.ai/)**: Llama 3.1, 3.2, 3.3, 4 Scout, 4 Maverick models and
|
||||
embeddings - provider ID: `fireworks` - reference documentation: [fireworks](../../providers/inference/remote_fireworks.md)
|
||||
embeddings - provider ID: `fireworks` - reference documentation: [fireworks](../../providers/inference/remote_fireworks)
|
||||
- **[Together](https://together.ai/)**: Llama 3.1, 3.2, 3.3, 4 Scout, 4 Maverick models and
|
||||
embeddings - provider ID: `together` - reference documentation: [together](../../providers/inference/remote_together.md)
|
||||
- **[Anthropic](https://www.anthropic.com/)**: Claude 3.5 Sonnet, Claude 3.7 Sonnet, Claude 3.5 Haiku, and Voyage embeddings - provider ID: `anthropic` - reference documentation: [anthropic](../../providers/inference/remote_anthropic.md)
|
||||
- **[Gemini](https://gemini.google.com/)**: Gemini 1.5, 2.0, 2.5 models and text embeddings - provider ID: `gemini` - reference documentation: [gemini](../../providers/inference/remote_gemini.md)
|
||||
- **[Groq](https://groq.com/)**: Fast Llama models (3.1, 3.2, 3.3, 4 Scout, 4 Maverick) - provider ID: `groq` - reference documentation: [groq](../../providers/inference/remote_groq.md)
|
||||
- **[SambaNova](https://www.sambanova.ai/)**: Llama 3.1, 3.2, 3.3, 4 Scout, 4 Maverick models - provider ID: `sambanova` - reference documentation: [sambanova](../../providers/inference/remote_sambanova.md)
|
||||
- **[Cerebras](https://www.cerebras.ai/)**: Cerebras AI models - provider ID: `cerebras` - reference documentation: [cerebras](../../providers/inference/remote_cerebras.md)
|
||||
- **[NVIDIA](https://www.nvidia.com/)**: NVIDIA NIM - provider ID: `nvidia` - reference documentation: [nvidia](../../providers/inference/remote_nvidia.md)
|
||||
- **[HuggingFace](https://huggingface.co/)**: Serverless and endpoint models - provider ID: `hf::serverless` and `hf::endpoint` - reference documentation: [huggingface-serverless](../../providers/inference/remote_hf_serverless.md) and [huggingface-endpoint](../../providers/inference/remote_hf_endpoint.md)
|
||||
- **[Bedrock](https://aws.amazon.com/bedrock/)**: AWS Bedrock models - provider ID: `bedrock` - reference documentation: [bedrock](../../providers/inference/remote_bedrock.md)
|
||||
embeddings - provider ID: `together` - reference documentation: [together](../../providers/inference/remote_together)
|
||||
- **[Anthropic](https://www.anthropic.com/)**: Claude 3.5 Sonnet, Claude 3.7 Sonnet, Claude 3.5 Haiku, and Voyage embeddings - provider ID: `anthropic` - reference documentation: [anthropic](../../providers/inference/remote_anthropic)
|
||||
- **[Gemini](https://gemini.google.com/)**: Gemini 1.5, 2.0, 2.5 models and text embeddings - provider ID: `gemini` - reference documentation: [gemini](../../providers/inference/remote_gemini)
|
||||
- **[Groq](https://groq.com/)**: Fast Llama models (3.1, 3.2, 3.3, 4 Scout, 4 Maverick) - provider ID: `groq` - reference documentation: [groq](../../providers/inference/remote_groq)
|
||||
- **[SambaNova](https://www.sambanova.ai/)**: Llama 3.1, 3.2, 3.3, 4 Scout, 4 Maverick models - provider ID: `sambanova` - reference documentation: [sambanova](../../providers/inference/remote_sambanova)
|
||||
- **[Cerebras](https://www.cerebras.ai/)**: Cerebras AI models - provider ID: `cerebras` - reference documentation: [cerebras](../../providers/inference/remote_cerebras)
|
||||
- **[NVIDIA](https://www.nvidia.com/)**: NVIDIA NIM - provider ID: `nvidia` - reference documentation: [nvidia](../../providers/inference/remote_nvidia)
|
||||
- **[HuggingFace](https://huggingface.co/)**: Serverless and endpoint models - provider ID: `hf::serverless` and `hf::endpoint` - reference documentation: [huggingface-serverless](../../providers/inference/remote_hf_serverless) and [huggingface-endpoint](../../providers/inference/remote_hf_endpoint)
|
||||
- **[Bedrock](https://aws.amazon.com/bedrock/)**: AWS Bedrock models - provider ID: `bedrock` - reference documentation: [bedrock](../../providers/inference/remote_bedrock)
|
||||
|
||||
### Local/Remote Providers
|
||||
- **[Ollama](https://ollama.ai/)**: Local Ollama models - provider ID: `ollama` - reference documentation: [ollama](../../providers/inference/remote_ollama.md)
|
||||
- **[vLLM](https://docs.vllm.ai/en/latest/)**: Local or remote vLLM server - provider ID: `vllm` - reference documentation: [vllm](../../providers/inference/remote_vllm.md)
|
||||
- **[TGI](https://github.com/huggingface/text-generation-inference)**: Text Generation Inference server - Dell Enterprise Hub's custom TGI container too (use `DEH_URL`) - provider ID: `tgi` - reference documentation: [tgi](../../providers/inference/remote_tgi.md)
|
||||
- **[Sentence Transformers](https://www.sbert.net/)**: Local embedding models - provider ID: `sentence-transformers` - reference documentation: [sentence-transformers](../../providers/inference/inline_sentence-transformers.md)
|
||||
- **[Ollama](https://ollama.ai/)**: Local Ollama models - provider ID: `ollama` - reference documentation: [ollama](../../providers/inference/remote_ollama)
|
||||
- **[vLLM](https://docs.vllm.ai/en/latest/)**: Local or remote vLLM server - provider ID: `vllm` - reference documentation: [vllm](../../providers/inference/remote_vllm)
|
||||
- **[TGI](https://github.com/huggingface/text-generation-inference)**: Text Generation Inference server - Dell Enterprise Hub's custom TGI container too (use `DEH_URL`) - provider ID: `tgi` - reference documentation: [tgi](../../providers/inference/remote_tgi)
|
||||
- **[Sentence Transformers](https://www.sbert.net/)**: Local embedding models - provider ID: `sentence-transformers` - reference documentation: [sentence-transformers](../../providers/inference/inline_sentence-transformers)
|
||||
|
||||
All providers are disabled by default, so you need to enable them by setting the appropriate environment variables.
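As a minimal illustration, enabling a single provider amounts to exporting its URL or API key before starting the stack. The sketch below sets the Ollama variable used elsewhere in these docs and then launches the starter distribution; other providers follow the same pattern with their own variables.

```python
# Sketch: providers in the starter distribution are switched on by exporting their
# URLs / API keys. OLLAMA_URL is the variable used elsewhere in these docs; other
# provider variables (e.g. API keys) follow the provider documentation.
import os
import subprocess

env = os.environ.copy()
env["OLLAMA_URL"] = "http://localhost:11434"  # enable the local Ollama provider

# Equivalent to: OLLAMA_URL=http://localhost:11434 llama stack run starter
subprocess.run(["llama", "stack", "run", "starter"], env=env, check=True)
```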
|
||||
|
|
@ -1,3 +1,10 @@
|
|||
---
|
||||
title: Starting a Llama Stack Server
|
||||
description: Different ways to run Llama Stack servers - as library, container, or Kubernetes deployment
|
||||
sidebar_label: Starting Llama Stack Server
|
||||
sidebar_position: 7
|
||||
---
|
||||
|
||||
# Starting a Llama Stack Server
|
||||
|
||||
You can run a Llama Stack server in one of the following ways:
|
||||
|
@ -9,11 +16,11 @@ This is the simplest way to get started. Using Llama Stack as a library means yo
|
|||
|
||||
## Container:
|
||||
|
||||
Another simple way to start interacting with Llama Stack is to just spin up a container (via Docker or Podman) which is pre-built with all the providers you need. We provide a number of pre-built images so you can start a Llama Stack server instantly. You can also build your own custom container. Which distribution to choose depends on the hardware you have. See [Selection of a Distribution](selection) for more details.
|
||||
Another simple way to start interacting with Llama Stack is to just spin up a container (via Docker or Podman) which is pre-built with all the providers you need. We provide a number of pre-built images so you can start a Llama Stack server instantly. You can also build your own custom container. Which distribution to choose depends on the hardware you have. See [Selection of a Distribution](./list_of_distributions) for more details.
|
||||
|
||||
## Kubernetes:
|
||||
|
||||
If you have built a container image and want to deploy it in a Kubernetes cluster instead of starting the Llama Stack server locally, see the [Kubernetes Deployment Guide](kubernetes_deployment) for more details.
|
||||
If you have built a container image and want to deploy it in a Kubernetes cluster instead of starting the Llama Stack server locally, see the [Kubernetes Deployment Guide](../deploying/kubernetes_deployment) for more details.
|
||||
|
||||
|
||||
```{toctree}
|
|
@ -1,3 +1,13 @@
|
|||
---
|
||||
title: Detailed Tutorial
|
||||
description: Complete guide to using Llama Stack server and client SDK to build AI agents
|
||||
sidebar_label: Detailed Tutorial
|
||||
sidebar_position: 3
|
||||
---
|
||||
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
## Detailed Tutorial
|
||||
|
||||
In this guide, we'll walk through how you can use the Llama Stack (server and client SDK) to test a simple agent.
|
||||
|
@ -8,7 +18,7 @@ In Llama Stack, we provide a server exposing multiple APIs. These APIs are backe
|
|||
Llama Stack is a stateful service with REST APIs to support seamless transition of AI applications across different environments. The server can be run in a variety of ways, including as a standalone binary, Docker container, or hosted service. You can build and test using a local server first and deploy to a hosted endpoint for production.
|
||||
|
||||
In this guide, we'll walk through how to build a RAG agent locally using Llama Stack with [Ollama](https://ollama.com/)
|
||||
as the inference [provider](../providers/index.md#inference) for a Llama Model.
|
||||
as the inference [provider](/docs/providers/inference/) for a Llama Model.
|
||||
|
||||
### Step 1: Installation and Setup
|
||||
|
||||
|
@ -21,23 +31,21 @@ ollama run llama3.2:3b --keepalive 60m
|
|||
|
||||
Install [uv](https://docs.astral.sh/uv/) to set up your virtual environment
|
||||
|
||||
::::{tab-set}
|
||||
|
||||
:::{tab-item} macOS and Linux
|
||||
<Tabs>
|
||||
<TabItem value="unix" label="macOS and Linux">
|
||||
Use `curl` to download the script and execute it with `sh`:
|
||||
```console
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
```
|
||||
:::
|
||||
|
||||
:::{tab-item} Windows
|
||||
</TabItem>
|
||||
<TabItem value="windows" label="Windows">
|
||||
Use `irm` to download the script and execute it with `iex`:
|
||||
|
||||
```console
|
||||
powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex"
|
||||
```
|
||||
:::
|
||||
::::
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
Set up your virtual environment.
|
||||
|
||||
|
@ -48,36 +56,24 @@ source .venv/bin/activate
|
|||
### Step 2: Run Llama Stack
|
||||
Llama Stack is a server that exposes multiple APIs; you connect to it using the Llama Stack client SDK.
|
||||
|
||||
::::{tab-set}
|
||||
|
||||
:::{tab-item} Using `venv`
|
||||
<Tabs>
|
||||
<TabItem value="venv" label="Using venv">
|
||||
You can use Python to build and run the Llama Stack server, which is useful for testing and development.
|
||||
|
||||
Llama Stack uses a [YAML configuration file](../distributions/configuration.md) to specify the stack setup,
|
||||
which defines the providers and their settings. The generated configuration serves as a starting point that you can [customize for your specific needs](../distributions/customizing_run_yaml.md).
|
||||
Llama Stack uses a [YAML configuration file](../distributions/configuration) to specify the stack setup,
|
||||
which defines the providers and their settings. The generated configuration serves as a starting point that you can [customize for your specific needs](../distributions/customizing_run_yaml).
|
||||
Now let's build and run the Llama Stack config for Ollama.
|
||||
We use `starter` as the template. By default all providers are disabled, so we need to enable Ollama by passing environment variables.
|
||||
|
||||
```bash
|
||||
llama stack build --distro starter --image-type venv --run
|
||||
```
|
||||
:::
|
||||
:::{tab-item} Using `venv`
|
||||
You can use Python to build and run the Llama Stack server, which is useful for testing and development.
|
||||
|
||||
Llama Stack uses a [YAML configuration file](../distributions/configuration.md) to specify the stack setup,
|
||||
which defines the providers and their settings.
|
||||
Now let's build and run the Llama Stack config for Ollama.
|
||||
|
||||
```bash
|
||||
llama stack build --distro starter --image-type venv --run
|
||||
```
|
||||
:::
|
||||
:::{tab-item} Using a Container
|
||||
</TabItem>
|
||||
<TabItem value="container" label="Using a Container">
|
||||
You can use a container image to run the Llama Stack server. We provide several container images for the server
|
||||
component that works with different inference providers out of the box. For this guide, we will use
|
||||
`llamastack/distribution-starter` as the container image. If you'd like to build your own image or customize the
|
||||
configurations, please check out [this guide](../distributions/building_distro.md).
|
||||
configurations, please check out [this guide](../distributions/building_distro).
|
||||
First, let's set up some environment variables and create a local directory to mount into the container's file system.
|
||||
```bash
|
||||
export LLAMA_STACK_PORT=8321
|
||||
|
@ -100,9 +96,8 @@ with `host.containers.internal`.
|
|||
|
||||
The configuration YAML for the Ollama distribution is available at `distributions/ollama/run.yaml`.
|
||||
|
||||
```{tip}
|
||||
|
||||
Docker containers run in their own isolated network namespaces on Linux. To allow the container to communicate with services running on the host via `localhost`, you need `--network=host`. This makes the container use the host’s network directly so it can connect to Ollama running on `localhost:11434`.
|
||||
:::tip
|
||||
Docker containers run in their own isolated network namespaces on Linux. To allow the container to communicate with services running on the host via `localhost`, you need `--network=host`. This makes the container use the host's network directly so it can connect to Ollama running on `localhost:11434`.
|
||||
|
||||
Linux users having issues running the above command should instead try the following:
|
||||
```bash
|
||||
|
@ -116,7 +111,6 @@ docker run -it \
|
|||
--env OLLAMA_URL=http://localhost:11434
|
||||
```
|
||||
:::
|
||||
::::
|
||||
You will see output like below:
|
||||
```
|
||||
INFO: Application startup complete.
|
||||
|
@ -127,33 +121,31 @@ Now you can use the Llama Stack client to run inference and build agents!
|
|||
|
||||
You can reuse the server setup or use the [Llama Stack Client](https://github.com/meta-llama/llama-stack-client-python/).
|
||||
Note that the client package is already included in the `llama-stack` package.
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
### Step 3: Run Client CLI
|
||||
|
||||
Open a new terminal and navigate to the same directory you started the server from. Then set up a new virtual environment or activate your
|
||||
existing server virtual environment.
|
||||
|
||||
::::{tab-set}
|
||||
|
||||
:::{tab-item} Reuse Server `venv`
|
||||
<Tabs>
|
||||
<TabItem value="reuse" label="Reuse Server venv">
|
||||
```bash
|
||||
# The client is included in the llama-stack package so we just activate the server venv
|
||||
source .venv/bin/activate
|
||||
```
|
||||
:::
|
||||
|
||||
:::{tab-item} Install with `venv`
|
||||
</TabItem>
|
||||
<TabItem value="install" label="Install with venv">
|
||||
```bash
|
||||
uv venv client --python 3.12
|
||||
source client/bin/activate
|
||||
pip install llama-stack-client
|
||||
```
|
||||
:::
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
::::
|
||||
|
||||
Now let's use the `llama-stack-client` [CLI](../references/llama_stack_client_cli_reference.md) to check the
|
||||
Now let's use the `llama-stack-client` [CLI](../references/llama_stack_client_cli_reference) to check the
|
||||
connectivity to the server.
|
||||
|
||||
```bash
|
||||
|
@ -224,12 +216,11 @@ OpenAIChatCompletion(
|
|||
|
||||
### Step 4: Run the Demos
|
||||
|
||||
Note that these demos show the [Python Client SDK](../references/python_sdk_reference/index.md).
|
||||
Other SDKs are also available, please refer to the [Client SDK](../index.md#client-sdks) list for the complete options.
|
||||
Note that these demos show the [Python Client SDK](../references/python_sdk_reference/).
|
||||
Other SDKs are also available, please refer to the [Client SDK](/docs/) list for the complete options.
|
||||
|
||||
::::{tab-set}
|
||||
|
||||
:::{tab-item} Basic Inference
|
||||
<Tabs>
|
||||
<TabItem value="inference" label="Basic Inference">
|
||||
Now you can run inference using the Llama Stack client SDK.
|
||||
|
||||
#### i. Create the Script
|
||||
|
@ -269,9 +260,8 @@ Which will output:
|
|||
Model: ollama/llama3.2:3b
|
||||
OpenAIChatCompletion(id='chatcmpl-30cd0f28-a2ad-4b6d-934b-13707fc60ebf', choices=[OpenAIChatCompletionChoice(finish_reason='stop', index=0, message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(role='assistant', content="Lines of code unfold\nAlgorithms dance with ease\nLogic's gentle kiss", name=None, tool_calls=None, refusal=None, annotations=None, audio=None, function_call=None), logprobs=None)], created=1751732480, model='llama3.2:3b', object='chat.completion', service_tier=None, system_fingerprint='fp_ollama', usage={'completion_tokens': 16, 'prompt_tokens': 37, 'total_tokens': 53, 'completion_tokens_details': None, 'prompt_tokens_details': None})
|
||||
```
|
||||
:::
|
||||
|
||||
:::{tab-item} Build a Simple Agent
|
||||
</TabItem>
|
||||
<TabItem value="agent" label="Build a Simple Agent">
|
||||
Next we can move beyond simple inference and build an agent that can perform tasks using the Llama Stack server.
|
||||
#### i. Create the Script
|
||||
Create a file `agent.py` and add the following code:
|
||||
|
@ -439,9 +429,8 @@ uv run python agent.py
|
|||
|
||||
So, that's me in a nutshell!
|
||||
```
|
||||
:::
|
||||
|
||||
:::{tab-item} Build a RAG Agent
|
||||
</TabItem>
|
||||
<TabItem value="rag" label="Build a RAG Agent">
|
||||
|
||||
For our last demo, we can build a RAG agent that can answer questions about the Torchtune project using the documents
|
||||
in a vector database.
|
||||
|
@ -460,10 +449,12 @@ client = LlamaStackClient(base_url="http://localhost:8321")
|
|||
embed_lm = next(m for m in client.models.list() if m.model_type == "embedding")
|
||||
embedding_model = embed_lm.identifier
|
||||
vector_db_id = f"v{uuid.uuid4().hex}"
|
||||
client.vector_dbs.register(
|
||||
# The VectorDB API is deprecated; the server now returns its own authoritative ID.
|
||||
# We capture the correct ID from the response's .identifier attribute.
|
||||
vector_db_id = client.vector_dbs.register(
|
||||
vector_db_id=vector_db_id,
|
||||
embedding_model=embedding_model,
|
||||
)
|
||||
).identifier
|
||||
|
||||
# Create Documents
|
||||
urls = [
|
||||
|
@ -542,10 +533,9 @@ uv run python rag_agent.py
|
|||
...
|
||||
Overall, DORA is a powerful reinforcement learning algorithm that can learn complex tasks from human demonstrations. However, it requires careful consideration of the challenges and limitations to achieve optimal results.
|
||||
```
|
||||
:::
|
||||
|
||||
::::
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
**You're Ready to Build Your Own Apps!**
|
||||
|
||||
Congrats! 🥳 Now you're ready to [build your own Llama Stack applications](../building_applications/index)! 🚀
|
||||
Congrats! 🥳 Now you're ready to [build your own Llama Stack applications](../building_applications/)! 🚀
|
|
@ -1,3 +1,9 @@
|
|||
---
|
||||
description: We have a number of client-side SDKs available for different languages.
|
||||
sidebar_label: Libraries
|
||||
sidebar_position: 2
|
||||
title: Libraries (SDKs)
|
||||
---
|
||||
## Libraries (SDKs)
|
||||
|
||||
We have a number of client-side SDKs available for different languages.
|
||||
|
@ -7,4 +13,4 @@ We have a number of client-side SDKs available for different languages.
|
|||
| Python | [llama-stack-client-python](https://github.com/meta-llama/llama-stack-client-python) | [](https://pypi.org/project/llama_stack_client/)
|
||||
| Swift | [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift/tree/latest-release) | [](https://swiftpackageindex.com/meta-llama/llama-stack-client-swift)
|
||||
| Node | [llama-stack-client-node](https://github.com/meta-llama/llama-stack-client-node) | [](https://npmjs.org/package/llama-stack-client)
|
||||
| Kotlin | [llama-stack-client-kotlin](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release) | [](https://central.sonatype.com/artifact/com.llama.llamastack/llama-stack-client-kotlin)
|
||||
| Kotlin | [llama-stack-client-kotlin](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release) | [](https://central.sonatype.com/artifact/com.llama.llamastack/llama-stack-client-kotlin)
|
|
@ -1,4 +1,9 @@
|
|||
## Quickstart
|
||||
---
|
||||
description: Get started with Llama Stack in minutes.
|
||||
sidebar_label: Quickstart
|
||||
sidebar_position: 1
|
||||
title: Quickstart
|
||||
---
|
||||
|
||||
Get started with Llama Stack in minutes!
|
||||
|
||||
|
@ -6,7 +11,7 @@ Llama Stack is a stateful service with REST APIs to support the seamless transit
|
|||
environments. You can build and test using a local server first and deploy to a hosted endpoint for production.
|
||||
|
||||
In this guide, we'll walk through how to build a RAG application locally using Llama Stack with [Ollama](https://ollama.com/)
|
||||
as the inference [provider](../providers/inference/index) for a Llama Model.
|
||||
as the inference [provider](/docs/providers/inference) for a Llama Model.
|
||||
|
||||
**💡 Notebook Version:** You can also follow this quickstart guide in a Jupyter notebook format: [quick_start.ipynb](https://github.com/meta-llama/llama-stack/blob/main/docs/quick_start.ipynb)
|
||||
|
||||
|
@ -27,8 +32,75 @@ OLLAMA_URL=http://localhost:11434 \
|
|||
#### Step 3: Run the demo
|
||||
Now open up a new terminal and copy the following script into a file named `demo_script.py`.
|
||||
|
||||
```{literalinclude} ./demo_script.py
|
||||
:language: python
|
||||
```python title="demo_script.py"
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient
|
||||
|
||||
vector_db_id = "my_demo_vector_db"
|
||||
client = LlamaStackClient(base_url="http://localhost:8321")
|
||||
|
||||
models = client.models.list()
|
||||
|
||||
# Select the first LLM and the first embedding model
|
||||
model_id = next(m for m in models if m.model_type == "llm").identifier
|
||||
embedding_model_id = (
|
||||
em := next(m for m in models if m.model_type == "embedding")
|
||||
).identifier
|
||||
embedding_dimension = em.metadata["embedding_dimension"]
|
||||
|
||||
vector_db = client.vector_dbs.register(
|
||||
vector_db_id=vector_db_id,
|
||||
embedding_model=embedding_model_id,
|
||||
embedding_dimension=embedding_dimension,
|
||||
provider_id="faiss",
|
||||
)
|
||||
vector_db_id = vector_db.identifier
|
||||
source = "https://www.paulgraham.com/greatwork.html"
|
||||
print("rag_tool> Ingesting document:", source)
|
||||
document = RAGDocument(
|
||||
document_id="document_1",
|
||||
content=source,
|
||||
mime_type="text/html",
|
||||
metadata={},
|
||||
)
|
||||
client.tool_runtime.rag_tool.insert(
|
||||
documents=[document],
|
||||
vector_db_id=vector_db_id,
|
||||
chunk_size_in_tokens=100,
|
||||
)
|
||||
agent = Agent(
|
||||
client,
|
||||
model=model_id,
|
||||
instructions="You are a helpful assistant",
|
||||
tools=[
|
||||
{
|
||||
"name": "builtin::rag/knowledge_search",
|
||||
"args": {"vector_db_ids": [vector_db_id]},
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
prompt = "How do you do great work?"
|
||||
print("prompt>", prompt)
|
||||
|
||||
use_stream = True
|
||||
response = agent.create_turn(
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
session_id=agent.create_session("rag_session"),
|
||||
stream=use_stream,
|
||||
)
|
||||
|
||||
# Only call `AgentEventLogger().log(response)` for streaming responses.
|
||||
if use_stream:
|
||||
for log in AgentEventLogger().log(response):
|
||||
log.print()
|
||||
else:
|
||||
print(response)
|
||||
```
|
||||
We will use `uv` to run the script
|
||||
```
|
||||
|
@ -59,19 +131,19 @@ Ultimately, great work is about making a meaningful contribution and leaving a l
```
Congratulations! You've successfully built your first RAG application using Llama Stack! 🎉🥳

```{admonition} HuggingFace access
:class: tip
:::tip HuggingFace access

If you are getting a **401 Client Error** from HuggingFace for the **all-MiniLM-L6-v2** model, try setting **HF_TOKEN** to a valid HuggingFace token in your environment
```

:::

### Next Steps

Now you're ready to dive deeper into Llama Stack!
- Explore the [Detailed Tutorial](./detailed_tutorial.md).
- Explore the [Detailed Tutorial](./detailed_tutorial).
- Try the [Getting Started Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb).
- Browse more [Notebooks on GitHub](https://github.com/meta-llama/llama-stack/tree/main/docs/notebooks).
- Learn about Llama Stack [Concepts](../concepts/index.md).
- Discover how to [Build Llama Stacks](../distributions/index.md).
- Refer to our [References](../references/index.md) for details on the Llama CLI and Python SDK.
- Learn about Llama Stack [Concepts](/docs/concepts).
- Discover how to [Build Llama Stacks](/docs/distributions).
- Refer to our [References](/docs/references) for details on the Llama CLI and Python SDK.
- Check out the [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repository for example applications and tutorials.
101 docs/docs/index.mdx Normal file
@ -0,0 +1,101 @@
---
sidebar_position: 1
title: Welcome to Llama Stack
description: Llama Stack is the open-source framework for building generative AI applications
sidebar_label: Intro
tags:
  - getting-started
  - overview
---

# Welcome to Llama Stack

Llama Stack is the open-source framework for building generative AI applications.

:::tip Llama 4 is here!

Check out [Getting Started with Llama 4](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/getting_started_llama4.ipynb)

:::

:::tip News

Llama Stack is now available! See the [release notes](https://github.com/meta-llama/llama-stack/releases) for more details.

:::

## What is Llama Stack?

Llama Stack defines and standardizes the core building blocks needed to bring generative AI applications to market. It provides a unified set of APIs with implementations from leading service providers, enabling seamless transitions between development and production environments. More specifically, it provides:

- **Unified API layer** for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.
- **Plugin architecture** to support the rich ecosystem of implementations of the different APIs in different environments like local development, on-premises, cloud, and mobile.
- **Prepackaged verified distributions** which offer a one-stop solution for developers to get started quickly and reliably in any environment
- **Multiple developer interfaces** like CLI and SDKs for Python, Node, iOS, and Android
- **Standalone applications** as examples for how to build production-grade AI applications with Llama Stack

<img src="/img/llama-stack.png" alt="Llama Stack" width="400px" />

Our goal is to provide pre-packaged implementations (aka "distributions") which can be run in a variety of deployment environments. Llama Stack can assist you across your entire app development lifecycle: start iterating locally, on mobile, or on desktop, and seamlessly transition to on-prem or public cloud deployments. At every point in this transition, the same set of APIs and the same developer experience are available.

## How does Llama Stack work?

Llama Stack consists of a server (with multiple pluggable API providers) and Client SDKs meant to be used in your applications. The server can be run in a variety of environments, including local (inline) development, on-premises, and cloud. The client SDKs are available for Python, Swift, Node, and Kotlin.
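
To make this concrete, here is a minimal sketch using the Python client SDK. It assumes a Llama Stack server is already running locally on the default port 8321 and that at least one LLM is registered; the exact model identifiers will depend on your distribution.

```python
from llama_stack_client import LlamaStackClient

# Point the client at a running Llama Stack server (local in this sketch).
client = LlamaStackClient(base_url="http://localhost:8321")

# Discover what the server exposes; every provider behind the server is
# reachable through the same unified APIs.
models = client.models.list()
llm_id = next(m for m in models if m.model_type == "llm").identifier

print("Available models:", [m.identifier for m in models])
print("Using LLM:", llm_id)
```

The same client code works unchanged whether the server runs on your laptop, on-prem, or in the cloud; only `base_url` changes.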

## Quick Links

- Ready to build? Check out the [Getting Started Guide](https://llama-stack.github.io/getting_started/quickstart).
- Want to contribute? See the [Contributing Guide](https://github.com/llamastack/llama-stack/blob/main/CONTRIBUTING.md).
- Explore [Example Applications](https://github.com/llamastack/llama-stack-apps) built with Llama Stack.

## Rich Ecosystem Support

Llama Stack provides adapters for popular providers across all API categories:

- **Inference**: Meta Reference, Ollama, Fireworks, Together, NVIDIA, vLLM, AWS Bedrock, OpenAI, Anthropic, and more
- **Vector Databases**: FAISS, Chroma, Milvus, Postgres, Weaviate, Qdrant, and others
- **Safety**: Llama Guard, Prompt Guard, Code Scanner, AWS Bedrock
- **Training & Evaluation**: HuggingFace, TorchTune, NVIDIA NEMO

:::info Provider Details
For complete provider compatibility and setup instructions, see our [Providers Documentation](https://llamastack.github.io/providers/).
:::

## Get Started Today

<div style={{display: 'flex', gap: '1rem', flexWrap: 'wrap', margin: '2rem 0'}}>
  <a href="https://llama-stack.github.io/getting_started/quickstart"
     style={{
       background: 'var(--ifm-color-primary)',
       color: 'white',
       padding: '0.75rem 1.5rem',
       borderRadius: '0.5rem',
       textDecoration: 'none',
       fontWeight: 'bold'
     }}>
    🚀 Quick Start Guide
  </a>
  <a href="https://github.com/llamastack/llama-stack-apps"
     style={{
       border: '2px solid var(--ifm-color-primary)',
       color: 'var(--ifm-color-primary)',
       padding: '0.75rem 1.5rem',
       borderRadius: '0.5rem',
       textDecoration: 'none',
       fontWeight: 'bold'
     }}>
    📚 Example Apps
  </a>
  <a href="https://github.com/llamastack/llama-stack"
     style={{
       border: '2px solid #666',
       color: '#666',
       padding: '0.75rem 1.5rem',
       borderRadius: '0.5rem',
       textDecoration: 'none',
       fontWeight: 'bold'
     }}>
    ⭐ Star on GitHub
  </a>
</div>
@ -1,3 +1,16 @@
---
description: "Agents API for creating and interacting with agentic systems.

  Main functionalities provided by this API:
  - Create agents with specific instructions and ability to use tools.
  - Interactions with agents are grouped into sessions (\"threads\"), and each interaction is called a \"turn\".
  - Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).
  - Agents can be provided with various shields (see the Safety API for more details).
  - Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details."
sidebar_label: Agents
title: Agents
---

# Agents

## Overview

@ -12,11 +25,3 @@ Agents API for creating and interacting with agentic systems.
- Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details.

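As a rough illustration of these concepts, the following sketch (assuming the Python client SDK, a locally running server on port 8321, and an available LLM) creates an agent, opens a session, and runs two turns in that session:

```python
from llama_stack_client import Agent, AgentEventLogger, LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")
model_id = next(m for m in client.models.list() if m.model_type == "llm").identifier

# An agent is defined by a model plus instructions (and, optionally, tools and shields).
agent = Agent(client, model=model_id, instructions="You are a helpful assistant")

# Interactions are grouped into a session ("thread"); each interaction is a turn.
session_id = agent.create_session("demo_session")
for prompt in ["Hi! What can you do?", "Summarize that in one sentence."]:
    response = agent.create_turn(
        messages=[{"role": "user", "content": prompt}],
        session_id=session_id,
        stream=True,
    )
    for log in AgentEventLogger().log(response):
        log.print()
```
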
This section contains documentation for all available providers for the **agents** API.

## Providers

```{toctree}
:maxdepth: 1

inline_meta-reference
```
Some files were not shown because too many files have changed in this diff.