forked from phoenix-oss/llama-stack-mirror

Compare commits (2 commits): 337aa6d183, 54747c28fc

841 changed files with 78534 additions and 280283 deletions
@ -1,6 +0,0 @@
[run]
omit =
    */tests/*
    */llama_stack/providers/*
    */llama_stack/templates/*
    .venv/*
File: .github/CODEOWNERS (2 changes, vendored)
@ -2,4 +2,4 @@
# These owners will be the default owners for everything in
# the repo. Unless a later match takes precedence,
* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning
* @ashwinb @yanxi0830 @hardikjshah @dltn @raghotham @dineshyv @vladimirivic @sixianyi0721 @ehhuang @terrytangyuan @SLR722 @leseb
File: .github/PULL_REQUEST_TEMPLATE.md (10 changes, vendored)
@ -1,8 +1,10 @@
# What does this PR do?
<!-- Provide a short summary of what this PR does and why. Link to relevant issues if applicable. -->
[Provide a short summary of what this PR does and why. Link to relevant issues if applicable.]

<!-- If resolving an issue, uncomment and update the line below -->
<!-- Closes #[issue-number] -->
[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])

## Test Plan
<!-- Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.* -->
[Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.*]

[//]: # (## Documentation)
File: .github/TRIAGERS.md (2 changes, vendored)
@ -1,2 +1,2 @@
# This file documents Triage members in the Llama Stack community
@bbrowning @booxter @franciscojavierarceo @leseb
@franciscojavierarceo @leseb
File: .github/actions/setup-ollama/action.yml (26 changes, vendored)
@ -1,26 +0,0 @@
name: Setup Ollama
description: Start Ollama and cache model
inputs:
  models:
    description: Comma-separated list of models to pull
    default: "llama3.2:3b-instruct-fp16,all-minilm:latest"
runs:
  using: "composite"
  steps:
    - name: Install and start Ollama
      shell: bash
      run: |
        # the ollama installer also starts the ollama service
        curl -fsSL https://ollama.com/install.sh | sh

    # Do NOT cache models - pulling the cache is actually slower than just pulling the model.
    # It takes ~45 seconds to pull the models from the cache and unpack it, but only 30 seconds to
    # pull them directly.
    # Maybe this is because the cache is being pulled at the same time by all the matrix jobs?
    - name: Pull requested models
      if: inputs.models != ''
      shell: bash
      run: |
        for model in $(echo "${{ inputs.models }}" | tr ',' ' '); do
          ollama pull "$model"
        done
File: .github/actions/setup-runner/action.yml (22 changes, vendored)
@ -1,22 +0,0 @@
name: Setup runner
description: Prepare a runner for the tests (install uv, python, project dependencies, etc.)
runs:
  using: "composite"
  steps:
    - name: Install uv
      uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
      with:
        python-version: "3.10"
        activate-environment: true
        version: 0.7.6

    - name: Install dependencies
      shell: bash
      run: |
        uv sync --all-groups
        uv pip install ollama faiss-cpu
        # always test against the latest version of the client
        # TODO: this is not necessarily a good idea. we need to test against both published and latest
        # to find out backwards compatibility issues.
        uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
        uv pip install -e .
File: .github/workflows/Dockerfile (1 change, vendored)
@ -1 +0,0 @@
FROM localhost:5000/distribution-kvant:dev
@ -15,13 +15,13 @@ jobs:
      pull-requests: write # for peter-evans/create-pull-request to create a PR
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - uses: actions/checkout@v4
        with:
          ref: main
          fetch-depth: 0
      - run: |
          python ./scripts/gen-changelog.py
      - uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # v7.0.8
      - uses: peter-evans/create-pull-request@v7
        with:
          title: 'docs: update CHANGELOG.md for ${{ github.ref_name }}'
          commit-message: 'docs: update CHANGELOG.md for ${{ github.ref_name }}'
File: .github/workflows/ci-playground.yaml (73 changes, vendored)
@ -1,73 +0,0 @@
name: Build and Push playground container
|
||||
run-name: Build and Push playground container
|
||||
on:
|
||||
workflow_dispatch:
|
||||
#schedule:
|
||||
# - cron: "0 10 * * *"
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
- kvant
|
||||
tags:
|
||||
- 'v*'
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
- kvant
|
||||
env:
|
||||
IMAGE: git.kvant.cloud/${{github.repository}}-playground
|
||||
jobs:
|
||||
build-playground:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set current time
|
||||
uses: https://github.com/gerred/actions/current-time@master
|
||||
id: current_time
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to git.kvant.cloud registry
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: git.kvant.cloud
|
||||
username: ${{ vars.ORG_PACKAGE_WRITER_USERNAME }}
|
||||
password: ${{ secrets.ORG_PACKAGE_WRITER_TOKEN }}
|
||||
|
||||
- name: Docker meta
|
||||
id: meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
# list of Docker images to use as base name for tags
|
||||
images: |
|
||||
${{env.IMAGE}}
|
||||
# generate Docker tags based on the following events/attributes
|
||||
tags: |
|
||||
type=schedule
|
||||
type=ref,event=branch
|
||||
type=ref,event=pr
|
||||
type=ref,event=tag
|
||||
type=semver,pattern={{version}}
|
||||
|
||||
- name: Build and push to gitea registry
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
push: ${{ github.event_name != 'pull_request' }}
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
context: .
|
||||
file: llama_stack/distribution/ui/Containerfile
|
||||
provenance: mode=max
|
||||
sbom: true
|
||||
build-args: |
|
||||
BUILD_DATE=${{ steps.current_time.outputs.time }}
|
||||
cache-from: |
|
||||
type=registry,ref=${{ env.IMAGE }}:buildcache
|
||||
type=registry,ref=${{ env.IMAGE }}:${{ github.ref_name }}
|
||||
type=registry,ref=${{ env.IMAGE }}:main
|
||||
cache-to: type=registry,ref=${{ env.IMAGE }}:buildcache,mode=max,image-manifest=true
|
File: .github/workflows/ci.yaml (98 changes, vendored)
@ -1,98 +0,0 @@
name: Build and Push container
|
||||
run-name: Build and Push container
|
||||
on:
|
||||
workflow_dispatch:
|
||||
#schedule:
|
||||
# - cron: "0 10 * * *"
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
- kvant
|
||||
tags:
|
||||
- 'v*'
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
- kvant
|
||||
env:
|
||||
IMAGE: git.kvant.cloud/${{github.repository}}
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
services:
|
||||
registry:
|
||||
image: registry:2
|
||||
ports:
|
||||
- 5000:5000
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set current time
|
||||
uses: https://github.com/gerred/actions/current-time@master
|
||||
id: current_time
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
with:
|
||||
driver-opts: network=host
|
||||
|
||||
- name: Login to git.kvant.cloud registry
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: git.kvant.cloud
|
||||
username: ${{ vars.ORG_PACKAGE_WRITER_USERNAME }}
|
||||
password: ${{ secrets.ORG_PACKAGE_WRITER_TOKEN }}
|
||||
|
||||
- name: Docker meta
|
||||
id: meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
# list of Docker images to use as base name for tags
|
||||
images: |
|
||||
${{env.IMAGE}}
|
||||
# generate Docker tags based on the following events/attributes
|
||||
tags: |
|
||||
type=schedule
|
||||
type=ref,event=branch
|
||||
type=ref,event=pr
|
||||
type=ref,event=tag
|
||||
type=semver,pattern={{version}}
|
||||
|
||||
- name: Install uv
|
||||
uses: https://github.com/astral-sh/setup-uv@v5
|
||||
with:
|
||||
# Install a specific version of uv.
|
||||
version: "0.7.8"
|
||||
|
||||
- name: Build
|
||||
env:
|
||||
USE_COPY_NOT_MOUNT: true
|
||||
LLAMA_STACK_DIR: .
|
||||
run: |
|
||||
uvx --from . llama stack build --template kvant --image-type container
|
||||
|
||||
# docker tag distribution-kvant:dev ${{env.IMAGE}}:kvant
|
||||
# docker push ${{env.IMAGE}}:kvant
|
||||
|
||||
docker tag distribution-kvant:dev localhost:5000/distribution-kvant:dev
|
||||
docker push localhost:5000/distribution-kvant:dev
|
||||
|
||||
- name: Build and push to gitea registry
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
push: ${{ github.event_name != 'pull_request' }}
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
context: .github/workflows
|
||||
provenance: mode=max
|
||||
sbom: true
|
||||
build-args: |
|
||||
BUILD_DATE=${{ steps.current_time.outputs.time }}
|
||||
cache-from: |
|
||||
type=registry,ref=${{ env.IMAGE }}:buildcache
|
||||
type=registry,ref=${{ env.IMAGE }}:${{ github.ref_name }}
|
||||
type=registry,ref=${{ env.IMAGE }}:main
|
||||
cache-to: type=registry,ref=${{ env.IMAGE }}:buildcache,mode=max,image-manifest=true
|
@ -140,7 +140,7 @@ jobs:
      #######################
      - name: "Checkout 'meta-llama/llama-stack' repository"
        id: checkout_repo
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.branch }}

@ -302,7 +302,7 @@
      - name: "PR - Test Summary"
        id: pr_test_summary_create
        if: github.event_name == 'pull_request_target'
        uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86 # v2.4
        uses: test-summary/action@v2
        with:
          paths: "${{ github.workspace }}/merged-test-results.xml"
          output: test-summary.md

@ -310,7 +310,7 @@
      - name: "PR - Upload Test Summary"
        id: pr_test_summary_upload
        if: github.event_name == 'pull_request_target'
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        uses: actions/upload-artifact@v4
        with:
          name: test-summary
          path: test-summary.md

@ -320,7 +320,7 @@
      - name: "PR - Update comment"
        id: pr_update_comment
        if: github.event_name == 'pull_request_target'
        uses: thollander/actions-comment-pull-request@24bffb9b452ba05a4f3f77933840a6a841d1b32b # v3.0.1
        uses: thollander/actions-comment-pull-request@v3
        with:
          filePath: test-summary.md

@ -350,6 +350,6 @@
      - name: "Manual - Test Summary"
        id: manual_test_summary
        if: always() && github.event_name == 'workflow_dispatch'
        uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86 # v2.4
        uses: test-summary/action@v2
        with:
          paths: "${{ github.workspace }}/merged-test-results.xml"
File: .github/workflows/integration-tests.yml (101 changes, new file, vendored)
@ -0,0 +1,101 @@
name: Integration Tests

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
    paths:
      - 'distributions/**'
      - 'llama_stack/**'
      - 'tests/integration/**'
      - 'uv.lock'
      - 'pyproject.toml'
      - 'requirements.txt'
      - '.github/workflows/integration-tests.yml' # This workflow

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  test-matrix:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Listing tests manually since some of them currently fail
        # TODO: generate matrix list from tests/integration when fixed
        test-type: [inference, datasets, inspect, scoring, post_training, providers]
      fail-fast: false # we want to run all tests regardless of failure

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Install uv
        uses: astral-sh/setup-uv@v5
        with:
          python-version: "3.10"

      - name: Install Ollama
        run: |
          curl -fsSL https://ollama.com/install.sh | sh

      - name: Pull Ollama image
        run: |
          ollama pull llama3.2:3b-instruct-fp16

      - name: Start Ollama in background
        run: |
          nohup ollama run llama3.2:3b-instruct-fp16 > ollama.log 2>&1 &

      - name: Set Up Environment and Install Dependencies
        run: |
          uv sync --extra dev --extra test
          uv pip install ollama faiss-cpu
          # always test against the latest version of the client
          uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
          uv pip install -e .
          llama stack build --template ollama --image-type venv

      - name: Wait for Ollama to start
        run: |
          echo "Waiting for Ollama..."
          for i in {1..30}; do
            if curl -s http://localhost:11434 | grep -q "Ollama is running"; then
              echo "Ollama is running!"
              exit 0
            fi
            sleep 1
          done
          echo "Ollama failed to start"
          ollama ps
          ollama.log
          exit 1

      - name: Start Llama Stack server in background
        env:
          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
        run: |
          source .venv/bin/activate
          nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv > server.log 2>&1 &

      - name: Wait for Llama Stack server to be ready
        run: |
          echo "Waiting for Llama Stack server..."
          for i in {1..30}; do
            if curl -s http://localhost:8321/v1/health | grep -q "OK"; then
              echo "Llama Stack server is up!"
              exit 0
            fi
            sleep 1
          done
          echo "Llama Stack server failed to start"
          cat server.log
          exit 1

      - name: Run Integration Tests
        env:
          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
        run: |
          uv run pytest -v tests/integration/${{ matrix.test-type }} --stack-config=ollama --text-model="meta-llama/Llama-3.2-3B-Instruct" --embedding-model=all-MiniLM-L6-v2
File: .github/workflows/pre-commit.yml (33 changes, new file, vendored)
@ -0,0 +1,33 @@
name: Pre-commit

on:
  pull_request:
  push:
    branches: [main]

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  pre-commit:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          cache: pip
          cache-dependency-path: |
            **/requirements*.txt
            .pre-commit-config.yaml

      - uses: pre-commit/action@v3.0.1

      - name: Verify if there are any diff files after pre-commit
        run: |
          git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1)
File: .github/workflows/providers-build.yml (83 changes, new file, vendored)
@ -0,0 +1,83 @@
name: Test Llama Stack Build

on:
  push:
    branches:
      - main
    paths:
      - 'llama_stack/cli/stack/build.py'
      - 'llama_stack/cli/stack/_build.py'
      - 'llama_stack/distribution/build.*'
      - 'llama_stack/distribution/*.sh'
      - '.github/workflows/providers-build.yml'
  pull_request:
    paths:
      - 'llama_stack/cli/stack/build.py'
      - 'llama_stack/cli/stack/_build.py'
      - 'llama_stack/distribution/build.*'
      - 'llama_stack/distribution/*.sh'
      - '.github/workflows/providers-build.yml'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  generate-matrix:
    runs-on: ubuntu-latest
    outputs:
      templates: ${{ steps.set-matrix.outputs.templates }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Generate Template List
        id: set-matrix
        run: |
          templates=$(ls llama_stack/templates/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
          echo "templates=$templates" >> "$GITHUB_OUTPUT"

  build:
    needs: generate-matrix
    runs-on: ubuntu-latest
    strategy:
      matrix:
        template: ${{ fromJson(needs.generate-matrix.outputs.templates) }}
        image-type: [venv, container]
      fail-fast: false # We want to run all jobs even if some fail

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install uv
        uses: astral-sh/setup-uv@v5
        with:
          python-version: "3.10"

      - name: Install LlamaStack
        run: |
          uv venv
          source .venv/bin/activate
          uv pip install -e .

      - name: Print build dependencies
        run: |
          uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test --print-deps-only

      - name: Run Llama Stack Build
        run: |
          # USE_COPY_NOT_MOUNT is set to true since mounting is not supported by docker buildx, we use COPY instead
          # LLAMA_STACK_DIR is set to the current directory so we are building from the source
          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test

      - name: Print dependencies in the image
        if: matrix.image-type == 'venv'
        run: |
          source test/bin/activate
          uv pip list
@ -20,6 +20,6 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Check PR Title's semantic conformance
        uses: amannn/action-semantic-pull-request@0723387faaf9b38adef4775cd42cfd5155ed6017 # v5.5.3
        uses: amannn/action-semantic-pull-request@v5
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@ -22,7 +22,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Stale Action
        uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
        uses: actions/stale@v9
        with:
          stale-issue-label: 'stale'
          stale-issue-message: >
@ -20,7 +20,7 @@ jobs:
      matrix:
        provider: [fireworks, together]
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - uses: actions/checkout@v4
        with:
          ref: ${{ github.event.inputs.commit_sha }}
@ -6,6 +6,7 @@ on:
  pull_request:
    branches: [ main ]
    paths:
      - 'distributions/**'
      - 'llama_stack/**'
      - 'tests/unit/**'
      - 'uv.lock'

@ -30,11 +31,17 @@ jobs:
          - "3.12"
          - "3.13"
    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - uses: actions/checkout@v4

      - name: Install dependencies
        uses: ./.github/actions/setup-runner
      - name: Set up Python ${{ matrix.python }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python }}

      - uses: astral-sh/setup-uv@v5
        with:
          python-version: ${{ matrix.python }}
          enable-cache: false

      - name: Run unit tests
        run: |

@ -42,7 +49,7 @@ jobs:

      - name: Upload test results
        if: always()
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        uses: actions/upload-artifact@v4
        with:
          name: test-results-${{ matrix.python }}
          path: |
@ -14,8 +14,6 @@ on:
      - 'docs/**'
      - 'pyproject.toml'
      - '.github/workflows/update-readthedocs.yml'
  tags:
    - '*'
  pull_request:
    branches:
      - main

@ -35,10 +33,18 @@ jobs:
      TOKEN: ${{ secrets.READTHEDOCS_TOKEN }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        uses: actions/checkout@v4

      - name: Install dependencies
        uses: ./.github/actions/setup-runner
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install the latest version of uv
        uses: astral-sh/setup-uv@v5

      - name: Sync with uv
        run: uv sync --extra docs

      - name: Build HTML
        run: |

@ -55,10 +61,7 @@ jobs:

          response=$(curl -X POST \
            -H "Content-Type: application/json" \
            -d "{
              \"token\": \"$TOKEN\",
              \"version\": \"$GITHUB_REF_NAME\"
            }" \
            -d "{\"token\": \"$TOKEN\"}" \
            https://readthedocs.org/api/v2/webhook/llama-stack/289768/)

          echo "Response: $response"
File: .github/workflows_upstream/install-script-ci.yml (26 changes, vendored)
@ -1,26 +0,0 @@
name: Installer CI

on:
  pull_request:
    paths:
      - 'install.sh'
  push:
    paths:
      - 'install.sh'
  schedule:
    - cron: '0 2 * * *' # every day at 02:00 UTC

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
      - name: Run ShellCheck on install.sh
        run: shellcheck install.sh
  smoke-test:
    needs: lint
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
      - name: Run installer end-to-end
        run: ./install.sh
|
@ -1,132 +0,0 @@
|
|||
name: Integration Auth Tests
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ main ]
|
||||
pull_request:
|
||||
branches: [ main ]
|
||||
paths:
|
||||
- 'distributions/**'
|
||||
- 'llama_stack/**'
|
||||
- 'tests/integration/**'
|
||||
- 'uv.lock'
|
||||
- 'pyproject.toml'
|
||||
- 'requirements.txt'
|
||||
- '.github/workflows/integration-auth-tests.yml' # This workflow
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
test-matrix:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
auth-provider: [oauth2_token]
|
||||
fail-fast: false # we want to run all tests regardless of failure
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
- name: Install dependencies
|
||||
uses: ./.github/actions/setup-runner
|
||||
|
||||
- name: Build Llama Stack
|
||||
run: |
|
||||
llama stack build --template ollama --image-type venv
|
||||
|
||||
- name: Install minikube
|
||||
if: ${{ matrix.auth-provider == 'kubernetes' }}
|
||||
uses: medyagh/setup-minikube@cea33675329b799adccc9526aa5daccc26cd5052 # v0.0.19
|
||||
|
||||
- name: Start minikube
|
||||
if: ${{ matrix.auth-provider == 'oauth2_token' }}
|
||||
run: |
|
||||
minikube start
|
||||
kubectl get pods -A
|
||||
|
||||
- name: Configure Kube Auth
|
||||
if: ${{ matrix.auth-provider == 'oauth2_token' }}
|
||||
run: |
|
||||
kubectl create namespace llama-stack
|
||||
kubectl create serviceaccount llama-stack-auth -n llama-stack
|
||||
kubectl create rolebinding llama-stack-auth-rolebinding --clusterrole=admin --serviceaccount=llama-stack:llama-stack-auth -n llama-stack
|
||||
kubectl create token llama-stack-auth -n llama-stack > llama-stack-auth-token
|
||||
cat <<EOF | kubectl apply -f -
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: allow-anonymous-openid
|
||||
rules:
|
||||
- nonResourceURLs: ["/openid/v1/jwks"]
|
||||
verbs: ["get"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: allow-anonymous-openid
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: allow-anonymous-openid
|
||||
subjects:
|
||||
- kind: User
|
||||
name: system:anonymous
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
EOF
|
||||
|
||||
- name: Set Kubernetes Config
|
||||
if: ${{ matrix.auth-provider == 'oauth2_token' }}
|
||||
run: |
|
||||
echo "KUBERNETES_API_SERVER_URL=$(kubectl get --raw /.well-known/openid-configuration| jq -r .jwks_uri)" >> $GITHUB_ENV
|
||||
echo "KUBERNETES_CA_CERT_PATH=$(kubectl config view --minify -o jsonpath='{.clusters[0].cluster.certificate-authority}')" >> $GITHUB_ENV
|
||||
echo "KUBERNETES_ISSUER=$(kubectl get --raw /.well-known/openid-configuration| jq -r .issuer)" >> $GITHUB_ENV
|
||||
echo "KUBERNETES_AUDIENCE=$(kubectl create token llama-stack-auth -n llama-stack --duration=1h | cut -d. -f2 | base64 -d | jq -r '.aud[0]')" >> $GITHUB_ENV
|
||||
|
||||
- name: Set Kube Auth Config and run server
|
||||
env:
|
||||
INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
|
||||
if: ${{ matrix.auth-provider == 'oauth2_token' }}
|
||||
run: |
|
||||
run_dir=$(mktemp -d)
|
||||
cat <<'EOF' > $run_dir/run.yaml
|
||||
version: '2'
|
||||
image_name: kube
|
||||
apis: []
|
||||
providers: {}
|
||||
server:
|
||||
port: 8321
|
||||
EOF
|
||||
yq eval '.server.auth = {"provider_type": "${{ matrix.auth-provider }}"}' -i $run_dir/run.yaml
|
||||
yq eval '.server.auth.config = {"tls_cafile": "${{ env.KUBERNETES_CA_CERT_PATH }}", "issuer": "${{ env.KUBERNETES_ISSUER }}", "audience": "${{ env.KUBERNETES_AUDIENCE }}"}' -i $run_dir/run.yaml
|
||||
yq eval '.server.auth.config.jwks = {"uri": "${{ env.KUBERNETES_API_SERVER_URL }}"}' -i $run_dir/run.yaml
|
||||
cat $run_dir/run.yaml
|
||||
|
||||
nohup uv run llama stack run $run_dir/run.yaml --image-type venv > server.log 2>&1 &
|
||||
|
||||
- name: Wait for Llama Stack server to be ready
|
||||
run: |
|
||||
echo "Waiting for Llama Stack server..."
|
||||
for i in {1..30}; do
|
||||
if curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://localhost:8321/v1/health | grep -q "OK"; then
|
||||
echo "Llama Stack server is up!"
|
||||
if grep -q "Enabling authentication with provider: ${{ matrix.auth-provider }}" server.log; then
|
||||
echo "Llama Stack server is configured to use ${{ matrix.auth-provider }} auth"
|
||||
exit 0
|
||||
else
|
||||
echo "Llama Stack server is not configured to use ${{ matrix.auth-provider }} auth"
|
||||
cat server.log
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
echo "Llama Stack server failed to start"
|
||||
cat server.log
|
||||
exit 1
|
||||
|
||||
- name: Test auth
|
||||
run: |
|
||||
curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers|jq
|
File: .github/workflows_upstream/integration-tests.yml (116 changes, vendored)
@ -1,116 +0,0 @@
name: Integration Tests
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ main ]
|
||||
pull_request:
|
||||
branches: [ main ]
|
||||
paths:
|
||||
- 'llama_stack/**'
|
||||
- 'tests/integration/**'
|
||||
- 'uv.lock'
|
||||
- 'pyproject.toml'
|
||||
- 'requirements.txt'
|
||||
- '.github/workflows/integration-tests.yml' # This workflow
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
test-matrix:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
# Listing tests manually since some of them currently fail
|
||||
# TODO: generate matrix list from tests/integration when fixed
|
||||
test-type: [agents, inference, datasets, inspect, scoring, post_training, providers, tool_runtime]
|
||||
client-type: [library, http]
|
||||
fail-fast: false # we want to run all tests regardless of failure
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
- name: Install dependencies
|
||||
uses: ./.github/actions/setup-runner
|
||||
|
||||
- name: Setup ollama
|
||||
uses: ./.github/actions/setup-ollama
|
||||
|
||||
- name: Build Llama Stack
|
||||
run: |
|
||||
llama stack build --template ollama --image-type venv
|
||||
|
||||
- name: Start Llama Stack server in background
|
||||
if: matrix.client-type == 'http'
|
||||
env:
|
||||
INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
|
||||
run: |
|
||||
LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv &
|
||||
|
||||
- name: Wait for Llama Stack server to be ready
|
||||
if: matrix.client-type == 'http'
|
||||
run: |
|
||||
echo "Waiting for Llama Stack server..."
|
||||
for i in {1..30}; do
|
||||
if curl -s http://localhost:8321/v1/health | grep -q "OK"; then
|
||||
echo "Llama Stack server is up!"
|
||||
exit 0
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
echo "Llama Stack server failed to start"
|
||||
cat server.log
|
||||
exit 1
|
||||
|
||||
- name: Verify Ollama status is OK
|
||||
if: matrix.client-type == 'http'
|
||||
run: |
|
||||
echo "Verifying Ollama status..."
|
||||
ollama_status=$(curl -s -L http://127.0.0.1:8321/v1/providers/ollama|jq --raw-output .health.status)
|
||||
echo "Ollama status: $ollama_status"
|
||||
if [ "$ollama_status" != "OK" ]; then
|
||||
echo "Ollama health check failed"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Check Storage and Memory Available Before Tests
|
||||
if: ${{ always() }}
|
||||
run: |
|
||||
free -h
|
||||
df -h
|
||||
|
||||
- name: Run Integration Tests
|
||||
env:
|
||||
INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
|
||||
run: |
|
||||
if [ "${{ matrix.client-type }}" == "library" ]; then
|
||||
stack_config="ollama"
|
||||
else
|
||||
stack_config="http://localhost:8321"
|
||||
fi
|
||||
uv run pytest -s -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \
|
||||
-k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \
|
||||
--text-model="meta-llama/Llama-3.2-3B-Instruct" \
|
||||
--embedding-model=all-MiniLM-L6-v2
|
||||
|
||||
- name: Check Storage and Memory Available After Tests
|
||||
if: ${{ always() }}
|
||||
run: |
|
||||
free -h
|
||||
df -h
|
||||
|
||||
- name: Write ollama logs to file
|
||||
if: ${{ always() }}
|
||||
run: |
|
||||
sudo journalctl -u ollama.service > ollama.log
|
||||
|
||||
- name: Upload all logs to artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
|
||||
with:
|
||||
name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}
|
||||
path: |
|
||||
*.log
|
||||
retention-days: 1
|
File: .github/workflows_upstream/pre-commit.yml (45 changes, vendored)
@ -1,45 +0,0 @@
name: Pre-commit
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
push:
|
||||
branches: [main]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
pre-commit:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
|
||||
with:
|
||||
python-version: '3.11'
|
||||
cache: pip
|
||||
cache-dependency-path: |
|
||||
**/requirements*.txt
|
||||
.pre-commit-config.yaml
|
||||
|
||||
- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
|
||||
env:
|
||||
SKIP: no-commit-to-branch
|
||||
RUFF_OUTPUT_FORMAT: github
|
||||
|
||||
- name: Verify if there are any diff files after pre-commit
|
||||
run: |
|
||||
git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1)
|
||||
|
||||
- name: Verify if there are any new files after pre-commit
|
||||
run: |
|
||||
unstaged_files=$(git ls-files --others --exclude-standard)
|
||||
if [ -n "$unstaged_files" ]; then
|
||||
echo "There are uncommitted new files, run pre-commit locally and commit again"
|
||||
echo "$unstaged_files"
|
||||
exit 1
|
||||
fi
|
File: .github/workflows_upstream/providers-build.yml (147 changes, vendored)
@ -1,147 +0,0 @@
name: Test Llama Stack Build
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- 'llama_stack/cli/stack/build.py'
|
||||
- 'llama_stack/cli/stack/_build.py'
|
||||
- 'llama_stack/distribution/build.*'
|
||||
- 'llama_stack/distribution/*.sh'
|
||||
- '.github/workflows/providers-build.yml'
|
||||
pull_request:
|
||||
paths:
|
||||
- 'llama_stack/cli/stack/build.py'
|
||||
- 'llama_stack/cli/stack/_build.py'
|
||||
- 'llama_stack/distribution/build.*'
|
||||
- 'llama_stack/distribution/*.sh'
|
||||
- '.github/workflows/providers-build.yml'
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
generate-matrix:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
templates: ${{ steps.set-matrix.outputs.templates }}
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
- name: Generate Template List
|
||||
id: set-matrix
|
||||
run: |
|
||||
templates=$(ls llama_stack/templates/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
|
||||
echo "templates=$templates" >> "$GITHUB_OUTPUT"
|
||||
|
||||
build:
|
||||
needs: generate-matrix
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
template: ${{ fromJson(needs.generate-matrix.outputs.templates) }}
|
||||
image-type: [venv, container]
|
||||
fail-fast: false # We want to run all jobs even if some fail
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
- name: Install dependencies
|
||||
uses: ./.github/actions/setup-runner
|
||||
|
||||
- name: Print build dependencies
|
||||
run: |
|
||||
uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test --print-deps-only
|
||||
|
||||
- name: Run Llama Stack Build
|
||||
run: |
|
||||
# USE_COPY_NOT_MOUNT is set to true since mounting is not supported by docker buildx, we use COPY instead
|
||||
# LLAMA_STACK_DIR is set to the current directory so we are building from the source
|
||||
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test
|
||||
|
||||
- name: Print dependencies in the image
|
||||
if: matrix.image-type == 'venv'
|
||||
run: |
|
||||
uv pip list
|
||||
|
||||
build-single-provider:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
- name: Install dependencies
|
||||
uses: ./.github/actions/setup-runner
|
||||
|
||||
- name: Build a single provider
|
||||
run: |
|
||||
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --image-type venv --image-name test --providers inference=remote::ollama
|
||||
|
||||
build-custom-container-distribution:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
- name: Install dependencies
|
||||
uses: ./.github/actions/setup-runner
|
||||
|
||||
- name: Build a single provider
|
||||
run: |
|
||||
yq -i '.image_type = "container"' llama_stack/templates/starter/build.yaml
|
||||
yq -i '.image_name = "test"' llama_stack/templates/starter/build.yaml
|
||||
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config llama_stack/templates/starter/build.yaml
|
||||
|
||||
- name: Inspect the container image entrypoint
|
||||
run: |
|
||||
IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
|
||||
entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
|
||||
echo "Entrypoint: $entrypoint"
|
||||
if [ "$entrypoint" != "[python -m llama_stack.distribution.server.server --config /app/run.yaml]" ]; then
|
||||
echo "Entrypoint is not correct"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
build-ubi9-container-distribution:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
- name: Install dependencies
|
||||
uses: ./.github/actions/setup-runner
|
||||
|
||||
- name: Pin template to UBI9 base
|
||||
run: |
|
||||
yq -i '
|
||||
.image_type = "container" |
|
||||
.image_name = "ubi9-test" |
|
||||
.distribution_spec.container_image = "registry.access.redhat.com/ubi9:latest"
|
||||
' llama_stack/templates/starter/build.yaml
|
||||
|
||||
- name: Build dev container (UBI9)
|
||||
env:
|
||||
USE_COPY_NOT_MOUNT: "true"
|
||||
LLAMA_STACK_DIR: "."
|
||||
run: |
|
||||
uv run llama stack build --config llama_stack/templates/starter/build.yaml
|
||||
|
||||
- name: Inspect UBI9 image
|
||||
run: |
|
||||
IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
|
||||
entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
|
||||
echo "Entrypoint: $entrypoint"
|
||||
if [ "$entrypoint" != "[python -m llama_stack.distribution.server.server --config /app/run.yaml]" ]; then
|
||||
echo "Entrypoint is not correct"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Checking /etc/os-release in $IMAGE_ID"
|
||||
docker run --rm --entrypoint sh "$IMAGE_ID" -c \
|
||||
'source /etc/os-release && echo "$ID"' \
|
||||
| grep -qE '^(rhel|ubi)$' \
|
||||
|| { echo "Base image is not UBI 9!"; exit 1; }
|
|
@ -1,71 +0,0 @@
|
|||
name: Test External Providers
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ main ]
|
||||
pull_request:
|
||||
branches: [ main ]
|
||||
paths:
|
||||
- 'llama_stack/**'
|
||||
- 'tests/integration/**'
|
||||
- 'uv.lock'
|
||||
- 'pyproject.toml'
|
||||
- 'requirements.txt'
|
||||
- '.github/workflows/test-external-providers.yml' # This workflow
|
||||
|
||||
jobs:
|
||||
test-external-providers:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
image-type: [venv]
|
||||
# We don't do container yet, it's tricky to install a package from the host into the
|
||||
# container and point 'uv pip install' to the correct path...
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
- name: Install dependencies
|
||||
uses: ./.github/actions/setup-runner
|
||||
|
||||
- name: Apply image type to config file
|
||||
run: |
|
||||
yq -i '.image_type = "${{ matrix.image-type }}"' tests/external-provider/llama-stack-provider-ollama/custom-distro.yaml
|
||||
cat tests/external-provider/llama-stack-provider-ollama/custom-distro.yaml
|
||||
|
||||
- name: Setup directory for Ollama custom provider
|
||||
run: |
|
||||
mkdir -p tests/external-provider/llama-stack-provider-ollama/src/
|
||||
cp -a llama_stack/providers/remote/inference/ollama/ tests/external-provider/llama-stack-provider-ollama/src/llama_stack_provider_ollama
|
||||
|
||||
- name: Create provider configuration
|
||||
run: |
|
||||
mkdir -p /home/runner/.llama/providers.d/remote/inference
|
||||
cp tests/external-provider/llama-stack-provider-ollama/custom_ollama.yaml /home/runner/.llama/providers.d/remote/inference/custom_ollama.yaml
|
||||
|
||||
- name: Build distro from config file
|
||||
run: |
|
||||
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external-provider/llama-stack-provider-ollama/custom-distro.yaml
|
||||
|
||||
- name: Start Llama Stack server in background
|
||||
if: ${{ matrix.image-type }} == 'venv'
|
||||
env:
|
||||
INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
|
||||
run: |
|
||||
uv run pip list
|
||||
nohup uv run --active llama stack run tests/external-provider/llama-stack-provider-ollama/run.yaml --image-type ${{ matrix.image-type }} > server.log 2>&1 &
|
||||
|
||||
- name: Wait for Llama Stack server to be ready
|
||||
run: |
|
||||
for i in {1..30}; do
|
||||
if ! grep -q "remote::custom_ollama from /home/runner/.llama/providers.d/remote/inference/custom_ollama.yaml" server.log; then
|
||||
echo "Waiting for Llama Stack server to load the provider..."
|
||||
sleep 1
|
||||
else
|
||||
echo "Provider loaded"
|
||||
exit 0
|
||||
fi
|
||||
done
|
||||
echo "Provider failed to load"
|
||||
cat server.log
|
||||
exit 1
|
File: .gitignore (2 changes, vendored)
@ -6,7 +6,6 @@ dev_requirements.txt
build
.DS_Store
llama_stack/configs/*
.cursor/
xcuserdata/
*.hmap
.DS_Store

@ -24,4 +23,3 @@ venv/
pytest-report.xml
.coverage
.python-version
data
@ -15,18 +15,6 @@ repos:
        args: ['--maxkb=1000']
      - id: end-of-file-fixer
        exclude: '^(.*\.svg)$'
      - id: no-commit-to-branch
      - id: check-yaml
        args: ["--unsafe"]
      - id: detect-private-key
      - id: requirements-txt-fixer
      - id: mixed-line-ending
        args: [--fix=lf] # Forces to replace line ending by LF (line feed)
      - id: check-executables-have-shebangs
      - id: check-json
      - id: check-shebang-scripts-are-executable
      - id: check-symlinks
      - id: check-toml

  - repo: https://github.com/Lucas-C/pre-commit-hooks
    rev: v1.5.4

@ -53,7 +41,7 @@ repos:
          - black==24.3.0

  - repo: https://github.com/astral-sh/uv-pre-commit
    rev: 0.7.8
    rev: 0.6.3
    hooks:
      - id: uv-lock
      - id: uv-export

@ -61,7 +49,6 @@ repos:
          "--frozen",
          "--no-hashes",
          "--no-emit-project",
          "--no-default-groups",
          "--output-file=requirements.txt"
        ]

@ -89,29 +76,24 @@ repos:
      - id: distro-codegen
        name: Distribution Template Codegen
        additional_dependencies:
          - uv==0.7.8
        entry: uv run --group codegen ./scripts/distro_codegen.py
          - uv==0.6.0
        entry: uv run --extra codegen ./scripts/distro_codegen.py
        language: python
        pass_filenames: false
        require_serial: true
        files: ^llama_stack/templates/.*$|^llama_stack/providers/.*/inference/.*/models\.py$

  - repo: local
    hooks:
      - id: openapi-codegen
        name: API Spec Codegen
        additional_dependencies:
          - uv==0.7.8
        entry: sh -c 'uv run ./docs/openapi_generator/run_openapi_generator.sh > /dev/null'
          - uv==0.6.2
        entry: sh -c 'uv run --with ".[dev]" ./docs/openapi_generator/run_openapi_generator.sh > /dev/null'
        language: python
        pass_filenames: false
        require_serial: true
        files: ^llama_stack/apis/|^docs/openapi_generator/
      - id: check-workflows-use-hashes
        name: Check GitHub Actions use SHA-pinned actions
        entry: ./scripts/check-workflows-use-hashes.sh
        language: system
        pass_filenames: false
        require_serial: true
        always_run: true
        files: ^\.github/workflows/.*\.ya?ml$

ci:
  autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks
@ -5,21 +5,28 @@
# Required
version: 2

# Build documentation in the "docs/" directory with Sphinx
sphinx:
  configuration: docs/source/conf.py

# Set the OS, Python version and other tools you might need
build:
  os: ubuntu-22.04
  tools:
    python: "3.12"
  jobs:
    pre_create_environment:
      - asdf plugin add uv
      - asdf install uv latest
      - asdf global uv latest
    create_environment:
      - uv venv "${READTHEDOCS_VIRTUALENV_PATH}"
    install:
      - UV_PROJECT_ENVIRONMENT="${READTHEDOCS_VIRTUALENV_PATH}" uv sync --frozen --group docs
  # You can also specify other tool versions:
  # nodejs: "19"
  # rust: "1.64"
  # golang: "1.19"

# Build documentation in the "docs/" directory with Sphinx
sphinx:
  configuration: docs/source/conf.py

# Optionally build your docs in additional formats such as PDF and ePub
# formats:
#   - pdf
#   - epub

# Optional but recommended, declare the Python requirements required
# to build your documentation
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
python:
  install:
    - requirements: docs/requirements.txt
File: CHANGELOG.md (107 changes)
@ -1,112 +1,5 @@
# Changelog

# v0.2.7
Published on: 2025-05-16T20:38:10Z

## Highlights

This is a small update. But a couple highlights:

* feat: function tools in OpenAI Responses by @bbrowning in https://github.com/meta-llama/llama-stack/pull/2094, getting closer to ready. Streaming is the next missing piece.
* feat: Adding support for customizing chunk context in RAG insertion and querying by @franciscojavierarceo in https://github.com/meta-llama/llama-stack/pull/2134
* feat: scaffolding for Llama Stack UI by @ehhuang in https://github.com/meta-llama/llama-stack/pull/2149, more to come in the coming releases.

---

# v0.2.6
Published on: 2025-05-12T18:06:52Z

---

# v0.2.5
Published on: 2025-05-04T20:16:49Z

---

# v0.2.4
Published on: 2025-04-29T17:26:01Z

## Highlights

* One-liner to install and run Llama Stack yay! by @reluctantfuturist in https://github.com/meta-llama/llama-stack/pull/1383
* support for NVIDIA NeMo datastore by @raspawar in https://github.com/meta-llama/llama-stack/pull/1852
* (yuge!) Kubernetes authentication by @leseb in https://github.com/meta-llama/llama-stack/pull/1778
* (yuge!) OpenAI Responses API by @bbrowning in https://github.com/meta-llama/llama-stack/pull/1989
* add api.llama provider, llama-guard-4 model by @ashwinb in https://github.com/meta-llama/llama-stack/pull/2058

---

# v0.2.3
Published on: 2025-04-25T22:46:21Z

## Highlights

* OpenAI compatible inference endpoints and client-SDK support. `client.chat.completions.create()` now works.
* significant improvements and functionality added to the nVIDIA distribution
* many improvements to the test verification suite.
* new inference providers: Ramalama, IBM WatsonX
* many improvements to the Playground UI

---

# v0.2.2
Published on: 2025-04-13T01:19:49Z

## Main changes

- Bring Your Own Provider (@leseb) - use out-of-tree provider code to execute the distribution server
- OpenAI compatible inference API in progress (@bbrowning)
- Provider verifications (@ehhuang)
- Many updates and fixes to playground
- Several llama4 related fixes

---

# v0.2.1
Published on: 2025-04-05T23:13:00Z

---

# v0.2.0
Published on: 2025-04-05T19:04:29Z

## Llama 4 Support

Checkout more at https://www.llama.com

---

# v0.1.9
Published on: 2025-03-29T00:52:23Z

### Build and Test Agents
* Agents: Entire document context with attachments
* RAG: Documentation with sqlite-vec faiss comparison
* Getting started: Fixes to getting started notebook.

### Agent Evals and Model Customization
* (**New**) Post-training: Add nemo customizer

### Better Engineering
* Moved sqlite-vec to non-blocking calls
* Don't return a payload on file delete

---

# v0.1.8
Published on: 2025-03-24T01:28:50Z
@ -88,7 +88,7 @@ BRAVE_SEARCH_API_KEY=

And then use this dotenv file when running client SDK tests via the following:
```bash
uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py --text-model=meta-llama/Llama-3.1-8B-Instruct
uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py
```

## Pre-commit Hooks

@ -110,9 +110,21 @@ uv run pre-commit run --all-files
> [!CAUTION]
> Before pushing your changes, make sure that the pre-commit hooks have passed successfully.

## Running tests
## Running unit tests

You can find the Llama Stack testing documentation here [here](tests/README.md).
You can run the unit tests by running:

```bash
source .venv/bin/activate
./scripts/unit-tests.sh
```

If you'd like to run for a non-default version of Python (currently 3.10), pass `PYTHON_VERSION` variable as follows:

```
source .venv/bin/activate
PYTHON_VERSION=3.13 ./scripts/unit-tests.sh
```

## Adding a new dependency to the project

@ -125,20 +137,11 @@ uv sync

## Coding Style

* Comments should provide meaningful insights into the code. Avoid filler comments that simply
  describe the next step, as they create unnecessary clutter, same goes for docstrings.
* Prefer comments to clarify surprising behavior and/or relationships between parts of the code
  rather than explain what the next line of code does.
* Catching exceptions, prefer using a specific exception type rather than a broad catch-all like
  `Exception`.
* Comments should provide meaningful insights into the code. Avoid filler comments that simply describe the next step, as they create unnecessary clutter, same goes for docstrings.
* Prefer comments to clarify surprising behavior and/or relationships between parts of the code rather than explain what the next line of code does.
* Catching exceptions, prefer using a specific exception type rather than a broad catch-all like `Exception`.
* Error messages should be prefixed with "Failed to ..."
* 4 spaces for indentation rather than tab
* When using `# noqa` to suppress a style or linter warning, include a comment explaining the
  justification for bypassing the check.
* When using `# type: ignore` to suppress a mypy warning, include a comment explaining the
  justification for bypassing the check.
* Don't use unicode characters in the codebase. ASCII-only is preferred for compatibility or
  readability reasons.
* 4 spaces for indentation rather than tabs

## Common Tasks

@ -167,11 +170,14 @@ If you have made changes to a provider's configuration in any form (introducing
If you are making changes to the documentation at [https://llama-stack.readthedocs.io/en/latest/](https://llama-stack.readthedocs.io/en/latest/), you can use the following command to build the documentation and preview your changes. You will need [Sphinx](https://www.sphinx-doc.org/en/master/) and the readthedocs theme.

```bash
cd docs
uv sync --extra docs

# This rebuilds the documentation pages.
uv run --group docs make -C docs/ html
uv run make html

# This will start a local server (usually at http://127.0.0.1:8000) that automatically rebuilds and refreshes when you make changes to the documentation.
uv run --group docs sphinx-autobuild docs/source docs/build/html --write-all
uv run sphinx-autobuild source build/html --write-all
```

### Update API Documentation

@ -179,7 +185,7 @@ uv run --group docs sphinx-autobuild docs/source docs/build/html --write-all
If you modify or add new API endpoints, update the API documentation accordingly. You can do this by running the following command:

```bash
uv run ./docs/openapi_generator/run_openapi_generator.sh
uv run --with ".[dev]" ./docs/openapi_generator/run_openapi_generator.sh
```

The generated API documentation will be available in `docs/_static/`. Make sure to review the changes before committing.
@ -1,9 +1,8 @@
include pyproject.toml
include llama_stack/templates/dependencies.json
include llama_stack/models/llama/llama3/tokenizer.model
include llama_stack/models/llama/llama4/tokenizer.model
include llama_stack/distribution/*.sh
include llama_stack/cli/scripts/*.sh
include llama_stack/templates/*/*.yaml
include llama_stack/providers/tests/test_cases/inference/*.json
include llama_stack/models/llama/*/*.md
include llama_stack/tests/integration/*.jpg
File: README.md (118 changes)
@ -3,82 +3,11 @@
[](https://pypi.org/project/llama_stack/)
[](https://pypi.org/project/llama-stack/)
[](https://github.com/meta-llama/llama-stack/blob/main/LICENSE)
[](https://discord.gg/llama-stack)
[](https://discord.gg/llama-stack)
[](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml?query=branch%3Amain)
[](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml?query=branch%3Amain)

[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)

### ✨🎉 Llama 4 Support 🎉✨
We released [Version 0.2.0](https://github.com/meta-llama/llama-stack/releases/tag/v0.2.0) with support for the Llama 4 herd of models released by Meta.

<details>

<summary>👋 Click here to see how to run Llama 4 models on Llama Stack </summary>

\
*Note you need 8xH100 GPU-host to run these models*

```bash
pip install -U llama_stack

MODEL="Llama-4-Scout-17B-16E-Instruct"
# get meta url from llama.com
llama model download --source meta --model-id $MODEL --meta-url <META_URL>

# start a llama stack server
INFERENCE_MODEL=meta-llama/$MODEL llama stack build --run --template meta-reference-gpu

# install client to interact with the server
pip install llama-stack-client
```
### CLI
```bash
# Run a chat completion
llama-stack-client --endpoint http://localhost:8321 \
  inference chat-completion \
  --model-id meta-llama/$MODEL \
  --message "write a haiku for meta's llama 4 models"

ChatCompletionResponse(
    completion_message=CompletionMessage(content="Whispers in code born\nLlama's gentle, wise heartbeat\nFuture's soft unfold", role='assistant', stop_reason='end_of_turn', tool_calls=[]),
    logprobs=None,
    metrics=[Metric(metric='prompt_tokens', value=21.0, unit=None), Metric(metric='completion_tokens', value=28.0, unit=None), Metric(metric='total_tokens', value=49.0, unit=None)]
)
```
### Python SDK
```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url=f"http://localhost:8321")

model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
prompt = "Write a haiku about coding"

print(f"User> {prompt}")
response = client.inference.chat_completion(
    model_id=model_id,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ],
)
print(f"Assistant> {response.completion_message.content}")
```
As more providers start supporting Llama 4, you can use them in Llama Stack as well. We are adding to the list. Stay tuned!

</details>

### 🚀 One-Line Installer 🚀

To try Llama Stack locally, run:

```bash
curl -LsSf https://github.com/meta-llama/llama-stack/raw/main/install.sh | sh
```

### Overview
[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb)

Llama Stack standardizes the core building blocks that simplify AI application development. It codifies best practices across the Llama ecosystem. More specifically, it provides

@ -107,29 +36,25 @@ By reducing friction and complexity, Llama Stack empowers developers to focus on
### API Providers
Here is a list of the various API providers and available distributions that can help developers get started easily with Llama Stack.

| **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** | **Post Training** |
|:------------------------:|:----------------------:|:----------:|:-------------:|:----------:|:----------:|:-------------:|:-----------------:|
| Meta Reference | Single Node | ✅ | ✅ | ✅ | ✅ | ✅ | |
| SambaNova | Hosted | | ✅ | | ✅ | | |
| Cerebras | Hosted | | ✅ | | | | |
| Fireworks | Hosted | ✅ | ✅ | ✅ | | | |
| AWS Bedrock | Hosted | | ✅ | | ✅ | | |
| Together | Hosted | ✅ | ✅ | | ✅ | | |
| Groq | Hosted | | ✅ | | | | |
| Ollama | Single Node | | ✅ | | | | |
| TGI | Hosted and Single Node | | ✅ | | | | |
| NVIDIA NIM | Hosted and Single Node | | ✅ | | | | |
| Chroma | Single Node | | | ✅ | | | |
| PG Vector | Single Node | | | ✅ | | | |
| PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | | |
| vLLM | Hosted and Single Node | | ✅ | | | | |
| OpenAI | Hosted | | ✅ | | | | |
| Anthropic | Hosted | | ✅ | | | | |
| Gemini | Hosted | | ✅ | | | | |
| watsonx | Hosted | | ✅ | | | | |
| HuggingFace | Single Node | | | | | | ✅ |
| TorchTune | Single Node | | | | | | ✅ |
| NVIDIA NEMO | Hosted | | | | | | ✅ |
| **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** |
|:------------------------:|:----------------------:|:----------:|:-------------:|:----------:|:----------:|:-------------:|
| Meta Reference | Single Node | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| SambaNova | Hosted | | ✅ | | | |
|
||||
| Cerebras | Hosted | | ✅ | | | |
|
||||
| Fireworks | Hosted | ✅ | ✅ | ✅ | | |
|
||||
| AWS Bedrock | Hosted | | ✅ | | ✅ | |
|
||||
| Together | Hosted | ✅ | ✅ | | ✅ | |
|
||||
| Groq | Hosted | | ✅ | | | |
|
||||
| Ollama | Single Node | | ✅ | | | |
|
||||
| TGI | Hosted and Single Node | | ✅ | | | |
|
||||
| NVIDIA NIM | Hosted and Single Node | | ✅ | | | |
|
||||
| Chroma | Single Node | | | ✅ | | |
|
||||
| PG Vector | Single Node | | | ✅ | | |
|
||||
| PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | |
|
||||
| vLLM | Hosted and Single Node | | ✅ | | | |
|
||||
| OpenAI | Hosted | | ✅ | | | |
|
||||
| Anthropic | Hosted | | ✅ | | | |
|
||||
| Gemini | Hosted | | ✅ | | | |
|
||||
|
||||
|
||||
### Distributions
|
||||
|
@ -139,6 +64,7 @@ A Llama Stack Distribution (or "distro") is a pre-configured bundle of provider
|
|||
| **Distribution** | **Llama Stack Docker** | Start This Distribution |
|
||||
|:---------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------------------------------------:|
|
||||
| Meta Reference | [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-gpu.html) |
|
||||
| Meta Reference Quantized | [llamastack/distribution-meta-reference-quantized-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-quantized-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-quantized-gpu.html) |
|
||||
| SambaNova | [llamastack/distribution-sambanova](https://hub.docker.com/repository/docker/llamastack/distribution-sambanova/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/sambanova.html) |
|
||||
| Cerebras | [llamastack/distribution-cerebras](https://hub.docker.com/repository/docker/llamastack/distribution-cerebras/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/cerebras.html) |
|
||||
| Ollama | [llamastack/distribution-ollama](https://hub.docker.com/repository/docker/llamastack/distribution-ollama/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/ollama.html) |
|
||||
|
|
docs/_static/css/my_theme.css (vendored)
|
@ -16,20 +16,3 @@
|
|||
.hide-title h1 {
|
||||
display: none;
|
||||
}
|
||||
|
||||
h2, h3, h4 {
|
||||
font-weight: normal;
|
||||
}
|
||||
html[data-theme="dark"] .rst-content div[class^="highlight"] {
|
||||
background-color: #0b0b0b;
|
||||
}
|
||||
pre {
|
||||
white-space: pre-wrap !important;
|
||||
word-break: break-all;
|
||||
}
|
||||
|
||||
[data-theme="dark"] .mermaid {
|
||||
background-color: #f4f4f6 !important;
|
||||
border-radius: 6px;
|
||||
padding: 0.5em;
|
||||
}
|
||||
|
|
docs/_static/js/detect_theme.js (vendored)
|
@ -1,32 +0,0 @@
|
|||
document.addEventListener("DOMContentLoaded", function () {
|
||||
const prefersDark = window.matchMedia("(prefers-color-scheme: dark)").matches;
|
||||
const htmlElement = document.documentElement;
|
||||
|
||||
// Check if theme is saved in localStorage
|
||||
const savedTheme = localStorage.getItem("sphinx-rtd-theme");
|
||||
|
||||
if (savedTheme) {
|
||||
// Use the saved theme preference
|
||||
htmlElement.setAttribute("data-theme", savedTheme);
|
||||
document.body.classList.toggle("dark", savedTheme === "dark");
|
||||
} else {
|
||||
// Fall back to system preference
|
||||
const theme = prefersDark ? "dark" : "light";
|
||||
htmlElement.setAttribute("data-theme", theme);
|
||||
document.body.classList.toggle("dark", theme === "dark");
|
||||
// Save initial preference
|
||||
localStorage.setItem("sphinx-rtd-theme", theme);
|
||||
}
|
||||
|
||||
// Listen for theme changes from the existing toggle
|
||||
const observer = new MutationObserver(function(mutations) {
|
||||
mutations.forEach(function(mutation) {
|
||||
if (mutation.attributeName === "data-theme") {
|
||||
const currentTheme = htmlElement.getAttribute("data-theme");
|
||||
localStorage.setItem("sphinx-rtd-theme", currentTheme);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
observer.observe(htmlElement, { attributes: true });
|
||||
});
|
docs/_static/llama-stack-spec.html (vendored): diff suppressed because it is too large
docs/_static/llama-stack-spec.yaml (vendored): diff suppressed because it is too large
(several additional file diffs suppressed because they are too large or contain lines that are too long)
|
@ -840,6 +840,7 @@
|
|||
" \"memory_optimizations.rst\",\n",
|
||||
" \"chat.rst\",\n",
|
||||
" \"llama3.rst\",\n",
|
||||
" \"datasets.rst\",\n",
|
||||
" \"qat_finetune.rst\",\n",
|
||||
" \"lora_finetune.rst\",\n",
|
||||
"]\n",
|
||||
|
@ -1585,6 +1586,7 @@
|
|||
" \"memory_optimizations.rst\",\n",
|
||||
" \"chat.rst\",\n",
|
||||
" \"llama3.rst\",\n",
|
||||
" \"datasets.rst\",\n",
|
||||
" \"qat_finetune.rst\",\n",
|
||||
" \"lora_finetune.rst\",\n",
|
||||
"]\n",
|
||||
|
|
|
@ -44,14 +44,13 @@ def main(output_dir: str):
|
|||
if return_type_errors:
|
||||
print("\nAPI Method Return Type Validation Errors:\n")
|
||||
for error in return_type_errors:
|
||||
print(error, file=sys.stderr)
|
||||
print(error)
|
||||
sys.exit(1)
|
||||
now = str(datetime.now())
|
||||
print(
|
||||
"Converting the spec to YAML (openapi.yaml) and HTML (openapi.html) at " + now
|
||||
)
|
||||
print("")
|
||||
|
||||
spec = Specification(
|
||||
LlamaStack,
|
||||
Options(
|
||||
|
|
|
@ -6,7 +6,6 @@
|
|||
|
||||
import hashlib
|
||||
import ipaddress
|
||||
import types
|
||||
import typing
|
||||
from dataclasses import make_dataclass
|
||||
from typing import Any, Dict, Set, Union
|
||||
|
@ -180,7 +179,7 @@ class ContentBuilder:
|
|||
"Creates the content subtree for a request or response."
|
||||
|
||||
def is_iterator_type(t):
|
||||
return "StreamChunk" in str(t) or "OpenAIResponseObjectStream" in str(t)
|
||||
return "StreamChunk" in str(t)
|
||||
|
||||
def get_media_type(t):
|
||||
if is_generic_list(t):
|
||||
|
@ -190,7 +189,7 @@ class ContentBuilder:
|
|||
else:
|
||||
return "application/json"
|
||||
|
||||
if typing.get_origin(payload_type) in (typing.Union, types.UnionType):
|
||||
if typing.get_origin(payload_type) is typing.Union:
|
||||
media_types = []
|
||||
item_types = []
|
||||
for x in typing.get_args(payload_type):
|
||||
|
@ -520,7 +519,7 @@ class Generator:
|
|||
)
|
||||
|
||||
def _build_extra_tag_groups(
|
||||
self, extra_types: Dict[str, Dict[str, type]]
|
||||
self, extra_types: Dict[str, List[type]]
|
||||
) -> Dict[str, List[Tag]]:
|
||||
"""
|
||||
Creates a dictionary of tag group captions as keys, and tag lists as values.
|
||||
|
@ -533,8 +532,9 @@ class Generator:
|
|||
for category_name, category_items in extra_types.items():
|
||||
tag_list: List[Tag] = []
|
||||
|
||||
for name, extra_type in category_items.items():
|
||||
schema = self.schema_builder.classdef_to_schema(extra_type)
|
||||
for extra_type in category_items:
|
||||
name = python_type_to_name(extra_type)
|
||||
schema = self.schema_builder.classdef_to_named_schema(name, extra_type)
|
||||
tag_list.append(self._build_type_tag(name, schema))
|
||||
|
||||
if tag_list:
|
||||
|
@ -759,7 +759,7 @@ class Generator:
|
|||
)
|
||||
|
||||
return Operation(
|
||||
tags=[getattr(op.defining_class, "API_NAMESPACE", op.defining_class.__name__)],
|
||||
tags=[op.defining_class.__name__],
|
||||
summary=None,
|
||||
# summary=doc_string.short_description,
|
||||
description=description,
|
||||
|
@ -805,8 +805,6 @@ class Generator:
|
|||
operation_tags: List[Tag] = []
|
||||
for cls in endpoint_classes:
|
||||
doc_string = parse_type(cls)
|
||||
if hasattr(cls, "API_NAMESPACE") and cls.API_NAMESPACE != cls.__name__:
|
||||
continue
|
||||
operation_tags.append(
|
||||
Tag(
|
||||
name=cls.__name__,
|
||||
|
@ -865,7 +863,7 @@ class Generator:
|
|||
for caption, extra_tag_group in extra_tag_groups.items():
|
||||
tag_groups.append(
|
||||
TagGroup(
|
||||
name=caption,
|
||||
name=self.options.map(caption),
|
||||
tags=sorted(tag.name for tag in extra_tag_group),
|
||||
)
|
||||
)
|
||||
|
|
|
@ -132,18 +132,7 @@ def _validate_api_method_return_type(method) -> str | None:
|
|||
|
||||
return_type = hints['return']
|
||||
if is_optional_type(return_type):
|
||||
return "returns Optional type where a return value is mandatory"
|
||||
|
||||
|
||||
def _validate_api_method_doesnt_return_list(method) -> str | None:
|
||||
hints = get_type_hints(method)
|
||||
|
||||
if 'return' not in hints:
|
||||
return "has no return type annotation"
|
||||
|
||||
return_type = hints['return']
|
||||
if get_origin(return_type) is list:
|
||||
return "returns a list where a PaginatedResponse or List*Response object is expected"
|
||||
return "returns Optional type"
|
||||
|
||||
|
||||
def _validate_api_delete_method_returns_none(method) -> str | None:
|
||||
|
@ -154,7 +143,7 @@ def _validate_api_delete_method_returns_none(method) -> str | None:
|
|||
|
||||
return_type = hints['return']
|
||||
if return_type is not None and return_type is not type(None):
|
||||
return "does not return None where None is mandatory"
|
||||
return "does not return None"
|
||||
|
||||
|
||||
def _validate_list_parameters_contain_data(method) -> str | None:
|
||||
|
@ -171,67 +160,16 @@ def _validate_list_parameters_contain_data(method) -> str | None:
|
|||
return
|
||||
|
||||
if 'data' not in return_type.model_fields:
|
||||
return "does not have a mandatory data attribute containing the list of objects"
|
||||
return "does not have data attribute"
|
||||
|
||||
|
||||
def _validate_has_ellipsis(method) -> str | None:
|
||||
source = inspect.getsource(method)
|
||||
if "..." not in source and not "NotImplementedError" in source:
|
||||
return "does not contain ellipsis (...) in its implementation"
|
||||
|
||||
def _validate_has_return_in_docstring(method) -> str | None:
|
||||
source = inspect.getsource(method)
|
||||
return_type = method.__annotations__.get('return')
|
||||
if return_type is not None and return_type != type(None) and ":returns:" not in source:
|
||||
return "does not have a ':returns:' in its docstring"
|
||||
|
||||
def _validate_has_params_in_docstring(method) -> str | None:
|
||||
source = inspect.getsource(method)
|
||||
sig = inspect.signature(method)
|
||||
# Only check if the method has more than one parameter
|
||||
if len(sig.parameters) > 1 and ":param" not in source:
|
||||
return "does not have a ':param' in its docstring"
|
||||
|
||||
def _validate_has_no_return_none_in_docstring(method) -> str | None:
|
||||
source = inspect.getsource(method)
|
||||
return_type = method.__annotations__.get('return')
|
||||
if return_type is None and ":returns: None" in source:
|
||||
return "has a ':returns: None' in its docstring which is redundant for None-returning functions"
|
||||
|
||||
def _validate_docstring_lines_end_with_dot(method) -> str | None:
|
||||
docstring = inspect.getdoc(method)
|
||||
if docstring is None:
|
||||
return None
|
||||
|
||||
lines = docstring.split('\n')
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
if line and not any(line.endswith(char) for char in '.:{}[]()",'):
|
||||
return f"docstring line '{line}' does not end with a valid character: . : {{ }} [ ] ( ) , \""
|
||||
|
||||
_VALIDATORS = {
|
||||
"GET": [
|
||||
_validate_api_method_return_type,
|
||||
_validate_list_parameters_contain_data,
|
||||
_validate_api_method_doesnt_return_list,
|
||||
_validate_has_ellipsis,
|
||||
_validate_has_return_in_docstring,
|
||||
_validate_has_params_in_docstring,
|
||||
_validate_docstring_lines_end_with_dot,
|
||||
],
|
||||
"DELETE": [
|
||||
_validate_api_delete_method_returns_none,
|
||||
_validate_has_ellipsis,
|
||||
_validate_has_return_in_docstring,
|
||||
_validate_has_params_in_docstring,
|
||||
_validate_has_no_return_none_in_docstring
|
||||
],
|
||||
"POST": [
|
||||
_validate_has_ellipsis,
|
||||
_validate_has_return_in_docstring,
|
||||
_validate_has_params_in_docstring,
|
||||
_validate_has_no_return_none_in_docstring,
|
||||
_validate_docstring_lines_end_with_dot,
|
||||
],
|
||||
}
|
||||
|
||||
|
|
|
@ -2,14 +2,6 @@
|
|||
|
||||
Here's a collection of comprehensive guides, examples, and resources for building AI applications with Llama Stack. For the complete documentation, visit our [ReadTheDocs page](https://llama-stack.readthedocs.io/en/latest/index.html).
|
||||
|
||||
## Render locally
|
||||
|
||||
From the llama-stack root directory, run the following command to render the docs locally:
|
||||
```bash
|
||||
uv run --group docs sphinx-autobuild docs/source docs/build/html --write-all
|
||||
```
|
||||
You can open up the docs in your browser at http://localhost:8000
|
||||
|
||||
## Content
|
||||
|
||||
Try out Llama Stack's capabilities through our detailed Jupyter notebooks:
|
||||
|
|
docs/requirements.txt (new file)
|
@ -0,0 +1,14 @@
|
|||
sphinx==8.1.3
|
||||
myst-parser
|
||||
linkify
|
||||
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
|
||||
sphinx-rtd-theme>=1.0.0
|
||||
sphinx-pdj-theme
|
||||
sphinx-copybutton
|
||||
sphinx-tabs
|
||||
sphinx-design
|
||||
sphinxcontrib-openapi
|
||||
sphinxcontrib-redoc
|
||||
sphinxcontrib-mermaid
|
||||
sphinxcontrib-video
|
||||
tomli
|
|
@ -1,9 +1,6 @@
|
|||
# Agents
|
||||
# Llama Stack Agent Framework
|
||||
|
||||
An Agent in Llama Stack is a powerful abstraction that allows you to build complex AI applications.
|
||||
|
||||
The Llama Stack agent framework is built on a modular architecture that allows for flexible and powerful AI
|
||||
applications. This document explains the key components and how they work together.
|
||||
The Llama Stack agent framework is built on a modular architecture that allows for flexible and powerful AI applications. This document explains the key components and how they work together.
|
||||
|
||||
## Core Concepts
|
||||
|
||||
|
|
|
@ -1,10 +1,6 @@
|
|||
## Agent Execution Loop
|
||||
|
||||
Agents are the heart of Llama Stack applications. They combine inference, memory, safety, and tool usage into coherent
|
||||
workflows. At its core, an agent follows a sophisticated execution loop that enables multi-step reasoning, tool usage,
|
||||
and safety checks.
|
||||
|
||||
### Steps in the Agent Workflow
|
||||
Agents are the heart of complex AI applications. They combine inference, memory, safety, and tool usage into coherent workflows. At its core, an agent follows a sophisticated execution loop that enables multi-step reasoning, tool usage, and safety checks.
|
||||
|
||||
Each agent turn follows these key steps:
|
||||
|
||||
|
@ -68,10 +64,7 @@ sequenceDiagram
|
|||
S->>U: 5. Final Response
|
||||
```
|
||||
|
||||
Each step in this process can be monitored and controlled through configurations.
|
||||
|
||||
### Agent Execution Loop Example
|
||||
Here's an example that demonstrates monitoring the agent's execution:
|
||||
Each step in this process can be monitored and controlled through configurations. Here's an example that demonstrates monitoring the agent's execution:
|
||||
|
||||
```python
|
||||
from llama_stack_client import LlamaStackClient, Agent, AgentEventLogger
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Building AI Applications (Examples)
|
||||
# Building AI Applications
|
||||
|
||||
Llama Stack provides all the building blocks needed to create sophisticated AI applications.
|
||||
|
||||
|
@ -8,9 +8,9 @@ The best way to get started is to look at this notebook which walks through the
|
|||
|
||||
Here are some key topics that will help you build effective agents:
|
||||
|
||||
- **[RAG (Retrieval-Augmented Generation)](rag)**: Learn how to enhance your agents with external knowledge through retrieval mechanisms.
|
||||
- **[Agent](agent)**: Understand the components and design patterns of the Llama Stack agent framework.
|
||||
- **[Agent Execution Loop](agent_execution_loop)**: Understand how agents process information, make decisions, and execute actions in a continuous loop.
|
||||
- **[RAG (Retrieval-Augmented Generation)](rag)**: Learn how to enhance your agents with external knowledge through retrieval mechanisms.
|
||||
- **[Tools](tools)**: Extend your agents' capabilities by integrating with external tools and APIs.
|
||||
- **[Evals](evals)**: Evaluate your agents' effectiveness and identify areas for improvement.
|
||||
- **[Telemetry](telemetry)**: Monitor and analyze your agents' performance and behavior.
|
||||
|
@ -20,11 +20,12 @@ Here are some key topics that will help you build effective agents:
|
|||
:hidden:
|
||||
:maxdepth: 1
|
||||
|
||||
rag
|
||||
agent
|
||||
agent_execution_loop
|
||||
rag
|
||||
tools
|
||||
evals
|
||||
telemetry
|
||||
evals
|
||||
advanced_agent_patterns
|
||||
safety
|
||||
```
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
## Retrieval Augmented Generation (RAG)
|
||||
## Using Retrieval Augmented Generation (RAG)
|
||||
|
||||
RAG enables your applications to reference and recall information from previous interactions or external documents.
|
||||
|
||||
Llama Stack organizes the APIs that enable RAG into three layers:
|
||||
1. The lowermost APIs deal with raw storage and retrieval. These include Vector IO, KeyValue IO (coming soon), and Relational IO (also coming soon).
|
||||
2. The next is the "Rag Tool", a first-class tool as part of the [Tools API](tools.md) that allows you to ingest documents (from URLs, files, etc) with various chunking strategies and query them smartly.
|
||||
3. Finally, it all comes together with the top-level ["Agents" API](agent.md) that allows you to create agents that can use the tools to answer questions, perform tasks, and more.
|
||||
- the lowermost APIs deal with raw storage and retrieval. These include Vector IO, KeyValue IO (coming soon) and Relational IO (also coming soon.)
|
||||
- next is the "Rag Tool", a first-class tool as part of the Tools API that allows you to ingest documents (from URLs, files, etc) with various chunking strategies and query them smartly.
|
||||
- finally, it all comes together with the top-level "Agents" API that allows you to create agents that can use the tools to answer questions, perform tasks, and more.
|
||||
|
||||
<img src="rag.png" alt="RAG System" width="50%">
|
||||
|
||||
|
@ -17,19 +17,14 @@ We may add more storage types like Graph IO in the future.
|
|||
|
||||
### Setting up Vector DBs
|
||||
|
||||
For this guide, we will use [Ollama](https://ollama.com/) as the inference provider.
|
||||
Ollama is an LLM runtime that allows you to run Llama models locally.
|
||||
|
||||
Here's how to set up a vector database for RAG:
|
||||
|
||||
```python
|
||||
# Create http client
|
||||
import os
|
||||
from llama_stack_client import LlamaStackClient
|
||||
|
||||
client = LlamaStackClient(base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}")
|
||||
|
||||
|
||||
# Register a vector db
|
||||
vector_db_id = "my_documents"
|
||||
response = client.vector_dbs.register(
|
||||
|
@ -38,53 +33,17 @@ response = client.vector_dbs.register(
|
|||
embedding_dimension=384,
|
||||
provider_id="faiss",
|
||||
)
|
||||
```
|
||||
|
||||
### Ingesting Documents
|
||||
You can ingest documents into the vector database using two methods: directly inserting pre-chunked
|
||||
documents or using the RAG Tool.
|
||||
```python
|
||||
# You can insert a pre-chunked document directly into the vector db
|
||||
chunks = [
|
||||
{
|
||||
"document_id": "doc1",
|
||||
"content": "Your document text here",
|
||||
"mime_type": "text/plain",
|
||||
"metadata": {
|
||||
"document_id": "doc1",
|
||||
"author": "Jane Doe",
|
||||
},
|
||||
},
|
||||
]
|
||||
client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks)
|
||||
```
|
||||
|
||||
#### Using Precomputed Embeddings
|
||||
If you decide to precompute embeddings for your documents, you can insert them directly into the vector database by
|
||||
including the embedding vectors in the chunk data. This is useful if you have a separate embedding service or if you
|
||||
want to customize the ingestion process.
|
||||
```python
|
||||
chunks_with_embeddings = [
|
||||
{
|
||||
"content": "First chunk of text",
|
||||
"mime_type": "text/plain",
|
||||
"embedding": [0.1, 0.2, 0.3, ...], # Your precomputed embedding vector
|
||||
"metadata": {"document_id": "doc1", "section": "introduction"},
|
||||
},
|
||||
{
|
||||
"content": "Second chunk of text",
|
||||
"mime_type": "text/plain",
|
||||
"embedding": [0.2, 0.3, 0.4, ...], # Your precomputed embedding vector
|
||||
"metadata": {"document_id": "doc1", "section": "methodology"},
|
||||
},
|
||||
]
|
||||
client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks_with_embeddings)
|
||||
```
|
||||
When providing precomputed embeddings, ensure the embedding dimension matches the embedding_dimension specified when
|
||||
registering the vector database.
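As a quick sanity check, here is a sketch of producing matching embeddings with `sentence-transformers`; the library and model name are assumptions for illustration, and you should use whichever embedding model corresponds to the one registered above.
```python
# Sketch: produce 384-dimensional embeddings to match embedding_dimension=384 above.
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("all-MiniLM-L6-v2")  # outputs 384-dimensional vectors
embedding = embedder.encode("First chunk of text").tolist()
assert len(embedding) == 384  # must equal the embedding_dimension used at registration
```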
|
||||
|
||||
### Retrieval
|
||||
You can query the vector database to retrieve documents based on their embeddings.
|
||||
```python
|
||||
# You can then query for these chunks
|
||||
chunks_response = client.vector_io.query(
|
||||
vector_db_id=vector_db_id, query="What do you know about..."
|
||||
|
@ -93,9 +52,7 @@ chunks_response = client.vector_io.query(
|
|||
|
||||
### Using the RAG Tool
|
||||
|
||||
A better way to ingest documents is to use the RAG Tool. This tool allows you to ingest documents from URLs, files, etc.
|
||||
and automatically chunks them into smaller pieces. More examples for how to format a RAGDocument can be found in the
|
||||
[appendix](#more-ragdocument-examples).
|
||||
A better way to ingest documents is to use the RAG Tool. This tool allows you to ingest documents from URLs, files, etc. and automatically chunks them into smaller pieces.
|
||||
|
||||
```python
|
||||
from llama_stack_client import RAGDocument
|
||||
|
@ -124,17 +81,6 @@ results = client.tool_runtime.rag_tool.query(
|
|||
)
|
||||
```
|
||||
|
||||
You can configure how the RAG tool adds metadata to the context if you find it useful for your application. Simply add:
|
||||
```python
|
||||
# Query documents
|
||||
results = client.tool_runtime.rag_tool.query(
|
||||
vector_db_ids=[vector_db_id],
|
||||
content="What do you know about...",
|
||||
query_config={
|
||||
"chunk_template": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n",
|
||||
},
|
||||
)
|
||||
```
|
||||
### Building RAG-Enhanced Agents
|
||||
|
||||
One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example:
|
||||
|
@ -152,12 +98,6 @@ agent = Agent(
|
|||
"name": "builtin::rag/knowledge_search",
|
||||
"args": {
|
||||
"vector_db_ids": [vector_db_id],
|
||||
# Defaults
|
||||
"query_config": {
|
||||
"chunk_size_in_tokens": 512,
|
||||
"chunk_overlap_in_tokens": 0,
|
||||
"chunk_template": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n",
|
||||
},
|
||||
},
|
||||
}
|
||||
],
|
||||
|
@ -222,38 +162,3 @@ for vector_db_id in client.vector_dbs.list():
|
|||
print(f"Unregistering vector database: {vector_db_id.identifier}")
|
||||
client.vector_dbs.unregister(vector_db_id=vector_db_id.identifier)
|
||||
```
|
||||
|
||||
### Appendix
|
||||
|
||||
#### More RAGDocument Examples
|
||||
```python
|
||||
from llama_stack_client import RAGDocument
|
||||
import base64
import requests  # needed for the requests.get() call below
|
||||
|
||||
RAGDocument(document_id="num-0", content={"uri": "file://path/to/file"})
|
||||
RAGDocument(document_id="num-1", content="plain text")
|
||||
RAGDocument(
|
||||
document_id="num-2",
|
||||
content={
|
||||
"type": "text",
|
||||
"text": "plain text input",
|
||||
}, # for inputs that should be treated as text explicitly
|
||||
)
|
||||
RAGDocument(
|
||||
document_id="num-3",
|
||||
content={
|
||||
"type": "image",
|
||||
"image": {"url": {"uri": "https://mywebsite.com/image.jpg"}},
|
||||
},
|
||||
)
|
||||
B64_ENCODED_IMAGE = base64.b64encode(
|
||||
requests.get(
|
||||
"https://raw.githubusercontent.com/meta-llama/llama-stack/refs/heads/main/docs/_static/llama-stack.png"
|
||||
).content
|
||||
)
|
||||
RAGDocument(
|
||||
document_id="num-4",
|
||||
content={"type": "image", "image": {"data": B64_ENCODED_IMAGE}},
|
||||
)
|
||||
```
|
||||
For more strongly typed interaction, use the typed dicts found [here](https://github.com/meta-llama/llama-stack-client-python/blob/38cd91c9e396f2be0bec1ee96a19771582ba6f17/src/llama_stack_client/types/shared_params/document.py).
|
||||
|
|
|
@ -45,16 +45,14 @@ Here's an example that sends telemetry signals to all three sink types. Your con
|
|||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
sinks: ['console', 'sqlite', 'otel_trace', 'otel_metric']
|
||||
otel_trace_endpoint: "http://localhost:4318/v1/traces"
|
||||
otel_metric_endpoint: "http://localhost:4318/v1/metrics"
|
||||
sinks: ['console', 'sqlite', 'otel']
|
||||
otel_endpoint: "http://localhost:4318/v1/traces"
|
||||
sqlite_db_path: "/path/to/telemetry.db"
|
||||
```
|
||||
|
||||
### Jaeger to visualize traces
|
||||
|
||||
The `otel` sink works with any service compatible with the OpenTelemetry collector; traces and metrics use two separate endpoints.
|
||||
Let's use Jaeger to visualize this data.
|
||||
The `otel` sink works with any service compatible with the OpenTelemetry collector. Let's use Jaeger to visualize this data.
|
||||
|
||||
Start a Jaeger instance with the OTLP HTTP endpoint at 4318 and the Jaeger UI at 16686 using the following command:
|
||||
|
||||
|
|
|
@ -41,9 +41,30 @@ client.toolgroups.register(
|
|||
|
||||
The tool requires an API key which can be provided either in the configuration or through the request header `X-LlamaStack-Provider-Data`. The format of the header is `{"<provider_name>_api_key": <your api key>}`.
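For example, here is a minimal sketch of the client-side option, using the same `provider_data` mechanism shown in the web search agent example later in this section (the key value is a placeholder):
```python
from llama_stack_client import LlamaStackClient

# provider_data is sent to the server via the X-LlamaStack-Provider-Data header
client = LlamaStackClient(
    base_url="http://localhost:8321",
    provider_data={"tavily_search_api_key": "<your api key>"},
)
```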
|
||||
|
||||
> **NOTE:** When using Tavily Search or Bing Search, the inference output will still display "Brave Search." This is because Llama models have been trained with Brave Search as a built-in tool; Tavily and Bing are simply used in lieu of Brave Search.
|
||||
|
||||
|
||||
#### Code Interpreter
|
||||
|
||||
The Code Interpreter allows execution of Python code within a controlled environment.
|
||||
|
||||
```python
|
||||
# Register Code Interpreter tool group
|
||||
client.toolgroups.register(
|
||||
toolgroup_id="builtin::code_interpreter", provider_id="code_interpreter"
|
||||
)
|
||||
```
|
||||
|
||||
Features:
|
||||
- Secure execution environment using `bwrap` sandboxing
|
||||
- Matplotlib support for generating plots
|
||||
- Disabled dangerous system operations
|
||||
- Configurable execution timeouts
|
||||
|
||||
> ⚠️ Important: The code interpreter tool can operate in a controlled environment locally or on Podman containers. To ensure proper functionality in containerized environments:
|
||||
> - The container requires privileged access (e.g., --privileged).
|
||||
> - Users without sufficient permissions may encounter permission errors. (`bwrap: Can't mount devpts on /newroot/dev/pts: Permission denied`)
|
||||
> - 🔒 Security Warning: Privileged mode grants elevated access and bypasses security restrictions. Use only in local, isolated, or controlled environments.
|
||||
|
||||
#### WolframAlpha
|
||||
|
||||
The WolframAlpha tool provides access to computational knowledge through the WolframAlpha API.
|
||||
|
@ -81,7 +102,7 @@ Features:
|
|||
- Context retrieval with token limits
|
||||
|
||||
|
||||
> **Note:** By default, llama stack run.yaml defines toolgroups for web search, wolfram alpha and rag, that are provided by tavily-search, wolfram-alpha and rag providers.
|
||||
> **Note:** By default, llama stack run.yaml defines toolgroups for web search, code interpreter and rag, that are provided by tavily-search, code-interpreter and rag providers.
|
||||
|
||||
## Model Context Protocol (MCP) Tools
|
||||
|
||||
|
@ -165,69 +186,31 @@ all_tools = client.tools.list_tools()
|
|||
group_tools = client.tools.list_tools(toolgroup_id="search_tools")
|
||||
```
|
||||
|
||||
## Simple Example 2: Using an Agent with the Web Search Tool
|
||||
1. Start by registering a Tavily API key at [Tavily](https://tavily.com/).
|
||||
2. [Optional] Provide the API key directly to the Llama Stack server
|
||||
```bash
|
||||
export TAVILY_SEARCH_API_KEY="your key"
|
||||
```
|
||||
```bash
|
||||
--env TAVILY_SEARCH_API_KEY=${TAVILY_SEARCH_API_KEY}
|
||||
```
|
||||
3. Run the following script.
|
||||
## Simple Example: Using an Agent with the Code-Interpreter Tool
|
||||
|
||||
```python
|
||||
from llama_stack_client.lib.agents.agent import Agent
|
||||
from llama_stack_client.types.agent_create_params import AgentConfig
|
||||
from llama_stack_client.lib.agents.event_logger import EventLogger
|
||||
from llama_stack_client import LlamaStackClient
|
||||
|
||||
client = LlamaStackClient(
|
||||
base_url=f"http://localhost:8321",
|
||||
provider_data={
|
||||
"tavily_search_api_key": "your_TAVILY_SEARCH_API_KEY"
|
||||
}, # Set this from the client side. No need to provide it if it has already been configured on the Llama Stack server.
|
||||
)
|
||||
from llama_stack_client import Agent
|
||||
|
||||
# Instantiate the AI agent with the given configuration
|
||||
agent = Agent(
|
||||
client,
|
||||
name="code-interpreter",
|
||||
description="A code interpreter agent for executing Python code snippets",
|
||||
instructions="""
|
||||
You are a highly reliable, concise, and precise assistant.
|
||||
Always show the generated code, never generate your own code, and never anticipate results.
|
||||
""",
|
||||
model="meta-llama/Llama-3.2-3B-Instruct",
|
||||
instructions=(
|
||||
"You are a web search assistant, must use websearch tool to look up the most current and precise information available. "
|
||||
),
|
||||
tools=["builtin::websearch"],
|
||||
tools=["builtin::code_interpreter"],
|
||||
max_infer_iters=5,
|
||||
)
|
||||
|
||||
session_id = agent.create_session("websearch-session")
|
||||
# Start a session
|
||||
session_id = agent.create_session("tool_session")
|
||||
|
||||
# Send a query to the AI agent for code execution
|
||||
response = agent.create_turn(
|
||||
messages=[
|
||||
{"role": "user", "content": "How did the USA perform in the last Olympics?"}
|
||||
],
|
||||
messages=[{"role": "user", "content": "Run this code: print(3 ** 4 - 5 * 2)"}],
|
||||
session_id=session_id,
|
||||
)
|
||||
for log in EventLogger().log(response):
|
||||
log.print()
|
||||
```
|
||||
|
||||
## Simple Example3: Using an Agent with the WolframAlpha Tool
|
||||
1. Start by registering for a WolframAlpha API key at [WolframAlpha Developer Portal](https://developer.wolframalpha.com/access).
|
||||
2. Provide the API key either when starting the Llama Stack server:
|
||||
```bash
|
||||
--env WOLFRAM_ALPHA_API_KEY=${WOLFRAM_ALPHA_API_KEY}
|
||||
```
|
||||
or from the client side:
|
||||
```python
|
||||
client = LlamaStackClient(
|
||||
base_url="http://localhost:8321",
|
||||
provider_data={"wolfram_alpha_api_key": wolfram_api_key},
|
||||
)
|
||||
```
|
||||
3. Configure the tools in the Agent by setting `tools=["builtin::wolfram_alpha"]`.
|
||||
4. Example user query:
|
||||
```python
|
||||
response = agent.create_turn(
|
||||
messages=[{"role": "user", "content": "Solve x^2 + 2x + 1 = 0 using WolframAlpha"}],
|
||||
session_id=session_id,
|
||||
)
|
||||
```
|
||||
```
|
||||
|
|
|
@ -12,31 +12,23 @@
|
|||
# -- Project information -----------------------------------------------------
|
||||
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
|
||||
|
||||
import json
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
from docutils import nodes
|
||||
from pathlib import Path
|
||||
import requests
|
||||
import json
|
||||
|
||||
# Read version from pyproject.toml
|
||||
with Path(__file__).parent.parent.parent.joinpath("pyproject.toml").open("rb") as f:
|
||||
pypi_url = "https://pypi.org/pypi/llama-stack/json"
|
||||
headers = {
|
||||
'User-Agent': 'pip/23.0.1 (python 3.11)', # Mimic pip's user agent
|
||||
'Accept': 'application/json'
|
||||
}
|
||||
version_tag = json.loads(requests.get(pypi_url, headers=headers).text)["info"]["version"]
|
||||
version_tag = json.loads(requests.get(pypi_url).text)["info"]["version"]
|
||||
print(f"{version_tag=}")
|
||||
|
||||
# generate the full link including text and url here
|
||||
llama_stack_version_url = (
|
||||
f"https://github.com/meta-llama/llama-stack/releases/tag/v{version_tag}"
|
||||
)
|
||||
llama_stack_version_url = f"https://github.com/meta-llama/llama-stack/releases/tag/v{version_tag}"
|
||||
llama_stack_version_link = f"<a href='{llama_stack_version_url}'>release notes</a>"
|
||||
|
||||
project = "llama-stack"
|
||||
copyright = f"{datetime.now().year}, Meta"
|
||||
copyright = "2025, Meta"
|
||||
author = "Meta"
|
||||
|
||||
# -- General configuration ---------------------------------------------------
|
||||
|
@ -44,11 +36,10 @@ author = "Meta"
|
|||
|
||||
extensions = [
|
||||
"myst_parser",
|
||||
"sphinx_copybutton",
|
||||
"sphinx_design",
|
||||
"sphinx_rtd_theme",
|
||||
"sphinx_rtd_dark_mode",
|
||||
"sphinx_copybutton",
|
||||
"sphinx_tabs.tabs",
|
||||
"sphinx_design",
|
||||
"sphinxcontrib.redoc",
|
||||
"sphinxcontrib.mermaid",
|
||||
"sphinxcontrib.video",
|
||||
|
@ -57,6 +48,14 @@ myst_enable_extensions = ["colon_fence"]
|
|||
|
||||
html_theme = "sphinx_rtd_theme"
|
||||
html_use_relative_paths = True
|
||||
|
||||
# html_theme = "sphinx_pdj_theme"
|
||||
# html_theme_path = [sphinx_pdj_theme.get_html_theme_path()]
|
||||
|
||||
# html_theme = "pytorch_sphinx_theme"
|
||||
# html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()]
|
||||
|
||||
|
||||
templates_path = ["_templates"]
|
||||
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
|
||||
|
||||
|
@ -84,7 +83,7 @@ myst_substitutions = {
|
|||
"llama_stack_version_link": llama_stack_version_link,
|
||||
}
|
||||
|
||||
suppress_warnings = ["myst.header"]
|
||||
suppress_warnings = ['myst.header']
|
||||
|
||||
# Copy button settings
|
||||
copybutton_prompt_text = "$ " # for bash prompts
|
||||
|
@ -104,23 +103,15 @@ source_suffix = {
|
|||
# html_theme = "alabaster"
|
||||
html_theme_options = {
|
||||
"canonical_url": "https://github.com/meta-llama/llama-stack",
|
||||
"collapse_navigation": False,
|
||||
# "style_nav_header_background": "#c3c9d4",
|
||||
'display_version': True,
|
||||
'version_selector': True,
|
||||
}
|
||||
|
||||
default_dark_mode = False
|
||||
|
||||
html_static_path = ["../_static"]
|
||||
# html_logo = "../_static/llama-stack-logo.png"
|
||||
# html_style = "../_static/css/my_theme.css"
|
||||
|
||||
|
||||
def setup(app):
|
||||
app.add_css_file("css/my_theme.css")
|
||||
app.add_js_file("js/detect_theme.js")
|
||||
|
||||
def dockerhub_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
|
||||
url = f"https://hub.docker.com/r/llamastack/{text}"
|
||||
node = nodes.reference(rawtext, text, refuri=url, **options)
|
||||
|
|
|
@ -1,14 +1,14 @@
|
|||
# Contributing to Llama Stack
|
||||
|
||||
```{include} ../../../CONTRIBUTING.md
|
||||
```
|
||||
|
||||
See the [Adding a New API Provider](new_api_provider.md) which describes how to add new API providers to the Stack.
|
||||
|
||||
Start with the [Contributing Guide](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md) for some general tips. This section covers a few key topics in more detail.
|
||||
|
||||
- [Adding a New API Provider](new_api_provider.md) describes adding new API providers to the Stack.
|
||||
- [Testing Llama Stack](testing.md) provides details about the testing framework and how to test providers and distributions.
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
:hidden:
|
||||
|
||||
new_api_provider
|
||||
testing
|
||||
```
|
||||
|
|
|
@ -6,7 +6,7 @@ This guide will walk you through the process of adding a new API provider to Lla
|
|||
- Begin by reviewing the [core concepts](../concepts/index.md) of Llama Stack and choose the API your provider belongs to (Inference, Safety, VectorIO, etc.)
|
||||
- Determine the provider type ({repopath}`Remote::llama_stack/providers/remote` or {repopath}`Inline::llama_stack/providers/inline`). Remote providers make requests to external services, while inline providers execute implementation locally.
|
||||
- Add your provider to the appropriate {repopath}`Registry::llama_stack/providers/registry/`. Specify pip dependencies necessary.
|
||||
- Update any distribution {repopath}`Templates::llama_stack/templates/` `build.yaml` and `run.yaml` files if they should include your provider by default. Run {repopath}`./scripts/distro_codegen.py` if necessary. Note that `distro_codegen.py` will fail if the new provider causes any distribution template to attempt to import provider-specific dependencies. This usually means the distribution's `get_distribution_template()` code path should only import any necessary Config or model alias definitions from each provider and not the provider's actual implementation.
|
||||
- Update any distribution {repopath}`Templates::llama_stack/templates/` build.yaml and run.yaml files if they should include your provider by default. Run {repopath}`./scripts/distro_codegen.py` if necessary. Note that `distro_codegen.py` will fail if the new provider causes any distribution template to attempt to import provider-specific dependencies. This usually means the distribution's `get_distribution_template()` code path should only import any necessary Config or model alias definitions from each provider and not the provider's actual implementation.
|
||||
|
||||
|
||||
Here are some example PRs to help you get started:
|
||||
|
@ -33,7 +33,6 @@ Note that each provider's `sample_run_config()` method (in the configuration cla
|
|||
|
||||
Unit tests are located in {repopath}`tests/unit`. Provider-specific unit tests are located in {repopath}`tests/unit/providers`. These tests are all run automatically as part of the CI process.
|
||||
|
||||
Consult {repopath}`tests/unit/README.md` for more details on how to run the tests manually.
|
||||
|
||||
### 3. Additional end-to-end testing
|
||||
|
||||
|
|
|
@ -109,6 +109,8 @@ llama stack build --list-templates
|
|||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| nvidia | Use NVIDIA NIM for running LLM inference |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| meta-reference-quantized-gpu | Use Meta Reference with fp8, int4 quantization for running LLM inference |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| cerebras | Use Cerebras for running LLM inference |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| ollama | Use (an external) Ollama server for running LLM inference |
|
||||
|
@ -174,11 +176,7 @@ distribution_spec:
|
|||
safety: inline::llama-guard
|
||||
agents: inline::meta-reference
|
||||
telemetry: inline::meta-reference
|
||||
image_name: ollama
|
||||
image_type: conda
|
||||
|
||||
# If some providers are external, you can specify the path to the implementation
|
||||
external_providers_dir: ~/.llama/providers.d
|
||||
```
|
||||
|
||||
```
|
||||
|
@ -186,57 +184,6 @@ llama stack build --config llama_stack/templates/ollama/build.yaml
|
|||
```
|
||||
:::
|
||||
|
||||
:::{tab-item} Building with External Providers
|
||||
|
||||
Llama Stack supports external providers that live outside of the main codebase. This allows you to create and maintain your own providers independently or use community-provided providers.
|
||||
|
||||
To build a distribution with external providers, you need to:
|
||||
|
||||
1. Configure the `external_providers_dir` in your build configuration file:
|
||||
|
||||
```yaml
|
||||
# Example my-external-stack.yaml with external providers
|
||||
version: '2'
|
||||
distribution_spec:
|
||||
description: Custom distro for CI tests
|
||||
providers:
|
||||
inference:
|
||||
- remote::custom_ollama
|
||||
# Add more providers as needed
|
||||
image_type: container
|
||||
image_name: ci-test
|
||||
# Path to external provider implementations
|
||||
external_providers_dir: ~/.llama/providers.d
|
||||
```
|
||||
|
||||
Here's an example for a custom Ollama provider:
|
||||
|
||||
```yaml
|
||||
adapter:
|
||||
adapter_type: custom_ollama
|
||||
pip_packages:
|
||||
- ollama
|
||||
- aiohttp
|
||||
- llama-stack-provider-ollama # This is the provider package
|
||||
config_class: llama_stack_ollama_provider.config.OllamaImplConfig
|
||||
module: llama_stack_ollama_provider
|
||||
api_dependencies: []
|
||||
optional_api_dependencies: []
|
||||
```
|
||||
|
||||
The `pip_packages` section lists the Python packages required by the provider, as well as the
|
||||
provider package itself. The package must be available on PyPI or can be provided from a local
|
||||
directory or a git repository (git must be installed on the build environment).
|
||||
|
||||
2. Build your distribution using the config file:
|
||||
|
||||
```
|
||||
llama stack build --config my-external-stack.yaml
|
||||
```
|
||||
|
||||
For more information on external providers, including directory structure, provider types, and implementation requirements, see the [External Providers documentation](../providers/external.md).
|
||||
:::
|
||||
|
||||
:::{tab-item} Building Container
|
||||
|
||||
```{admonition} Podman Alternative
|
||||
|
@ -271,7 +218,7 @@ Now, let's start the Llama Stack Distribution Server. You will need the YAML con
|
|||
|
||||
```
|
||||
llama stack run -h
|
||||
usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--env KEY=VALUE] [--tls-keyfile TLS_KEYFILE] [--tls-certfile TLS_CERTFILE]
|
||||
usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--disable-ipv6] [--env KEY=VALUE] [--tls-keyfile TLS_KEYFILE] [--tls-certfile TLS_CERTFILE]
|
||||
[--image-type {conda,container,venv}]
|
||||
config
|
||||
|
||||
|
@ -284,7 +231,8 @@ options:
|
|||
-h, --help show this help message and exit
|
||||
--port PORT Port to run the server on. It can also be passed via the env var LLAMA_STACK_PORT. (default: 8321)
|
||||
--image-name IMAGE_NAME
|
||||
Name of the image to run. Defaults to the current environment (default: None)
|
||||
Name of the image to run. Defaults to the current conda environment (default: None)
|
||||
--disable-ipv6 Disable IPv6 support (default: False)
|
||||
--env KEY=VALUE Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times. (default: [])
|
||||
--tls-keyfile TLS_KEYFILE
|
||||
Path to TLS key file for HTTPS (default: None)
|
||||
|
@ -338,48 +286,6 @@ INFO: Application startup complete.
|
|||
INFO: Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit)
|
||||
INFO: 2401:db00:35c:2d2b:face:0:c9:0:54678 - "GET /models/list HTTP/1.1" 200 OK
|
||||
```
|
||||
### Listing Distributions
|
||||
Using the list command, you can view all existing Llama Stack distributions, including stacks built from templates, from scratch, or using custom configuration files.
|
||||
|
||||
```
|
||||
llama stack list -h
|
||||
usage: llama stack list [-h]
|
||||
|
||||
list the build stacks
|
||||
|
||||
options:
|
||||
-h, --help show this help message and exit
|
||||
```
|
||||
|
||||
Example Usage
|
||||
|
||||
```
|
||||
llama stack list
|
||||
```
|
||||
|
||||
### Removing a Distribution
|
||||
Use the remove command to delete a distribution you've previously built.
|
||||
|
||||
```
|
||||
llama stack rm -h
|
||||
usage: llama stack rm [-h] [--all] [name]
|
||||
|
||||
Remove the build stack
|
||||
|
||||
positional arguments:
|
||||
name Name of the stack to delete (default: None)
|
||||
|
||||
options:
|
||||
-h, --help show this help message and exit
|
||||
--all, -a Delete all stacks (use with caution) (default: False)
|
||||
```
|
||||
|
||||
Example
|
||||
```
|
||||
llama stack rm llamastack-test
|
||||
```
|
||||
|
||||
To keep your environment organized and avoid clutter, consider using `llama stack list` to review old or unused distributions and `llama stack rm <name>` to delete them when they’re no longer needed.
|
||||
|
||||
### Troubleshooting
|
||||
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
# Configuring a "Stack"
|
||||
# Configuring a Stack
|
||||
|
||||
The Llama Stack runtime configuration is specified as a YAML file. Here is a simplified version of an example configuration file for the Ollama distribution:
|
||||
|
||||
```{dropdown} 👋 Click here for a Sample Configuration File
|
||||
```{dropdown} Sample Configuration File
|
||||
|
||||
```yaml
|
||||
version: 2
|
||||
|
@ -53,13 +53,6 @@ models:
|
|||
provider_id: ollama
|
||||
provider_model_id: null
|
||||
shields: []
|
||||
server:
|
||||
port: 8321
|
||||
auth:
|
||||
provider_type: "kubernetes"
|
||||
config:
|
||||
api_server_url: "https://kubernetes.default.svc"
|
||||
ca_cert_path: "/path/to/ca.crt"
|
||||
```
|
||||
|
||||
Let's break this down into the different sections. The first section specifies the set of APIs that the stack server will serve:
|
||||
|
@ -109,227 +102,6 @@ A Model is an instance of a "Resource" (see [Concepts](../concepts/index)) and i
|
|||
|
||||
What's with the `provider_model_id` field? This is an identifier for the model inside the provider's model catalog. Contrast it with `model_id` which is the identifier for the same model for Llama Stack's purposes. For example, you may want to name "llama3.2:vision-11b" as "image_captioning_model" when you use it in your Stack interactions. When omitted, the server will set `provider_model_id` to be the same as `model_id`.
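As a sketch, the corresponding `run.yaml` entry for that renaming might look like this (the model and provider names are illustrative only):
```yaml
models:
  - model_id: image_captioning_model        # the name you use in your Stack interactions
    provider_id: ollama
    provider_model_id: llama3.2:vision-11b  # the name in the provider's model catalog
```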
|
||||
|
||||
## Server Configuration
|
||||
|
||||
The `server` section configures the HTTP server that serves the Llama Stack APIs:
|
||||
|
||||
```yaml
|
||||
server:
|
||||
port: 8321 # Port to listen on (default: 8321)
|
||||
tls_certfile: "/path/to/cert.pem" # Optional: Path to TLS certificate for HTTPS
|
||||
tls_keyfile: "/path/to/key.pem" # Optional: Path to TLS key for HTTPS
|
||||
```
|
||||
|
||||
### Authentication Configuration
|
||||
|
||||
The `auth` section configures authentication for the server. When configured, all API requests must include a valid Bearer token in the Authorization header:
|
||||
|
||||
```
|
||||
Authorization: Bearer <token>
|
||||
```
|
||||
|
||||
The server supports multiple authentication providers:
|
||||
|
||||
#### OAuth 2.0/OpenID Connect Provider with Kubernetes
|
||||
|
||||
The Kubernetes cluster must be configured to use a service account for authentication.
|
||||
|
||||
```bash
|
||||
kubectl create namespace llama-stack
|
||||
kubectl create serviceaccount llama-stack-auth -n llama-stack
|
||||
kubectl create rolebinding llama-stack-auth-rolebinding --clusterrole=admin --serviceaccount=llama-stack:llama-stack-auth -n llama-stack
|
||||
kubectl create token llama-stack-auth -n llama-stack > llama-stack-auth-token
|
||||
```
|
||||
|
||||
Make sure the `kube-apiserver` runs with `--anonymous-auth=true` to allow unauthenticated requests
|
||||
and that the correct RoleBinding is created to allow the service account to access the necessary
|
||||
resources. If that is not the case, you can create a RoleBinding for the service account to access
|
||||
the necessary resources:
|
||||
|
||||
```yaml
|
||||
# allow-anonymous-openid.yaml
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: allow-anonymous-openid
|
||||
rules:
|
||||
- nonResourceURLs: ["/openid/v1/jwks"]
|
||||
verbs: ["get"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: allow-anonymous-openid
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: allow-anonymous-openid
|
||||
subjects:
|
||||
- kind: User
|
||||
name: system:anonymous
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
```
|
||||
|
||||
And then apply the configuration:
|
||||
```bash
|
||||
kubectl apply -f allow-anonymous-openid.yaml
|
||||
```
|
||||
|
||||
Validates tokens against the Kubernetes API server through the OIDC provider:
|
||||
```yaml
|
||||
server:
|
||||
auth:
|
||||
provider_type: "oauth2_token"
|
||||
config:
|
||||
jwks:
|
||||
uri: "https://kubernetes.default.svc"
|
||||
key_recheck_period: 3600
|
||||
tls_cafile: "/path/to/ca.crt"
|
||||
issuer: "https://kubernetes.default.svc"
|
||||
audience: "https://kubernetes.default.svc"
|
||||
```
|
||||
|
||||
To find your cluster's audience, run:
|
||||
```bash
|
||||
kubectl create token default --duration=1h | cut -d. -f2 | base64 -d | jq .aud
|
||||
```
|
||||
|
||||
For the issuer, you can use the OIDC provider's URL:
|
||||
```bash
|
||||
kubectl get --raw /.well-known/openid-configuration| jq .issuer
|
||||
```
|
||||
|
||||
For the tls_cafile, you can use the CA certificate of the OIDC provider:
|
||||
```bash
|
||||
kubectl config view --minify -o jsonpath='{.clusters[0].cluster.certificate-authority}'
|
||||
```
|
||||
|
||||
The provider extracts user information from the JWT token:
|
||||
- Username from the `sub` claim becomes a role
|
||||
- Kubernetes groups become teams
|
||||
|
||||
You can easily validate a request by running:
|
||||
|
||||
```bash
|
||||
curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers
|
||||
```
|
||||
|
||||
#### Custom Provider
|
||||
Validates tokens against a custom authentication endpoint:
|
||||
```yaml
|
||||
server:
|
||||
auth:
|
||||
provider_type: "custom"
|
||||
config:
|
||||
endpoint: "https://auth.example.com/validate" # URL of the auth endpoint
|
||||
```
|
||||
|
||||
The custom endpoint receives a POST request with:
|
||||
```json
|
||||
{
|
||||
"api_key": "<token>",
|
||||
"request": {
|
||||
"path": "/api/v1/endpoint",
|
||||
"headers": {
|
||||
"content-type": "application/json",
|
||||
"user-agent": "curl/7.64.1"
|
||||
},
|
||||
"params": {
|
||||
"key": ["value"]
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
And must respond with:
|
||||
```json
|
||||
{
|
||||
"access_attributes": {
|
||||
"roles": ["admin", "user"],
|
||||
"teams": ["ml-team", "nlp-team"],
|
||||
"projects": ["llama-3", "project-x"],
|
||||
"namespaces": ["research"]
|
||||
},
|
||||
"message": "Authentication successful"
|
||||
}
|
||||
```
|
||||
|
||||
If no access attributes are returned, the token is used as a namespace.
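To make the contract concrete, here is a minimal sketch of such an endpoint. FastAPI, the `/validate` path, and the hard-coded token check are illustrative assumptions; any HTTP service that accepts this request body and returns this response shape will work.
```python
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

app = FastAPI()


class AuthRequest(BaseModel):
    api_key: str   # the bearer token forwarded by Llama Stack
    request: dict  # path, headers, and params of the original request


@app.post("/validate")
def validate(auth: AuthRequest):
    # Placeholder check: replace with a real lookup against your identity system.
    if auth.api_key != "expected-token":
        raise HTTPException(status_code=401, detail="Invalid token")
    return {
        "access_attributes": {
            "roles": ["user"],
            "teams": ["ml-team"],
        },
        "message": "Authentication successful",
    }
```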
|
||||
|
||||
### Quota Configuration
|
||||
|
||||
The `quota` section allows you to enable server-side request throttling for both
|
||||
authenticated and anonymous clients. This is useful for preventing abuse, enforcing
|
||||
fairness across tenants, and controlling infrastructure costs without requiring
|
||||
client-side rate limiting or external proxies.
|
||||
|
||||
Quotas are disabled by default. When enabled, each client is tracked using either:
|
||||
|
||||
* Their authenticated `client_id` (derived from the Bearer token), or
|
||||
* Their IP address (fallback for anonymous requests)
|
||||
|
||||
Quota state is stored in a SQLite-backed key-value store, and rate limits are applied
|
||||
within a configurable time window (currently only `day` is supported).
|
||||
|
||||
#### Example
|
||||
|
||||
```yaml
|
||||
server:
|
||||
quota:
|
||||
kvstore:
|
||||
type: sqlite
|
||||
db_path: ./quotas.db
|
||||
anonymous_max_requests: 100
|
||||
authenticated_max_requests: 1000
|
||||
period: day
|
||||
```
|
||||
|
||||
#### Configuration Options
|
||||
|
||||
| Field | Description |
|
||||
| ---------------------------- | -------------------------------------------------------------------------- |
|
||||
| `kvstore` | Required. Backend storage config for tracking request counts. |
|
||||
| `kvstore.type` | Must be `"sqlite"` for now. Other backends may be supported in the future. |
|
||||
| `kvstore.db_path` | File path to the SQLite database. |
|
||||
| `anonymous_max_requests` | Max requests per period for unauthenticated clients. |
|
||||
| `authenticated_max_requests` | Max requests per period for authenticated clients. |
|
||||
| `period` | Time window for quota enforcement. Only `"day"` is supported. |
|
||||
|
||||
> Note: if `authenticated_max_requests` is set but no authentication provider is
|
||||
configured, the server will fall back to applying `anonymous_max_requests` to all
|
||||
clients.
|
||||
|
||||
#### Example with Authentication Enabled
|
||||
|
||||
```yaml
|
||||
server:
|
||||
port: 8321
|
||||
auth:
|
||||
provider_type: custom
|
||||
config:
|
||||
endpoint: https://auth.example.com/validate
|
||||
quota:
|
||||
kvstore:
|
||||
type: sqlite
|
||||
db_path: ./quotas.db
|
||||
anonymous_max_requests: 100
|
||||
authenticated_max_requests: 1000
|
||||
period: day
|
||||
```
|
||||
|
||||
If a client exceeds their limit, the server responds with:
|
||||
|
||||
```http
|
||||
HTTP/1.1 429 Too Many Requests
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"error": {
|
||||
"message": "Quota exceeded"
|
||||
}
|
||||
}
|
||||
```
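On the client side, you may want to surface this condition explicitly rather than treating it as a generic failure. A minimal sketch, assuming the `requests` package and the token file used earlier (this is not part of the Llama Stack client itself):

```python
import requests


def get_providers(base_url: str, token: str) -> dict:
    """Call /v1/providers, raising a clear error when the daily quota is exhausted."""
    resp = requests.get(
        f"{base_url}/v1/providers",
        headers={"Authorization": f"Bearer {token}"},
        timeout=30,
    )
    if resp.status_code == 429:
        # The quota window is per day, so a short retry will not help;
        # report the error and retry after the window resets.
        raise RuntimeError(resp.json()["error"]["message"])
    resp.raise_for_status()
    return resp.json()
```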
|
||||
|
||||
## Extending to handle Safety
|
||||
|
||||
Configuring Safety can be a little involved, so it is instructive to go through an example.
|
||||
|
|
|
@ -1,12 +1,10 @@
|
|||
# Using Llama Stack as a Library
|
||||
|
||||
## Setup Llama Stack without a Server
|
||||
If you are planning to use an external service for Inference (even Ollama or TGI counts as external), it is often easier to use Llama Stack as a library.
|
||||
This avoids the overhead of setting up a server.
|
||||
If you are planning to use an external service for Inference (even Ollama or TGI counts as external), it is often easier to use Llama Stack as a library. This avoids the overhead of setting up a server.
|
||||
```bash
|
||||
# setup
|
||||
uv pip install llama-stack
|
||||
llama stack build --template ollama --image-type venv
|
||||
llama stack build --template together --image-type venv
|
||||
```
|
||||
|
||||
```python
|
||||
|
@ -17,7 +15,7 @@ client = LlamaStackAsLibraryClient(
|
|||
# provider_data is optional, but if you need to pass in any provider specific data, you can do so here.
|
||||
provider_data={"tavily_search_api_key": os.environ["TAVILY_SEARCH_API_KEY"]},
|
||||
)
|
||||
client.initialize()
|
||||
await client.initialize()
|
||||
```
|
||||
|
||||
This will parse your config and set up any inline implementations and remote clients needed for your implementation.
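As a quick sanity check, you can issue a chat completion directly through the library client created above. This is a sketch: the model id below is an assumption (use one your provider serves), and the exact method signature may vary across client versions:

```python
response = client.inference.chat_completion(
    model_id="meta-llama/Llama-3.2-3B-Instruct",  # assumed example model
    messages=[{"role": "user", "content": "Say hello in one short sentence."}],
)
print(response.completion_message.content)
```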
|
||||
|
|
|
@ -1,18 +1,32 @@
|
|||
# Distributions Overview
|
||||
# Starting a Llama Stack Server
|
||||
|
||||
A distribution is a pre-packaged set of Llama Stack components that can be deployed together.
|
||||
You can run a Llama Stack server in one of the following ways:
|
||||
|
||||
**As a Library**:
|
||||
|
||||
This is the simplest way to get started. Using Llama Stack as a library means you do not need to start a server. This is especially useful when you are not running inference locally and are relying on an external inference service (e.g. Fireworks, Together, Groq). See [Using Llama Stack as a Library](importing_as_library).
|
||||
|
||||
|
||||
**Container**:
|
||||
|
||||
Another simple way to start interacting with Llama Stack is to just spin up a container (via Docker or Podman) which is pre-built with all the providers you need. We provide a number of pre-built images so you can start a Llama Stack server instantly. You can also build your own custom container. Which distribution to choose depends on the hardware you have. See [Selection of a Distribution](selection) for more details.
|
||||
|
||||
|
||||
**Conda**:
|
||||
|
||||
If you have a custom or an advanced setup or you are developing on Llama Stack you can also build a custom Llama Stack server. Using `llama stack build` and `llama stack run` you can build/run a custom Llama Stack server containing the exact combination of providers you wish. We have also provided various templates to make getting started easier. See [Building a Custom Distribution](building_distro) for more details.
|
||||
|
||||
|
||||
**Kubernetes**:
|
||||
|
||||
If you have built a container image and want to deploy it in a Kubernetes cluster instead of starting the Llama Stack server locally, see the [Kubernetes Deployment Guide](kubernetes_deployment) for more details.
|
||||
|
||||
This section provides an overview of the distributions available in Llama Stack.
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 3
|
||||
:maxdepth: 1
|
||||
:hidden:
|
||||
|
||||
importing_as_library
|
||||
configuration
|
||||
list_of_distributions
|
||||
kubernetes_deployment
|
||||
building_distro
|
||||
on_device_distro
|
||||
remote_hosted_distro
|
||||
self_hosted_distro
|
||||
```
|
||||
|
|
|
@ -1,24 +1,16 @@
|
|||
# Kubernetes Deployment Guide
|
||||
|
||||
Instead of starting the Llama Stack and vLLM servers locally, we can deploy them in a Kubernetes cluster.
|
||||
|
||||
### Prerequisites
|
||||
In this guide, we'll use a local [Kind](https://kind.sigs.k8s.io/) cluster and a vLLM inference service in the same cluster for demonstration purposes.
|
||||
Instead of starting the Llama Stack and vLLM servers locally, we can deploy them in a Kubernetes cluster. In this guide, we'll use a local [Kind](https://kind.sigs.k8s.io/) cluster and a vLLM inference service in the same cluster for demonstration purposes.
|
||||
|
||||
First, create a local Kubernetes cluster via Kind:
|
||||
|
||||
```
|
||||
```bash
|
||||
kind create cluster --image kindest/node:v1.32.0 --name llama-stack-test
|
||||
```
|
||||
|
||||
Next, set your Hugging Face token as an environment variable:
|
||||
```bash
|
||||
export HF_TOKEN=$(echo -n "your-hf-token" | base64)
|
||||
```
|
||||
First, create a Kubernetes PVC and Secret for downloading and storing Hugging Face model:
|
||||
|
||||
Now create a Kubernetes PVC and Secret for downloading and storing the Hugging Face model:
|
||||
|
||||
```
|
||||
```bash
|
||||
cat <<EOF |kubectl apply -f -
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
|
@ -38,14 +30,12 @@ metadata:
|
|||
name: hf-token-secret
|
||||
type: Opaque
|
||||
data:
|
||||
token: $HF_TOKEN
|
||||
EOF
|
||||
token: ${HF_TOKEN}
|
||||
```
|
||||
|
||||
|
||||
Next, start the vLLM server as a Kubernetes Deployment and Service:
|
||||
|
||||
```
|
||||
```bash
|
||||
cat <<EOF |kubectl apply -f -
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
|
@ -101,7 +91,7 @@ EOF
|
|||
|
||||
We can verify that the vLLM server has started successfully via the logs (this might take a couple of minutes to download the model):
|
||||
|
||||
```
|
||||
```bash
|
||||
$ kubectl logs -l app.kubernetes.io/name=vllm
|
||||
...
|
||||
INFO: Started server process [1]
|
||||
|
@ -125,8 +115,8 @@ providers:
|
|||
|
||||
Once we have defined the run configuration for Llama Stack, we can build an image with that configuration and the server source code:
|
||||
|
||||
```
|
||||
tmp_dir=$(mktemp -d) && cat >$tmp_dir/Containerfile.llama-stack-run-k8s <<EOF
|
||||
```bash
|
||||
cat >/tmp/test-vllm-llama-stack/Containerfile.llama-stack-run-k8s <<EOF
|
||||
FROM distribution-myenv:dev
|
||||
|
||||
RUN apt-get update && apt-get install -y git
|
||||
|
@ -134,14 +124,13 @@ RUN git clone https://github.com/meta-llama/llama-stack.git /app/llama-stack-sou
|
|||
|
||||
ADD ./vllm-llama-stack-run-k8s.yaml /app/config.yaml
|
||||
EOF
|
||||
podman build -f $tmp_dir/Containerfile.llama-stack-run-k8s -t llama-stack-run-k8s $tmp_dir
|
||||
podman build -f /tmp/test-vllm-llama-stack/Containerfile.llama-stack-run-k8s -t llama-stack-run-k8s /tmp/test-vllm-llama-stack
|
||||
```
|
||||
|
||||
### Deploying Llama Stack Server in Kubernetes
|
||||
|
||||
We can then start the Llama Stack server by deploying a Kubernetes Pod and Service:
|
||||
|
||||
```
|
||||
```bash
|
||||
cat <<EOF |kubectl apply -f -
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
|
@ -172,7 +161,7 @@ spec:
|
|||
- name: llama-stack
|
||||
image: localhost/llama-stack-run-k8s:latest
|
||||
imagePullPolicy: IfNotPresent
|
||||
command: ["python", "-m", "llama_stack.distribution.server.server", "--config", "/app/config.yaml"]
|
||||
command: ["python", "-m", "llama_stack.distribution.server.server", "--yaml-config", "/app/config.yaml"]
|
||||
ports:
|
||||
- containerPort: 5000
|
||||
volumeMounts:
|
||||
|
@ -198,10 +187,9 @@ spec:
|
|||
EOF
|
||||
```
|
||||
|
||||
### Verifying the Deployment
|
||||
We can check that the Llama Stack server has started:
|
||||
|
||||
```
|
||||
```bash
|
||||
$ kubectl logs -l app.kubernetes.io/name=llama-stack
|
||||
...
|
||||
INFO: Started server process [1]
|
||||
|
@ -213,7 +201,7 @@ INFO: Uvicorn running on http://['::', '0.0.0.0']:5000 (Press CTRL+C to quit
|
|||
|
||||
Finally, we forward the Kubernetes service to a local port and test some inference requests against it via the Llama Stack Client:
|
||||
|
||||
```
|
||||
```bash
|
||||
kubectl port-forward service/llama-stack-service 5000:5000
|
||||
llama-stack-client --endpoint http://localhost:5000 inference chat-completion --message "hello, what model are you?"
|
||||
```
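You can also exercise the forwarded endpoint from Python — a sketch assuming the port-forward above is still running and `llama-stack-client` is installed locally:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5000")
# List the models registered on the server; the vLLM-served model should appear here.
for model in client.models.list():
    print(model.identifier)
```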
|
||||
|
|
|
@ -24,7 +24,7 @@ The key files in the app are `ExampleLlamaStackLocalInference.kt`, `ExampleLlama
|
|||
Add the following dependency in your `build.gradle.kts` file:
|
||||
```
|
||||
dependencies {
|
||||
implementation("com.llama.llamastack:llama-stack-client-kotlin:0.2.2")
|
||||
implementation("com.llama.llamastack:llama-stack-client-kotlin:0.1.4.2")
|
||||
}
|
||||
```
|
||||
This will download jar files in your gradle cache in a directory like `~/.gradle/caches/modules-2/files-2.1/com.llama.llamastack/`
|
||||
|
@ -37,7 +37,11 @@ For local inferencing, it is required to include the ExecuTorch library into you
|
|||
|
||||
Include the ExecuTorch library by:
|
||||
1. Download the `download-prebuilt-et-lib.sh` script file from the [llama-stack-client-kotlin-client-local](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release/llama-stack-client-kotlin-client-local/download-prebuilt-et-lib.sh) directory to your local machine.
|
||||
2. Move the script to the top level of your Android app where the `app` directory resides.
|
||||
2. Move the script to the top level of your Android app where the app directory resides:
|
||||
<p align="center">
|
||||
<img src="https://github.com/meta-llama/llama-stack-client-kotlin/blob/latest-release/doc/img/example_android_app_directory.png" style="width:300px">
|
||||
</p>
|
||||
|
||||
3. Run `sh download-prebuilt-et-lib.sh` to create an `app/libs` directory and download the `executorch.aar` in that path. This generates an ExecuTorch library for the XNNPACK delegate.
|
||||
4. Add the `executorch.aar` dependency in your `build.gradle.kts` file:
|
||||
```
|
||||
|
@ -48,8 +52,6 @@ dependencies {
|
|||
}
|
||||
```
|
||||
|
||||
See other dependencies for the local RAG in Android app [README](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release/examples/android_app#quick-start).
|
||||
|
||||
## Llama Stack APIs in Your Android App
|
||||
Breaking down the demo app, this section will show the core pieces that are used to initialize and run inference with Llama Stack using the Kotlin library.
|
||||
|
||||
|
@ -58,7 +60,7 @@ Start a Llama Stack server on localhost. Here is an example of how you can do th
|
|||
```
|
||||
conda create -n stack-fireworks python=3.10
|
||||
conda activate stack-fireworks
|
||||
pip install --no-cache llama-stack==0.2.2
|
||||
pip install --no-cache llama-stack==0.1.4
|
||||
llama stack build --template fireworks --image-type conda
|
||||
export FIREWORKS_API_KEY=<SOME_KEY>
|
||||
llama stack run fireworks --port 5050
|
||||
|
|
88
docs/source/distributions/remote_hosted_distro/nvidia.md
Normal file
|
@ -0,0 +1,88 @@
|
|||
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
|
||||
# NVIDIA Distribution
|
||||
|
||||
The `llamastack/distribution-nvidia` distribution consists of the following provider configurations.
|
||||
|
||||
| API | Provider(s) |
|
||||
|-----|-------------|
|
||||
| agents | `inline::meta-reference` |
|
||||
| datasetio | `inline::localfs` |
|
||||
| eval | `inline::meta-reference` |
|
||||
| inference | `remote::nvidia` |
|
||||
| post_training | `remote::nvidia` |
|
||||
| safety | `remote::nvidia` |
|
||||
| scoring | `inline::basic` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `inline::rag-runtime` |
|
||||
| vector_io | `inline::faiss` |
|
||||
|
||||
|
||||
### Environment Variables
|
||||
|
||||
The following environment variables can be configured:
|
||||
|
||||
- `NVIDIA_API_KEY`: NVIDIA API Key (default: ``)
|
||||
- `NVIDIA_USER_ID`: NVIDIA User ID (default: `llama-stack-user`)
|
||||
- `NVIDIA_DATASET_NAMESPACE`: NVIDIA Dataset Namespace (default: `default`)
|
||||
- `NVIDIA_ACCESS_POLICIES`: NVIDIA Access Policies (default: `{}`)
|
||||
- `NVIDIA_PROJECT_ID`: NVIDIA Project ID (default: `test-project`)
|
||||
- `NVIDIA_CUSTOMIZER_URL`: NVIDIA Customizer URL (default: `https://customizer.api.nvidia.com`)
|
||||
- `NVIDIA_OUTPUT_MODEL_DIR`: NVIDIA Output Model Directory (default: `test-example-model@v1`)
|
||||
- `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`)
|
||||
- `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`)
|
||||
- `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`)
|
||||
|
||||
### Models
|
||||
|
||||
The following models are available by default:
|
||||
|
||||
- `meta/llama3-8b-instruct (aliases: meta-llama/Llama-3-8B-Instruct)`
|
||||
- `meta/llama3-70b-instruct (aliases: meta-llama/Llama-3-70B-Instruct)`
|
||||
- `meta/llama-3.1-8b-instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
|
||||
- `meta/llama-3.1-70b-instruct (aliases: meta-llama/Llama-3.1-70B-Instruct)`
|
||||
- `meta/llama-3.1-405b-instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
|
||||
- `meta/llama-3.2-1b-instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)`
|
||||
- `meta/llama-3.2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
|
||||
- `meta/llama-3.2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
|
||||
- `meta/llama-3.2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
|
||||
- `nvidia/llama-3.2-nv-embedqa-1b-v2 `
|
||||
- `nvidia/nv-embedqa-e5-v5 `
|
||||
- `nvidia/nv-embedqa-mistral-7b-v2 `
|
||||
- `snowflake/arctic-embed-l `
|
||||
|
||||
|
||||
### Prerequisite: API Keys
|
||||
|
||||
Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/).
|
||||
|
||||
|
||||
## Running Llama Stack with NVIDIA
|
||||
|
||||
You can do this via Conda (build code) or Docker which has a pre-built image.
|
||||
|
||||
### Via Docker
|
||||
|
||||
This method allows you to get started quickly without having to build the distribution code.
|
||||
|
||||
```bash
|
||||
LLAMA_STACK_PORT=8321
|
||||
docker run \
|
||||
-it \
|
||||
--pull always \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ./run.yaml:/root/my-run.yaml \
|
||||
llamastack/distribution-nvidia \
|
||||
--yaml-config /root/my-run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env NVIDIA_API_KEY=$NVIDIA_API_KEY
|
||||
```
|
||||
|
||||
### Via Conda
|
||||
|
||||
```bash
|
||||
llama stack build --template nvidia --image-type conda
|
||||
llama stack run ./run.yaml \
|
||||
--port 8321 \
|
||||
--env NVIDIA_API_KEY=$NVIDIA_API_KEY \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL
|
||||
```
|
|
@ -1,88 +0,0 @@
|
|||
---
|
||||
orphan: true
|
||||
---
|
||||
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
|
||||
# watsonx Distribution
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 2
|
||||
:hidden:
|
||||
|
||||
self
|
||||
```
|
||||
|
||||
The `llamastack/distribution-watsonx` distribution consists of the following provider configurations.
|
||||
|
||||
| API | Provider(s) |
|
||||
|-----|-------------|
|
||||
| agents | `inline::meta-reference` |
|
||||
| datasetio | `remote::huggingface`, `inline::localfs` |
|
||||
| eval | `inline::meta-reference` |
|
||||
| inference | `remote::watsonx`, `inline::sentence-transformers` |
|
||||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| vector_io | `inline::faiss` |
|
||||
|
||||
|
||||
|
||||
### Environment Variables
|
||||
|
||||
The following environment variables can be configured:
|
||||
|
||||
- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
|
||||
- `WATSONX_API_KEY`: watsonx API Key (default: ``)
|
||||
- `WATSONX_PROJECT_ID`: watsonx Project ID (default: ``)
|
||||
|
||||
### Models
|
||||
|
||||
The following models are available by default:
|
||||
|
||||
- `meta-llama/llama-3-3-70b-instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)`
|
||||
- `meta-llama/llama-2-13b-chat (aliases: meta-llama/Llama-2-13b)`
|
||||
- `meta-llama/llama-3-1-70b-instruct (aliases: meta-llama/Llama-3.1-70B-Instruct)`
|
||||
- `meta-llama/llama-3-1-8b-instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
|
||||
- `meta-llama/llama-3-2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
|
||||
- `meta-llama/llama-3-2-1b-instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)`
|
||||
- `meta-llama/llama-3-2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
|
||||
- `meta-llama/llama-3-2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
|
||||
- `meta-llama/llama-guard-3-11b-vision (aliases: meta-llama/Llama-Guard-3-11B-Vision)`
|
||||
|
||||
|
||||
### Prerequisite: API Keys
|
||||
|
||||
Make sure you have access to a watsonx API Key. You can get one by referring to [watsonx.ai](https://www.ibm.com/docs/en/masv-and-l/maximo-manage/continuous-delivery?topic=setup-create-watsonx-api-key).
|
||||
|
||||
|
||||
## Running Llama Stack with watsonx
|
||||
|
||||
You can do this via Conda (build code), venv or Docker which has a pre-built image.
|
||||
|
||||
### Via Docker
|
||||
|
||||
This method allows you to get started quickly without having to build the distribution code.
|
||||
|
||||
```bash
|
||||
LLAMA_STACK_PORT=5001
|
||||
docker run \
|
||||
-it \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ./run.yaml:/root/my-run.yaml \
|
||||
llamastack/distribution-watsonx \
|
||||
--config /root/my-run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env WATSONX_API_KEY=$WATSONX_API_KEY \
|
||||
--env WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID \
|
||||
--env WATSONX_BASE_URL=$WATSONX_BASE_URL
|
||||
```
|
||||
|
||||
### Via Conda
|
||||
|
||||
```bash
|
||||
llama stack build --template watsonx --image-type conda
|
||||
llama stack run ./run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env WATSONX_API_KEY=$WATSONX_API_KEY \
|
||||
--env WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID
|
||||
```
|
|
@ -1,4 +1,4 @@
|
|||
# Available List of Distributions
|
||||
# List of Distributions
|
||||
|
||||
Here is a list of the distributions, provided out of the box, that you can use to start a Llama Stack server.
|
||||
|
|
@ -19,7 +19,7 @@ The `llamastack/distribution-bedrock` distribution consists of the following pro
|
|||
| safety | `remote::bedrock` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
|
||||
|
||||
|
|
|
@ -12,7 +12,7 @@ The `llamastack/distribution-cerebras` distribution consists of the following pr
|
|||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime` |
|
||||
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
|
||||
|
||||
|
@ -52,7 +52,7 @@ docker run \
|
|||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ./run.yaml:/root/my-run.yaml \
|
||||
llamastack/distribution-cerebras \
|
||||
--config /root/my-run.yaml \
|
||||
--yaml-config /root/my-run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env CEREBRAS_API_KEY=$CEREBRAS_API_KEY
|
||||
```
|
||||
|
|
|
@ -23,7 +23,7 @@ The `llamastack/distribution-dell` distribution consists of the following provid
|
|||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime` |
|
||||
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
|
||||
|
||||
|
@ -155,7 +155,7 @@ docker run \
|
|||
-v $HOME/.llama:/root/.llama \
|
||||
-v ./llama_stack/templates/tgi/run-with-safety.yaml:/root/my-run.yaml \
|
||||
llamastack/distribution-dell \
|
||||
--config /root/my-run.yaml \
|
||||
--yaml-config /root/my-run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||
--env DEH_URL=$DEH_URL \
|
||||
|
|
|
@ -22,7 +22,7 @@ The `llamastack/distribution-fireworks` distribution consists of the following p
|
|||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `remote::wolfram-alpha`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `remote::wolfram-alpha`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
|
||||
|
||||
|
@ -46,8 +46,6 @@ The following models are available by default:
|
|||
- `accounts/fireworks/models/llama-v3p3-70b-instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)`
|
||||
- `accounts/fireworks/models/llama-guard-3-8b (aliases: meta-llama/Llama-Guard-3-8B)`
|
||||
- `accounts/fireworks/models/llama-guard-3-11b-vision (aliases: meta-llama/Llama-Guard-3-11B-Vision)`
|
||||
- `accounts/fireworks/models/llama4-scout-instruct-basic (aliases: meta-llama/Llama-4-Scout-17B-16E-Instruct)`
|
||||
- `accounts/fireworks/models/llama4-maverick-instruct-basic (aliases: meta-llama/Llama-4-Maverick-17B-128E-Instruct)`
|
||||
- `nomic-ai/nomic-embed-text-v1.5 `
|
||||
|
||||
|
||||
|
|
|
@ -22,7 +22,7 @@ The `llamastack/distribution-groq` distribution consists of the following provid
|
|||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime` |
|
||||
| vector_io | `inline::faiss` |
|
||||
|
||||
|
||||
|
@ -42,10 +42,6 @@ The following models are available by default:
|
|||
- `groq/llama3-70b-8192 (aliases: meta-llama/Llama-3-70B-Instruct)`
|
||||
- `groq/llama-3.3-70b-versatile (aliases: meta-llama/Llama-3.3-70B-Instruct)`
|
||||
- `groq/llama-3.2-3b-preview (aliases: meta-llama/Llama-3.2-3B-Instruct)`
|
||||
- `groq/llama-4-scout-17b-16e-instruct (aliases: meta-llama/Llama-4-Scout-17B-16E-Instruct)`
|
||||
- `groq/meta-llama/llama-4-scout-17b-16e-instruct (aliases: meta-llama/Llama-4-Scout-17B-16E-Instruct)`
|
||||
- `groq/llama-4-maverick-17b-128e-instruct (aliases: meta-llama/Llama-4-Maverick-17B-128E-Instruct)`
|
||||
- `groq/meta-llama/llama-4-maverick-17b-128e-instruct (aliases: meta-llama/Llama-4-Maverick-17B-128E-Instruct)`
|
||||
|
||||
|
||||
### Prerequisite: API Keys
|
||||
|
|
|
@ -22,7 +22,7 @@ The `llamastack/distribution-meta-reference-gpu` distribution consists of the fo
|
|||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
|
||||
|
||||
|
@ -81,7 +81,6 @@ LLAMA_STACK_PORT=8321
|
|||
docker run \
|
||||
-it \
|
||||
--pull always \
|
||||
--gpu all \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ~/.llama:/root/.llama \
|
||||
llamastack/distribution-meta-reference-gpu \
|
||||
|
@ -95,7 +94,6 @@ If you are using Llama Stack Safety / Shield APIs, use:
|
|||
docker run \
|
||||
-it \
|
||||
--pull always \
|
||||
--gpu all \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ~/.llama:/root/.llama \
|
||||
llamastack/distribution-meta-reference-gpu \
|
||||
|
|
|
@ -0,0 +1,123 @@
|
|||
---
|
||||
orphan: true
|
||||
---
|
||||
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
|
||||
# Meta Reference Quantized Distribution
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 2
|
||||
:hidden:
|
||||
|
||||
self
|
||||
```
|
||||
|
||||
The `llamastack/distribution-meta-reference-quantized-gpu` distribution consists of the following provider configurations:
|
||||
|
||||
| API | Provider(s) |
|
||||
|-----|-------------|
|
||||
| agents | `inline::meta-reference` |
|
||||
| datasetio | `remote::huggingface`, `inline::localfs` |
|
||||
| eval | `inline::meta-reference` |
|
||||
| inference | `inline::meta-reference-quantized` |
|
||||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
|
||||
|
||||
The only difference vs. the `meta-reference-gpu` distribution is that it has support for more efficient inference -- with fp8, int4 quantization, etc.
|
||||
|
||||
Note that you need access to NVIDIA GPUs to run this distribution. It is not compatible with CPU-only machines or machines with AMD GPUs.
|
||||
|
||||
### Environment Variables
|
||||
|
||||
The following environment variables can be configured:
|
||||
|
||||
- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
|
||||
- `INFERENCE_MODEL`: Inference model loaded into the Meta Reference server (default: `meta-llama/Llama-3.2-3B-Instruct`)
|
||||
- `INFERENCE_CHECKPOINT_DIR`: Directory containing the Meta Reference model checkpoint (default: `null`)
|
||||
|
||||
|
||||
## Prerequisite: Downloading Models
|
||||
|
||||
Please use `llama model list --downloaded` to check that you have Llama model checkpoints downloaded in `~/.llama` before proceeding. See the [installation guide](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html) for how to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
|
||||
|
||||
```
|
||||
$ llama model list --downloaded
|
||||
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
|
||||
┃ Model ┃ Size ┃ Modified Time ┃
|
||||
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
|
||||
│ Llama3.2-1B-Instruct:int4-qlora-eo8 │ 1.53 GB │ 2025-02-26 11:22:28 │
|
||||
├─────────────────────────────────────────┼──────────┼─────────────────────┤
|
||||
│ Llama3.2-1B │ 2.31 GB │ 2025-02-18 21:48:52 │
|
||||
├─────────────────────────────────────────┼──────────┼─────────────────────┤
|
||||
│ Prompt-Guard-86M │ 0.02 GB │ 2025-02-26 11:29:28 │
|
||||
├─────────────────────────────────────────┼──────────┼─────────────────────┤
|
||||
│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB │ 2025-02-26 11:37:41 │
|
||||
├─────────────────────────────────────────┼──────────┼─────────────────────┤
|
||||
│ Llama3.2-3B │ 5.99 GB │ 2025-02-18 21:51:26 │
|
||||
├─────────────────────────────────────────┼──────────┼─────────────────────┤
|
||||
│ Llama3.1-8B │ 14.97 GB │ 2025-02-16 10:36:37 │
|
||||
├─────────────────────────────────────────┼──────────┼─────────────────────┤
|
||||
│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB │ 2025-02-26 11:35:02 │
|
||||
├─────────────────────────────────────────┼──────────┼─────────────────────┤
|
||||
│ Llama-Guard-3-1B │ 2.80 GB │ 2025-02-26 11:20:46 │
|
||||
├─────────────────────────────────────────┼──────────┼─────────────────────┤
|
||||
│ Llama-Guard-3-1B:int4 │ 0.43 GB │ 2025-02-26 11:33:33 │
|
||||
└─────────────────────────────────────────┴──────────┴─────────────────────┘
|
||||
```
|
||||
|
||||
## Running the Distribution
|
||||
|
||||
You can do this via Conda (build code) or Docker which has a pre-built image.
|
||||
|
||||
### Via Docker
|
||||
|
||||
This method allows you to get started quickly without having to build the distribution code.
|
||||
|
||||
```bash
|
||||
LLAMA_STACK_PORT=8321
|
||||
docker run \
|
||||
-it \
|
||||
--pull always \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ~/.llama:/root/.llama \
|
||||
llamastack/distribution-meta-reference-quantized-gpu \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
|
||||
```
|
||||
|
||||
If you are using Llama Stack Safety / Shield APIs, use:
|
||||
|
||||
```bash
|
||||
docker run \
|
||||
-it \
|
||||
--pull always \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ~/.llama:/root/.llama \
|
||||
llamastack/distribution-meta-reference-quantized-gpu \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
|
||||
--env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
|
||||
```
|
||||
|
||||
### Via Conda
|
||||
|
||||
Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
|
||||
|
||||
```bash
|
||||
llama stack build --template meta-reference-quantized-gpu --image-type conda
|
||||
llama stack run distributions/meta-reference-quantized-gpu/run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
|
||||
```
|
||||
|
||||
If you are using Llama Stack Safety / Shield APIs, use:
|
||||
|
||||
```bash
|
||||
llama stack run distributions/meta-reference-quantized-gpu/run-with-safety.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
|
||||
--env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
|
||||
```
|
|
@ -1,4 +1,3 @@
|
|||
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
|
||||
# NVIDIA Distribution
|
||||
|
||||
The `llamastack/distribution-nvidia` distribution consists of the following provider configurations.
|
||||
|
@ -6,130 +5,34 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
|
|||
| API | Provider(s) |
|
||||
|-----|-------------|
|
||||
| agents | `inline::meta-reference` |
|
||||
| datasetio | `inline::localfs`, `remote::nvidia` |
|
||||
| eval | `remote::nvidia` |
|
||||
| inference | `remote::nvidia` |
|
||||
| post_training | `remote::nvidia` |
|
||||
| safety | `remote::nvidia` |
|
||||
| scoring | `inline::basic` |
|
||||
| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
| safety | `inline::llama-guard` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `inline::rag-runtime` |
|
||||
| vector_io | `inline::faiss` |
|
||||
|
||||
|
||||
### Environment Variables
|
||||
|
||||
The following environment variables can be configured:
|
||||
|
||||
- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
|
||||
- `NVIDIA_API_KEY`: NVIDIA API Key (default: ``)
|
||||
- `NVIDIA_APPEND_API_VERSION`: Whether to append the API version to the base_url (default: `True`)
|
||||
- `NVIDIA_DATASET_NAMESPACE`: NVIDIA Dataset Namespace (default: `default`)
|
||||
- `NVIDIA_PROJECT_ID`: NVIDIA Project ID (default: `test-project`)
|
||||
- `NVIDIA_CUSTOMIZER_URL`: NVIDIA Customizer URL (default: `https://customizer.api.nvidia.com`)
|
||||
- `NVIDIA_OUTPUT_MODEL_DIR`: NVIDIA Output Model Directory (default: `test-example-model@v1`)
|
||||
- `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`)
|
||||
- `NVIDIA_EVALUATOR_URL`: URL for the NeMo Evaluator Service (default: `http://0.0.0.0:7331`)
|
||||
- `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`)
|
||||
- `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`)
|
||||
|
||||
### Models
|
||||
|
||||
The following models are available by default:
|
||||
|
||||
- `meta/llama3-8b-instruct (aliases: meta-llama/Llama-3-8B-Instruct)`
|
||||
- `meta/llama3-70b-instruct (aliases: meta-llama/Llama-3-70B-Instruct)`
|
||||
- `meta/llama-3.1-8b-instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
|
||||
- `meta/llama-3.1-70b-instruct (aliases: meta-llama/Llama-3.1-70B-Instruct)`
|
||||
- `meta/llama-3.1-405b-instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
|
||||
- `meta/llama-3.2-1b-instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)`
|
||||
- `meta/llama-3.2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
|
||||
- `meta/llama-3.2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
|
||||
- `meta/llama-3.2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
|
||||
- `meta/llama-3.3-70b-instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)`
|
||||
- `nvidia/llama-3.2-nv-embedqa-1b-v2 `
|
||||
- `nvidia/nv-embedqa-e5-v5 `
|
||||
- `nvidia/nv-embedqa-mistral-7b-v2 `
|
||||
- `snowflake/arctic-embed-l `
|
||||
- `${env.INFERENCE_MODEL} (None)`
|
||||
|
||||
|
||||
## Prerequisites
|
||||
### NVIDIA API Keys
|
||||
### Prerequisite: API Keys
|
||||
|
||||
Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/). Use this key for the `NVIDIA_API_KEY` environment variable.
|
||||
Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/).
|
||||
|
||||
### Deploy NeMo Microservices Platform
|
||||
The NVIDIA NeMo microservices platform supports end-to-end microservice deployment of a complete AI flywheel on your Kubernetes cluster through the NeMo Microservices Helm Chart. Please reference the [NVIDIA NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/latest/about/index.html) for platform prerequisites and instructions to install and deploy the platform.
|
||||
|
||||
## Supported Services
|
||||
Each Llama Stack API corresponds to a specific NeMo microservice. The core microservices (Customizer, Evaluator, Guardrails) are exposed by the same endpoint. The platform components (Data Store) are each exposed by separate endpoints.
|
||||
|
||||
### Inference: NVIDIA NIM
|
||||
NVIDIA NIM is used for running inference with registered models. There are two ways to access NVIDIA NIMs:
|
||||
1. Hosted (default): Preview APIs hosted at https://integrate.api.nvidia.com (Requires an API key)
|
||||
2. Self-hosted: NVIDIA NIMs that run on your own infrastructure.
|
||||
|
||||
The deployed platform includes the NIM Proxy microservice, which is the service that provides access to your NIMs (for example, to run inference on a model). Set the `NVIDIA_BASE_URL` environment variable to use your NVIDIA NIM Proxy deployment.
|
||||
|
||||
### Datasetio API: NeMo Data Store
|
||||
The NeMo Data Store microservice serves as the default file storage solution for the NeMo microservices platform. It exposes APIs compatible with the Hugging Face Hub client (`HfApi`), so you can use that client to interact with the Data Store. The `NVIDIA_DATASETS_URL` environment variable should point to your NeMo Data Store endpoint.
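For example, a minimal sketch of pointing the Hugging Face Hub client at the Data Store (the endpoint value and the listing call are illustrative; see the Datasetio docs linked below for the supported operations):

```python
import os

from huggingface_hub import HfApi

# Point HfApi at the NeMo Data Store instead of huggingface.co.
hf_api = HfApi(endpoint=os.environ["NVIDIA_DATASETS_URL"], token="")

# Enumerate datasets stored in the Data Store.
for dataset in hf_api.list_datasets():
    print(dataset.id)
```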
|
||||
|
||||
See the [NVIDIA Datasetio docs](/llama_stack/providers/remote/datasetio/nvidia/README.md) for supported features and example usage.
|
||||
|
||||
### Eval API: NeMo Evaluator
|
||||
The NeMo Evaluator microservice supports evaluation of LLMs. Launching an Evaluation job with NeMo Evaluator requires an Evaluation Config (an object that contains metadata needed by the job). A Llama Stack Benchmark maps to an Evaluation Config, so registering a Benchmark creates an Evaluation Config in NeMo Evaluator. The `NVIDIA_EVALUATOR_URL` environment variable should point to your NeMo Microservices endpoint.
|
||||
|
||||
See the [NVIDIA Eval docs](/llama_stack/providers/remote/eval/nvidia/README.md) for supported features and example usage.
|
||||
|
||||
### Post-Training API: NeMo Customizer
|
||||
The NeMo Customizer microservice supports fine-tuning models. You can reference [this list of supported models](/llama_stack/providers/remote/post_training/nvidia/models.py) that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint.
|
||||
|
||||
See the [NVIDIA Post-Training docs](/llama_stack/providers/remote/post_training/nvidia/README.md) for supported features and example usage.
|
||||
|
||||
### Safety API: NeMo Guardrails
|
||||
The NeMo Guardrails microservice sits between your application and the LLM, and adds checks and content moderation to a model. The `GUARDRAILS_SERVICE_URL` environment variable should point to your NeMo Microservices endpoint.
|
||||
|
||||
See the NVIDIA Safety docs for supported features and example usage.
|
||||
|
||||
## Deploying models
|
||||
In order to use a registered model with the Llama Stack APIs, ensure the corresponding NIM is deployed to your environment. For example, you can use the NIM Proxy microservice to deploy `meta/llama-3.2-1b-instruct`.
|
||||
|
||||
Note: For improved inference speeds, we need to use NIM with `fast_outlines` guided decoding system (specified in the request body). This is the default if you deployed the platform with the NeMo Microservices Helm Chart.
|
||||
```sh
|
||||
# URL to NeMo NIM Proxy service
|
||||
export NEMO_URL="http://nemo.test"
|
||||
|
||||
curl --location "$NEMO_URL/v1/deployment/model-deployments" \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"name": "llama-3.2-1b-instruct",
|
||||
"namespace": "meta",
|
||||
"config": {
|
||||
"model": "meta/llama-3.2-1b-instruct",
|
||||
"nim_deployment": {
|
||||
"image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
|
||||
"image_tag": "1.8.3",
|
||||
"pvc_size": "25Gi",
|
||||
"gpu": 1,
|
||||
"additional_envs": {
|
||||
"NIM_GUIDED_DECODING_BACKEND": "fast_outlines"
|
||||
}
|
||||
}
|
||||
}
|
||||
}'
|
||||
```
|
||||
This NIM deployment should take approximately 10 minutes to go live. [See the docs](https://docs.nvidia.com/nemo/microservices/latest/get-started/tutorials/deploy-nims.html) for more information on how to deploy a NIM and verify it's available for inference.
|
||||
|
||||
You can also remove a deployed NIM to free up GPU resources, if needed.
|
||||
```sh
|
||||
export NEMO_URL="http://nemo.test"
|
||||
|
||||
curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-instruct"
|
||||
```
|
||||
|
||||
## Running Llama Stack with NVIDIA
|
||||
|
||||
You can do this via Conda or venv (build code), or Docker which has a pre-built image.
|
||||
You can do this via Conda (build code) or Docker which has a pre-built image.
|
||||
|
||||
### Via Docker
|
||||
|
||||
|
@ -143,7 +46,7 @@ docker run \
|
|||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ./run.yaml:/root/my-run.yaml \
|
||||
llamastack/distribution-nvidia \
|
||||
--config /root/my-run.yaml \
|
||||
--yaml-config /root/my-run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env NVIDIA_API_KEY=$NVIDIA_API_KEY
|
||||
```
|
||||
|
@ -151,23 +54,8 @@ docker run \
|
|||
### Via Conda
|
||||
|
||||
```bash
|
||||
INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
|
||||
llama stack build --template nvidia --image-type conda
|
||||
llama stack run ./run.yaml \
|
||||
--port 8321 \
|
||||
--env NVIDIA_API_KEY=$NVIDIA_API_KEY \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL
|
||||
```
|
||||
|
||||
### Via venv
|
||||
|
||||
If you've set up your local development environment, you can also build the image using your local virtual environment.
|
||||
|
||||
```bash
|
||||
INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
|
||||
llama stack build --template nvidia --image-type venv
|
||||
llama stack run ./run.yaml \
|
||||
--port 8321 \
|
||||
--env NVIDIA_API_KEY=$NVIDIA_API_KEY \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL
|
||||
--env NVIDIA_API_KEY=$NVIDIA_API_KEY
|
||||
```
|
||||
|
|
|
@ -19,11 +19,10 @@ The `llamastack/distribution-ollama` distribution consists of the following prov
|
|||
| datasetio | `remote::huggingface`, `inline::localfs` |
|
||||
| eval | `inline::meta-reference` |
|
||||
| inference | `remote::ollama` |
|
||||
| post_training | `inline::huggingface` |
|
||||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
|
||||
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
|
||||
|
||||
|
@ -98,7 +97,7 @@ docker run \
|
|||
-v ~/.llama:/root/.llama \
|
||||
-v ./llama_stack/templates/ollama/run-with-safety.yaml:/root/my-run.yaml \
|
||||
llamastack/distribution-ollama \
|
||||
--config /root/my-run.yaml \
|
||||
--yaml-config /root/my-run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||
--env SAFETY_MODEL=$SAFETY_MODEL \
|
||||
|
|
|
@ -22,7 +22,7 @@ The `llamastack/distribution-passthrough` distribution consists of the following
|
|||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `remote::wolfram-alpha`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `remote::wolfram-alpha`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
|
||||
|
||||
|
|
|
@ -21,11 +21,11 @@ The `llamastack/distribution-remote-vllm` distribution consists of the following
|
|||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
|
||||
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
|
||||
|
||||
You can use this distribution if you want to run an independent vLLM server for inference.
|
||||
You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference.
|
||||
|
||||
### Environment Variables
|
||||
|
||||
|
@ -41,83 +41,6 @@ The following environment variables can be configured:
|
|||
|
||||
## Setting up vLLM server
|
||||
|
||||
In the following sections, we'll use AMD, NVIDIA or Intel GPUs to serve as hardware accelerators for the vLLM
|
||||
server, which acts as both the LLM inference provider and the safety provider. Note that vLLM also
|
||||
[supports many other hardware accelerators](https://docs.vllm.ai/en/latest/getting_started/installation.html) and
|
||||
that we only use GPUs here for demonstration purposes. Note that if you run into issues, you can add `--env VLLM_DEBUG_LOG_API_SERVER_RESPONSE=true` (available in vLLM v0.8.3 and above) to the `docker run` command to enable logging of API server responses for debugging.
|
||||
|
||||
### Setting up vLLM server on AMD GPU
|
||||
|
||||
AMD provides two main vLLM container options:
|
||||
- rocm/vllm: Production-ready container
|
||||
- rocm/vllm-dev: Development container with the latest vLLM features
|
||||
|
||||
Please check the [Blog about ROCm vLLM Usage](https://rocm.blogs.amd.com/software-tools-optimization/vllm-container/README.html) to get more details.
|
||||
|
||||
Here is a sample script to start a ROCm vLLM server locally via Docker:
|
||||
|
||||
```bash
|
||||
export INFERENCE_PORT=8000
|
||||
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
|
||||
export CUDA_VISIBLE_DEVICES=0
|
||||
export VLLM_DIMG="rocm/vllm-dev:main"
|
||||
|
||||
docker run \
|
||||
--pull always \
|
||||
--ipc=host \
|
||||
--privileged \
|
||||
--shm-size 16g \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--cap-add=CAP_SYS_ADMIN \
|
||||
--security-opt seccomp=unconfined \
|
||||
--security-opt apparmor=unconfined \
|
||||
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
|
||||
--env "HIP_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" \
|
||||
-p $INFERENCE_PORT:$INFERENCE_PORT \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
$VLLM_DIMG \
|
||||
python -m vllm.entrypoints.openai.api_server \
|
||||
--model $INFERENCE_MODEL \
|
||||
--port $INFERENCE_PORT
|
||||
```
|
||||
|
||||
Note that you'll also need to set `--enable-auto-tool-choice` and `--tool-call-parser` to [enable tool calling in vLLM](https://docs.vllm.ai/en/latest/features/tool_calling.html).
|
||||
|
||||
If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
|
||||
|
||||
```bash
|
||||
export SAFETY_PORT=8081
|
||||
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
|
||||
export CUDA_VISIBLE_DEVICES=1
|
||||
export VLLM_DIMG="rocm/vllm-dev:main"
|
||||
|
||||
docker run \
|
||||
--pull always \
|
||||
--ipc=host \
|
||||
--privileged \
|
||||
--shm-size 16g \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--cap-add=CAP_SYS_ADMIN \
|
||||
--security-opt seccomp=unconfined \
|
||||
--security-opt apparmor=unconfined \
|
||||
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
|
||||
--env "HIP_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" \
|
||||
-p $SAFETY_PORT:$SAFETY_PORT \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
$VLLM_DIMG \
|
||||
python -m vllm.entrypoints.openai.api_server \
|
||||
--model $SAFETY_MODEL \
|
||||
--port $SAFETY_PORT
|
||||
```
|
||||
|
||||
### Setting up vLLM server on NVIDIA GPU
|
||||
|
||||
Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) to get a vLLM endpoint. Here is a sample script to start a vLLM server locally via Docker:
|
||||
|
||||
```bash
|
||||
|
@ -162,55 +85,6 @@ docker run \
|
|||
--port $SAFETY_PORT
|
||||
```
|
||||
|
||||
### Setting up vLLM server on Intel GPU
|
||||
|
||||
Refer to the [vLLM Documentation for XPU](https://docs.vllm.ai/en/v0.8.2/getting_started/installation/gpu.html?device=xpu) to get a vLLM endpoint. In addition to the vLLM setup guide, which covers installing vLLM from source or building your own vLLM Docker container, Intel provides a prebuilt vLLM container for systems with Intel GPUs supported by the PyTorch XPU backend:
|
||||
- [intel/vllm](https://hub.docker.com/r/intel/vllm)
|
||||
|
||||
Here is a sample script to start a vLLM server locally via Docker using Intel provided container:
|
||||
|
||||
```bash
|
||||
export INFERENCE_PORT=8000
|
||||
export INFERENCE_MODEL=meta-llama/Llama-3.2-1B-Instruct
|
||||
export ZE_AFFINITY_MASK=0
|
||||
|
||||
docker run \
|
||||
--pull always \
|
||||
--device /dev/dri \
|
||||
-v /dev/dri/by-path:/dev/dri/by-path \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
|
||||
--env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \
|
||||
-p $INFERENCE_PORT:$INFERENCE_PORT \
|
||||
--ipc=host \
|
||||
intel/vllm:xpu \
|
||||
--gpu-memory-utilization 0.7 \
|
||||
--model $INFERENCE_MODEL \
|
||||
--port $INFERENCE_PORT
|
||||
```
|
||||
|
||||
If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
|
||||
|
||||
```bash
|
||||
export SAFETY_PORT=8081
|
||||
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
|
||||
export ZE_AFFINITY_MASK=1
|
||||
|
||||
docker run \
|
||||
--pull always \
|
||||
--device /dev/dri \
|
||||
-v /dev/dri/by-path:/dev/dri/by-path \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
|
||||
--env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \
|
||||
-p $SAFETY_PORT:$SAFETY_PORT \
|
||||
--ipc=host \
|
||||
intel/vllm:xpu \
|
||||
--gpu-memory-utilization 0.7 \
|
||||
--model $SAFETY_MODEL \
|
||||
--port $SAFETY_PORT
|
||||
```
|
||||
|
||||
## Running Llama Stack
|
||||
|
||||
Now you are ready to run Llama Stack with vLLM as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
|
||||
|
@ -233,7 +107,7 @@ docker run \
|
|||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ./llama_stack/templates/remote-vllm/run.yaml:/root/my-run.yaml \
|
||||
llamastack/distribution-remote-vllm \
|
||||
--config /root/my-run.yaml \
|
||||
--yaml-config /root/my-run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||
--env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT/v1
|
||||
|
@ -255,7 +129,7 @@ docker run \
|
|||
-v ~/.llama:/root/.llama \
|
||||
-v ./llama_stack/templates/remote-vllm/run-with-safety.yaml:/root/my-run.yaml \
|
||||
llamastack/distribution-remote-vllm \
|
||||
--config /root/my-run.yaml \
|
||||
--yaml-config /root/my-run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||
--env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT/v1 \
|
||||
|
|
|
@ -16,10 +16,10 @@ The `llamastack/distribution-sambanova` distribution consists of the following p
|
|||
| API | Provider(s) |
|
||||
|-----|-------------|
|
||||
| agents | `inline::meta-reference` |
|
||||
| inference | `remote::sambanova`, `inline::sentence-transformers` |
|
||||
| safety | `remote::sambanova` |
|
||||
| inference | `remote::sambanova` |
|
||||
| safety | `inline::llama-guard` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime` |
|
||||
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
|
||||
|
||||
|
@ -28,64 +28,52 @@ The `llamastack/distribution-sambanova` distribution consists of the following p
|
|||
The following environment variables can be configured:
|
||||
|
||||
- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
|
||||
- `SAMBANOVA_API_KEY`: SambaNova API Key (default: ``)
|
||||
- `SAMBANOVA_API_KEY`: SambaNova.AI API Key (default: ``)
|
||||
|
||||
### Models
|
||||
|
||||
The following models are available by default:
|
||||
|
||||
- `sambanova/Meta-Llama-3.1-8B-Instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
|
||||
- `sambanova/Meta-Llama-3.1-405B-Instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
|
||||
- `sambanova/Meta-Llama-3.2-1B-Instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)`
|
||||
- `sambanova/Meta-Llama-3.2-3B-Instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
|
||||
- `sambanova/Meta-Llama-3.3-70B-Instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)`
|
||||
- `sambanova/Llama-3.2-11B-Vision-Instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
|
||||
- `sambanova/Llama-3.2-90B-Vision-Instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
|
||||
- `sambanova/Llama-4-Scout-17B-16E-Instruct (aliases: meta-llama/Llama-4-Scout-17B-16E-Instruct)`
|
||||
- `sambanova/Llama-4-Maverick-17B-128E-Instruct (aliases: meta-llama/Llama-4-Maverick-17B-128E-Instruct)`
|
||||
- `sambanova/Meta-Llama-Guard-3-8B (aliases: meta-llama/Llama-Guard-3-8B)`
|
||||
- `Meta-Llama-3.1-8B-Instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
|
||||
- `Meta-Llama-3.1-70B-Instruct (aliases: meta-llama/Llama-3.1-70B-Instruct)`
|
||||
- `Meta-Llama-3.1-405B-Instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
|
||||
- `Meta-Llama-3.2-1B-Instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)`
|
||||
- `Meta-Llama-3.2-3B-Instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
|
||||
- `Meta-Llama-3.3-70B-Instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)`
|
||||
- `Llama-3.2-11B-Vision-Instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
|
||||
- `Llama-3.2-90B-Vision-Instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
|
||||
- `Meta-Llama-Guard-3-8B (aliases: meta-llama/Llama-Guard-3-8B)`
|
||||
|
||||
|
||||
### Prerequisite: API Keys
|
||||
|
||||
Make sure you have access to a SambaNova API Key. You can get one by visiting [SambaNova.ai](http://cloud.sambanova.ai?utm_source=llamastack&utm_medium=external&utm_campaign=cloud_signup).
|
||||
Make sure you have access to a SambaNova API Key. You can get one by visiting [SambaNova.ai](https://sambanova.ai/).
|
||||
|
||||
|
||||
## Running Llama Stack with SambaNova
|
||||
|
||||
You can do this via Conda (build code) or Docker which has a pre-built image.
|
||||
|
||||
|
||||
### Via Docker
|
||||
|
||||
This method allows you to get started quickly without having to build the distribution code.
|
||||
|
||||
```bash
|
||||
LLAMA_STACK_PORT=8321
|
||||
llama stack build --template sambanova --image-type container
|
||||
docker run \
|
||||
-it \
|
||||
--pull always \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ~/.llama:/root/.llama \
|
||||
distribution-sambanova \
|
||||
llamastack/distribution-sambanova \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env SAMBANOVA_API_KEY=$SAMBANOVA_API_KEY
|
||||
```
|
||||
|
||||
|
||||
### Via Venv
|
||||
|
||||
```bash
|
||||
llama stack build --template sambanova --image-type venv
|
||||
llama stack run --image-type venv ~/.llama/distributions/sambanova/sambanova-run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env SAMBANOVA_API_KEY=$SAMBANOVA_API_KEY
|
||||
```
|
||||
|
||||
|
||||
### Via Conda
|
||||
|
||||
```bash
|
||||
llama stack build --template sambanova --image-type conda
|
||||
llama stack run --image-type conda ~/.llama/distributions/sambanova/sambanova-run.yaml \
|
||||
llama stack run ./run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env SAMBANOVA_API_KEY=$SAMBANOVA_API_KEY
|
||||
```
|
||||
|
|
|
@ -23,7 +23,7 @@ The `llamastack/distribution-tgi` distribution consists of the following provide
|
|||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
|
||||
|
||||
|
@ -117,7 +117,7 @@ docker run \
|
|||
-v ~/.llama:/root/.llama \
|
||||
-v ./llama_stack/templates/tgi/run-with-safety.yaml:/root/my-run.yaml \
|
||||
llamastack/distribution-tgi \
|
||||
--config /root/my-run.yaml \
|
||||
--yaml-config /root/my-run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||
--env TGI_URL=http://host.docker.internal:$INFERENCE_PORT \
|
||||
|
|
|
@ -22,7 +22,7 @@ The `llamastack/distribution-together` distribution consists of the following pr
|
|||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
|
||||
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
|
||||
|
||||
|
@ -48,8 +48,6 @@ The following models are available by default:
|
|||
- `meta-llama/Llama-Guard-3-11B-Vision-Turbo (aliases: meta-llama/Llama-Guard-3-11B-Vision)`
|
||||
- `togethercomputer/m2-bert-80M-8k-retrieval `
|
||||
- `togethercomputer/m2-bert-80M-32k-retrieval `
|
||||
- `meta-llama/Llama-4-Scout-17B-16E-Instruct (aliases: meta-llama/Llama-4-Scout-17B-16E-Instruct, together/meta-llama/Llama-4-Scout-17B-16E-Instruct)`
|
||||
- `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 (aliases: meta-llama/Llama-4-Maverick-17B-128E-Instruct, together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8)`
|
||||
|
||||
|
||||
### Prerequisite: API Keys
|
||||
|
|
|
@ -1,32 +0,0 @@
|
|||
# Starting a Llama Stack Server
|
||||
|
||||
You can run a Llama Stack server in one of the following ways:
|
||||
|
||||
## As a Library:
|
||||
|
||||
This is the simplest way to get started. Using Llama Stack as a library means you do not need to start a server. This is especially useful when you are not running inference locally and are relying on an external inference service (e.g. fireworks, together, groq, etc.). See [Using Llama Stack as a Library](importing_as_library).
|
||||
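As a rough sketch, library mode looks like this (assuming the `llama-stack` package is installed and the `ollama` template has been built; the template name is illustrative):

```python
from llama_stack import LlamaStackAsLibraryClient

# Instantiate the stack in-process from a built template -- no server needed.
client = LlamaStackAsLibraryClient("ollama")
client.initialize()

# The client exposes the same APIs as a remote Llama Stack server.
for model in client.models.list():
    print(model.identifier)
```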
|
||||
|
||||
## Container:
|
||||
|
||||
Another simple way to start interacting with Llama Stack is to just spin up a container (via Docker or Podman) which is pre-built with all the providers you need. We provide a number of pre-built images so you can start a Llama Stack server instantly. You can also build your own custom container. Which distribution to choose depends on the hardware you have. See [Selection of a Distribution](selection) for more details.
|
||||
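For example, the pre-built Ollama distribution image can be started with a single `docker run` (a sketch assuming Ollama is already serving on the host at port 11434; the model name is an example):

```bash
export LLAMA_STACK_PORT=8321
docker run -it \
  --pull always \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ~/.llama:/root/.llama \
  llamastack/distribution-ollama \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=llama3.2:3b \
  --env OLLAMA_URL=http://host.docker.internal:11434
```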
|
||||
|
||||
## Conda:
|
||||
|
||||
If you have a custom or advanced setup, or you are developing on Llama Stack, you can also build a custom Llama Stack server. Using `llama stack build` and `llama stack run` you can build/run a custom Llama Stack server containing the exact combination of providers you wish. We have also provided various templates to make getting started easier. See [Building a Custom Distribution](building_distro) for more details.
|
||||
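A typical build/run loop looks like this (a sketch using the `ollama` template as an example; substitute your own template or distribution name):

```bash
# Build a conda environment containing the providers from the chosen template
llama stack build --template ollama --image-type conda

# Start the server from the generated run configuration
llama stack run --image-type conda ~/.llama/distributions/ollama/ollama-run.yaml --port 8321
```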
|
||||
|
||||
## Kubernetes:
|
||||
|
||||
If you have built a container image, you can deploy it in a Kubernetes cluster instead of starting the Llama Stack server locally. See [Kubernetes Deployment Guide](kubernetes_deployment) for more details.
|
||||
|
||||
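As a very rough sketch, the deployment boils down to a standard Kubernetes Deployment and Service around one of the pre-built images (all names, ports, and env values below are illustrative; the linked guide covers secrets, storage, and real configuration):

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-stack-server            # illustrative name
spec:
  replicas: 1
  selector:
    matchLabels: {app: llama-stack}
  template:
    metadata:
      labels: {app: llama-stack}
    spec:
      containers:
        - name: llama-stack
          image: llamastack/distribution-ollama   # any pre-built distribution image
          args: ["--port", "8321"]
          ports:
            - containerPort: 8321
          env:
            - name: OLLAMA_URL                    # illustrative: points at an Ollama service
              value: "http://ollama:11434"
---
apiVersion: v1
kind: Service
metadata:
  name: llama-stack
spec:
  selector: {app: llama-stack}
  ports:
    - port: 8321
      targetPort: 8321
```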
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
:hidden:
|
||||
|
||||
importing_as_library
|
||||
configuration
|
||||
kubernetes_deployment
|
||||
```
|
|
@ -1,539 +0,0 @@
|
|||
# Detailed Tutorial
|
||||
|
||||
In this guide, we'll walk through how you can use the Llama Stack (server and client SDK) to test a simple agent.
|
||||
A Llama Stack agent is a simple integrated system that can perform tasks by combining a Llama model for reasoning with
|
||||
tools (e.g., RAG, web search, code execution, etc.) for taking actions.
|
||||
In Llama Stack, we provide a server exposing multiple APIs. These APIs are backed by implementations from different providers.
|
||||
|
||||
Llama Stack is a stateful service with REST APIs to support seamless transition of AI applications across different environments. The server can be run in a variety of ways, including as a standalone binary, Docker container, or hosted service. You can build and test using a local server first and deploy to a hosted endpoint for production.
|
||||
|
||||
In this guide, we'll walk through how to build a RAG agent locally using Llama Stack with [Ollama](https://ollama.com/)
|
||||
as the inference [provider](../providers/index.md#inference) for a Llama Model.
|
||||
|
||||
## Step 1: Installation and Setup
|
||||
|
||||
Install Ollama by following the instructions on the [Ollama website](https://ollama.com/download), then
download the Llama 3.2 3B model and start the Ollama service:
|
||||
```bash
|
||||
ollama pull llama3.2:3b
|
||||
ollama run llama3.2:3b --keepalive 60m
|
||||
```
|
||||
|
||||
Install [uv](https://docs.astral.sh/uv/) to set up your virtual environment.
|
||||
|
||||
::::{tab-set}
|
||||
|
||||
:::{tab-item} macOS and Linux
|
||||
Use `curl` to download the script and execute it with `sh`:
|
||||
```console
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
```
|
||||
:::
|
||||
|
||||
:::{tab-item} Windows
|
||||
Use `irm` to download the script and execute it with `iex`:
|
||||
|
||||
```console
|
||||
powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex"
|
||||
```
|
||||
:::
|
||||
::::
|
||||
|
||||
Set up your virtual environment.
|
||||
|
||||
```bash
|
||||
uv sync --python 3.10
|
||||
source .venv/bin/activate
|
||||
```
|
||||
## Step 2: Run Llama Stack
|
||||
Llama Stack is a server that exposes multiple APIs; you connect to it using the Llama Stack client SDK.
|
||||
|
||||
::::{tab-set}
|
||||
|
||||
:::{tab-item} Using `venv`
|
||||
You can use Python to build and run the Llama Stack server, which is useful for testing and development.
|
||||
|
||||
Llama Stack uses a [YAML configuration file](../distributions/configuration.md) to specify the stack setup,
|
||||
which defines the providers and their settings.
|
||||
Now let's build and run the Llama Stack config for Ollama.
|
||||
|
||||
```bash
|
||||
INFERENCE_MODEL=llama3.2:3b llama stack build --template ollama --image-type venv --run
|
||||
```
|
||||
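For reference, the build step writes a run configuration along these lines (an abridged sketch of an Ollama-based config; exact fields and defaults can vary between Llama Stack versions):

```yaml
version: '2'
image_name: ollama
apis:
  - inference
  - vector_io
  - agents
providers:
  inference:
    - provider_id: ollama
      provider_type: remote::ollama
      config:
        url: ${env.OLLAMA_URL:http://localhost:11434}
  vector_io:
    - provider_id: faiss
      provider_type: inline::faiss
      config: {}
server:
  port: 8321
```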
:::
|
||||
:::{tab-item} Using `conda`
|
||||
You can use Python to build and run the Llama Stack server, which is useful for testing and development.
|
||||
|
||||
Llama Stack uses a [YAML configuration file](../distributions/configuration.md) to specify the stack setup,
|
||||
which defines the providers and their settings.
|
||||
Now let's build and run the Llama Stack config for Ollama.
|
||||
|
||||
```bash
|
||||
INFERENCE_MODEL=llama3.2:3b llama stack build --template ollama --image-type conda --image-name llama3-3b-conda --run
|
||||
```
|
||||
:::
|
||||
:::{tab-item} Using a Container
|
||||
You can use a container image to run the Llama Stack server. We provide several container images for the server
|
||||
component that works with different inference providers out of the box. For this guide, we will use
|
||||
`llamastack/distribution-ollama` as the container image. If you'd like to build your own image or customize the
|
||||
configurations, please check out [this guide](../references/index.md).
|
||||
First, let's set up some environment variables and create a local directory to mount into the container’s file system.
|
||||
```bash
|
||||
export INFERENCE_MODEL="llama3.2:3b"
|
||||
export LLAMA_STACK_PORT=8321
|
||||
mkdir -p ~/.llama
|
||||
```
|
||||
Then start the server using the container tool of your choice. For example, if you are running Docker you can use the
|
||||
following command:
|
||||
```bash
|
||||
docker run -it \
|
||||
--pull always \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ~/.llama:/root/.llama \
|
||||
llamastack/distribution-ollama \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||
--env OLLAMA_URL=http://host.docker.internal:11434
|
||||
```
|
||||
Note to start the container with Podman, you can do the same but replace `docker` at the start of the command with
|
||||
`podman`. If you are using `podman` older than `4.7.0`, please also replace `host.docker.internal` in the `OLLAMA_URL`
|
||||
with `host.containers.internal`.
|
||||
|
||||
The configuration YAML for the Ollama distribution is available at `distributions/ollama/run.yaml`.
|
||||
|
||||
```{tip}
|
||||
|
||||
Docker containers run in their own isolated network namespaces on Linux. To allow the container to communicate with services running on the host via `localhost`, you need `--network=host`. This makes the container use the host’s network directly so it can connect to Ollama running on `localhost:11434`.
|
||||
|
||||
Linux users having issues running the above command should instead try the following:
|
||||
```bash
|
||||
docker run -it \
|
||||
--pull always \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ~/.llama:/root/.llama \
|
||||
--network=host \
|
||||
llamastack/distribution-ollama \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||
--env OLLAMA_URL=http://localhost:11434
|
||||
```
|
||||
:::
|
||||
::::
|
||||
You will see output like below:
|
||||
```
|
||||
INFO: Application startup complete.
|
||||
INFO: Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit)
|
||||
```
|
||||
|
||||
Now you can use the Llama Stack client to run inference and build agents!
|
||||
|
||||
You can reuse the server setup or use the [Llama Stack Client](https://github.com/meta-llama/llama-stack-client-python/).
|
||||
Note that the client package is already included in the `llama-stack` package.
|
||||
|
||||
## Step 3: Run Client CLI
|
||||
|
||||
Open a new terminal and navigate to the same directory you started the server from. Then activate your existing
server virtual environment or set up a new one.
|
||||
|
||||
::::{tab-set}
|
||||
|
||||
:::{tab-item} Reuse Server `venv`
|
||||
```bash
|
||||
# The client is included in the llama-stack package so we just activate the server venv
|
||||
source .venv/bin/activate
|
||||
```
|
||||
:::
|
||||
|
||||
:::{tab-item} Install with `venv`
|
||||
```bash
|
||||
uv venv client --python 3.10
|
||||
source client/bin/activate
|
||||
pip install llama-stack-client
|
||||
```
|
||||
:::
|
||||
|
||||
:::{tab-item} Install with `conda`
|
||||
```bash
|
||||
yes | conda create -n stack-client python=3.10
|
||||
conda activate stack-client
|
||||
pip install llama-stack-client
|
||||
```
|
||||
:::
|
||||
::::
|
||||
|
||||
Now let's use the `llama-stack-client` [CLI](../references/llama_stack_client_cli_reference.md) to check the
|
||||
connectivity to the server.
|
||||
|
||||
```bash
|
||||
llama-stack-client configure --endpoint http://localhost:8321 --api-key none
|
||||
```
|
||||
You will see the below:
|
||||
```
|
||||
Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:8321
|
||||
```
|
||||
|
||||
List the models
|
||||
```bash
|
||||
llama-stack-client models list
|
||||
Available Models
|
||||
|
||||
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
|
||||
┃ model_type ┃ identifier ┃ provider_resource_id ┃ metadata ┃ provider_id ┃
|
||||
┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
|
||||
│ embedding │ all-MiniLM-L6-v2 │ all-minilm:latest │ {'embedding_dimension': 384.0} │ ollama │
|
||||
├─────────────────┼─────────────────────────────────────┼─────────────────────────────────────┼───────────────────────────────────────────┼─────────────────┤
|
||||
│ llm │ llama3.2:3b │ llama3.2:3b │ │ ollama │
|
||||
└─────────────────┴─────────────────────────────────────┴─────────────────────────────────────┴───────────────────────────────────────────┴─────────────────┘
|
||||
|
||||
Total models: 2
|
||||
|
||||
```
|
||||
You can test basic Llama inference completion using the CLI.
|
||||
|
||||
```bash
|
||||
llama-stack-client inference chat-completion --message "tell me a joke"
|
||||
```
|
||||
Sample output:
|
||||
```python
|
||||
ChatCompletionResponse(
|
||||
completion_message=CompletionMessage(
|
||||
content="Here's one:\n\nWhat do you call a fake noodle?\n\nAn impasta!",
|
||||
role="assistant",
|
||||
stop_reason="end_of_turn",
|
||||
tool_calls=[],
|
||||
),
|
||||
logprobs=None,
|
||||
metrics=[
|
||||
Metric(metric="prompt_tokens", value=14.0, unit=None),
|
||||
Metric(metric="completion_tokens", value=27.0, unit=None),
|
||||
Metric(metric="total_tokens", value=41.0, unit=None),
|
||||
],
|
||||
)
|
||||
```
|
||||
|
||||
## Step 4: Run the Demos
|
||||
|
||||
Note that these demos show the [Python Client SDK](../references/python_sdk_reference/index.md).
|
||||
Other SDKs are also available; please refer to the [Client SDK](../index.md#client-sdks) list for the complete options.
|
||||
|
||||
::::{tab-set}
|
||||
|
||||
:::{tab-item} Basic Inference
|
||||
Now you can run inference using the Llama Stack client SDK.
|
||||
|
||||
### i. Create the Script
|
||||
|
||||
Create a file `inference.py` and add the following code:
|
||||
```python
|
||||
from llama_stack_client import LlamaStackClient
|
||||
|
||||
client = LlamaStackClient(base_url="http://localhost:8321")
|
||||
|
||||
# List available models
|
||||
models = client.models.list()
|
||||
|
||||
# Select the first LLM
|
||||
llm = next(m for m in models if m.model_type == "llm")
|
||||
model_id = llm.identifier
|
||||
|
||||
print("Model:", model_id)
|
||||
|
||||
response = client.inference.chat_completion(
|
||||
model_id=model_id,
|
||||
messages=[
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": "Write a haiku about coding"},
|
||||
],
|
||||
)
|
||||
print(response.completion_message.content)
|
||||
```
|
||||
|
||||
### ii. Run the Script
|
||||
Let's run the script using `uv`
|
||||
```bash
|
||||
uv run python inference.py
|
||||
```
|
||||
Which will output:
|
||||
```
|
||||
Model: llama3.2:3b
|
||||
Here is a haiku about coding:
|
||||
|
||||
Lines of code unfold
|
||||
Logic flows through digital night
|
||||
Beauty in the bits
|
||||
```
|
||||
:::
|
||||
|
||||
:::{tab-item} Build a Simple Agent
|
||||
Next we can move beyond simple inference and build an agent that can perform tasks using the Llama Stack server.
|
||||
### i. Create the Script
|
||||
Create a file `agent.py` and add the following code:
|
||||
|
||||
```python
|
||||
from llama_stack_client import LlamaStackClient
|
||||
from llama_stack_client import Agent, AgentEventLogger
|
||||
from rich.pretty import pprint
|
||||
import uuid
|
||||
|
||||
client = LlamaStackClient(base_url=f"http://localhost:8321")
|
||||
|
||||
models = client.models.list()
|
||||
llm = next(m for m in models if m.model_type == "llm")
|
||||
model_id = llm.identifier
|
||||
|
||||
agent = Agent(client, model=model_id, instructions="You are a helpful assistant.")
|
||||
|
||||
s_id = agent.create_session(session_name=f"s{uuid.uuid4().hex}")
|
||||
|
||||
print("Non-streaming ...")
|
||||
response = agent.create_turn(
|
||||
messages=[{"role": "user", "content": "Who are you?"}],
|
||||
session_id=s_id,
|
||||
stream=False,
|
||||
)
|
||||
print("agent>", response.output_message.content)
|
||||
|
||||
print("Streaming ...")
|
||||
stream = agent.create_turn(
|
||||
messages=[{"role": "user", "content": "Who are you?"}], session_id=s_id, stream=True
|
||||
)
|
||||
for event in stream:
|
||||
pprint(event)
|
||||
|
||||
print("Streaming with print helper...")
|
||||
stream = agent.create_turn(
|
||||
messages=[{"role": "user", "content": "Who are you?"}], session_id=s_id, stream=True
|
||||
)
|
||||
for event in AgentEventLogger().log(stream):
|
||||
event.print()
|
||||
```
|
||||
### ii. Run the Script
|
||||
Let's run the script using `uv`
|
||||
```bash
|
||||
uv run python agent.py
|
||||
```
|
||||
|
||||
```{dropdown} 👋 Click here to see the sample output
|
||||
Non-streaming ...
|
||||
agent> I'm an artificial intelligence designed to assist and communicate with users like you. I don't have a personal identity, but I'm here to provide information, answer questions, and help with tasks to the best of my abilities.
|
||||
|
||||
I can be used for a wide range of purposes, such as:
|
||||
|
||||
* Providing definitions and explanations
|
||||
* Offering suggestions and ideas
|
||||
* Helping with language translation
|
||||
* Assisting with writing and proofreading
|
||||
* Generating text or responses to questions
|
||||
* Playing simple games or chatting about topics of interest
|
||||
|
||||
I'm constantly learning and improving my abilities, so feel free to ask me anything, and I'll do my best to help!
|
||||
|
||||
Streaming ...
|
||||
AgentTurnResponseStreamChunk(
|
||||
│ event=TurnResponseEvent(
|
||||
│ │ payload=AgentTurnResponseStepStartPayload(
|
||||
│ │ │ event_type='step_start',
|
||||
│ │ │ step_id='69831607-fa75-424a-949b-e2049e3129d1',
|
||||
│ │ │ step_type='inference',
|
||||
│ │ │ metadata={}
|
||||
│ │ )
|
||||
│ )
|
||||
)
|
||||
AgentTurnResponseStreamChunk(
|
||||
│ event=TurnResponseEvent(
|
||||
│ │ payload=AgentTurnResponseStepProgressPayload(
|
||||
│ │ │ delta=TextDelta(text='As', type='text'),
|
||||
│ │ │ event_type='step_progress',
|
||||
│ │ │ step_id='69831607-fa75-424a-949b-e2049e3129d1',
|
||||
│ │ │ step_type='inference'
|
||||
│ │ )
|
||||
│ )
|
||||
)
|
||||
AgentTurnResponseStreamChunk(
|
||||
│ event=TurnResponseEvent(
|
||||
│ │ payload=AgentTurnResponseStepProgressPayload(
|
||||
│ │ │ delta=TextDelta(text=' a', type='text'),
|
||||
│ │ │ event_type='step_progress',
|
||||
│ │ │ step_id='69831607-fa75-424a-949b-e2049e3129d1',
|
||||
│ │ │ step_type='inference'
|
||||
│ │ )
|
||||
│ )
|
||||
)
|
||||
...
|
||||
AgentTurnResponseStreamChunk(
|
||||
│ event=TurnResponseEvent(
|
||||
│ │ payload=AgentTurnResponseStepCompletePayload(
|
||||
│ │ │ event_type='step_complete',
|
||||
│ │ │ step_details=InferenceStep(
|
||||
│ │ │ │ api_model_response=CompletionMessage(
|
||||
│ │ │ │ │ content='As a conversational AI, I don\'t have a personal identity in the classical sense. I exist as a program running on computer servers, designed to process and respond to text-based inputs.\n\nI\'m an instance of a type of artificial intelligence called a "language model," which is trained on vast amounts of text data to generate human-like responses. My primary function is to understand and respond to natural language inputs, like our conversation right now.\n\nThink of me as a virtual assistant, a chatbot, or a conversational interface – I\'m here to provide information, answer questions, and engage in conversation to the best of my abilities. I don\'t have feelings, emotions, or consciousness like humans do, but I\'m designed to simulate human-like interactions to make our conversations feel more natural and helpful.\n\nSo, that\'s me in a nutshell! What can I help you with today?',
|
||||
│ │ │ │ │ role='assistant',
|
||||
│ │ │ │ │ stop_reason='end_of_turn',
|
||||
│ │ │ │ │ tool_calls=[]
|
||||
│ │ │ │ ),
|
||||
│ │ │ │ step_id='69831607-fa75-424a-949b-e2049e3129d1',
|
||||
│ │ │ │ step_type='inference',
|
||||
│ │ │ │ turn_id='8b360202-f7cb-4786-baa9-166a1b46e2ca',
|
||||
│ │ │ │ completed_at=datetime.datetime(2025, 4, 3, 1, 15, 21, 716174, tzinfo=TzInfo(UTC)),
|
||||
│ │ │ │ started_at=datetime.datetime(2025, 4, 3, 1, 15, 14, 28823, tzinfo=TzInfo(UTC))
|
||||
│ │ │ ),
|
||||
│ │ │ step_id='69831607-fa75-424a-949b-e2049e3129d1',
|
||||
│ │ │ step_type='inference'
|
||||
│ │ )
|
||||
│ )
|
||||
)
|
||||
AgentTurnResponseStreamChunk(
|
||||
│ event=TurnResponseEvent(
|
||||
│ │ payload=AgentTurnResponseTurnCompletePayload(
|
||||
│ │ │ event_type='turn_complete',
|
||||
│ │ │ turn=Turn(
|
||||
│ │ │ │ input_messages=[UserMessage(content='Who are you?', role='user', context=None)],
|
||||
│ │ │ │ output_message=CompletionMessage(
|
||||
│ │ │ │ │ content='As a conversational AI, I don\'t have a personal identity in the classical sense. I exist as a program running on computer servers, designed to process and respond to text-based inputs.\n\nI\'m an instance of a type of artificial intelligence called a "language model," which is trained on vast amounts of text data to generate human-like responses. My primary function is to understand and respond to natural language inputs, like our conversation right now.\n\nThink of me as a virtual assistant, a chatbot, or a conversational interface – I\'m here to provide information, answer questions, and engage in conversation to the best of my abilities. I don\'t have feelings, emotions, or consciousness like humans do, but I\'m designed to simulate human-like interactions to make our conversations feel more natural and helpful.\n\nSo, that\'s me in a nutshell! What can I help you with today?',
|
||||
│ │ │ │ │ role='assistant',
|
||||
│ │ │ │ │ stop_reason='end_of_turn',
|
||||
│ │ │ │ │ tool_calls=[]
|
||||
│ │ │ │ ),
|
||||
│ │ │ │ session_id='abd4afea-4324-43f4-9513-cfe3970d92e8',
|
||||
│ │ │ │ started_at=datetime.datetime(2025, 4, 3, 1, 15, 14, 28722, tzinfo=TzInfo(UTC)),
|
||||
│ │ │ │ steps=[
|
||||
│ │ │ │ │ InferenceStep(
|
||||
│ │ │ │ │ │ api_model_response=CompletionMessage(
|
||||
│ │ │ │ │ │ │ content='As a conversational AI, I don\'t have a personal identity in the classical sense. I exist as a program running on computer servers, designed to process and respond to text-based inputs.\n\nI\'m an instance of a type of artificial intelligence called a "language model," which is trained on vast amounts of text data to generate human-like responses. My primary function is to understand and respond to natural language inputs, like our conversation right now.\n\nThink of me as a virtual assistant, a chatbot, or a conversational interface – I\'m here to provide information, answer questions, and engage in conversation to the best of my abilities. I don\'t have feelings, emotions, or consciousness like humans do, but I\'m designed to simulate human-like interactions to make our conversations feel more natural and helpful.\n\nSo, that\'s me in a nutshell! What can I help you with today?',
|
||||
│ │ │ │ │ │ │ role='assistant',
|
||||
│ │ │ │ │ │ │ stop_reason='end_of_turn',
|
||||
│ │ │ │ │ │ │ tool_calls=[]
|
||||
│ │ │ │ │ │ ),
|
||||
│ │ │ │ │ │ step_id='69831607-fa75-424a-949b-e2049e3129d1',
|
||||
│ │ │ │ │ │ step_type='inference',
|
||||
│ │ │ │ │ │ turn_id='8b360202-f7cb-4786-baa9-166a1b46e2ca',
|
||||
│ │ │ │ │ │ completed_at=datetime.datetime(2025, 4, 3, 1, 15, 21, 716174, tzinfo=TzInfo(UTC)),
|
||||
│ │ │ │ │ │ started_at=datetime.datetime(2025, 4, 3, 1, 15, 14, 28823, tzinfo=TzInfo(UTC))
|
||||
│ │ │ │ │ )
|
||||
│ │ │ │ ],
|
||||
│ │ │ │ turn_id='8b360202-f7cb-4786-baa9-166a1b46e2ca',
|
||||
│ │ │ │ completed_at=datetime.datetime(2025, 4, 3, 1, 15, 21, 727364, tzinfo=TzInfo(UTC)),
|
||||
│ │ │ │ output_attachments=[]
|
||||
│ │ │ )
|
||||
│ │ )
|
||||
│ )
|
||||
)
|
||||
|
||||
|
||||
Streaming with print helper...
|
||||
inference> Déjà vu!
|
||||
|
||||
As I mentioned earlier, I'm an artificial intelligence language model. I don't have a personal identity or consciousness like humans do. I exist solely to process and respond to text-based inputs, providing information and assistance on a wide range of topics.
|
||||
|
||||
I'm a computer program designed to simulate human-like conversations, using natural language processing (NLP) and machine learning algorithms to understand and generate responses. My purpose is to help users like you with their questions, provide information, and engage in conversation.
|
||||
|
||||
Think of me as a virtual companion, a helpful tool designed to make your interactions more efficient and enjoyable. I don't have personal opinions, emotions, or biases, but I'm here to provide accurate and informative responses to the best of my abilities.
|
||||
|
||||
So, who am I? I'm just a computer program designed to help you!
|
||||
```
|
||||
:::
|
||||
|
||||
:::{tab-item} Build a RAG Agent
|
||||
|
||||
For our last demo, we can build a RAG agent that can answer questions about the Torchtune project using the documents
|
||||
in a vector database.
|
||||
### i. Create the Script
|
||||
Create a file `rag_agent.py` and add the following code:
|
||||
|
||||
```python
|
||||
from llama_stack_client import LlamaStackClient
|
||||
from llama_stack_client import Agent, AgentEventLogger
|
||||
from llama_stack_client.types import Document
|
||||
import uuid
|
||||
|
||||
client = LlamaStackClient(base_url="http://localhost:8321")
|
||||
|
||||
# Create a vector database instance
|
||||
embed_lm = next(m for m in client.models.list() if m.model_type == "embedding")
|
||||
embedding_model = embed_lm.identifier
|
||||
vector_db_id = f"v{uuid.uuid4().hex}"
|
||||
client.vector_dbs.register(
|
||||
vector_db_id=vector_db_id,
|
||||
embedding_model=embedding_model,
|
||||
)
|
||||
|
||||
# Create Documents
|
||||
urls = [
|
||||
"memory_optimizations.rst",
|
||||
"chat.rst",
|
||||
"llama3.rst",
|
||||
"qat_finetune.rst",
|
||||
"lora_finetune.rst",
|
||||
]
|
||||
documents = [
|
||||
Document(
|
||||
document_id=f"num-{i}",
|
||||
content=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}",
|
||||
mime_type="text/plain",
|
||||
metadata={},
|
||||
)
|
||||
for i, url in enumerate(urls)
|
||||
]
|
||||
|
||||
# Insert documents
|
||||
client.tool_runtime.rag_tool.insert(
|
||||
documents=documents,
|
||||
vector_db_id=vector_db_id,
|
||||
chunk_size_in_tokens=512,
|
||||
)
|
||||
|
||||
# Get the model being served
|
||||
llm = next(m for m in client.models.list() if m.model_type == "llm")
|
||||
model = llm.identifier
|
||||
|
||||
# Create the RAG agent
|
||||
rag_agent = Agent(
|
||||
client,
|
||||
model=model,
|
||||
instructions="You are a helpful assistant. Use the RAG tool to answer questions as needed.",
|
||||
tools=[
|
||||
{
|
||||
"name": "builtin::rag/knowledge_search",
|
||||
"args": {"vector_db_ids": [vector_db_id]},
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
session_id = rag_agent.create_session(session_name=f"s{uuid.uuid4().hex}")
|
||||
|
||||
turns = ["what is torchtune", "tell me about dora"]
|
||||
|
||||
for t in turns:
|
||||
print("user>", t)
|
||||
stream = rag_agent.create_turn(
|
||||
messages=[{"role": "user", "content": t}], session_id=session_id, stream=True
|
||||
)
|
||||
for event in AgentEventLogger().log(stream):
|
||||
event.print()
|
||||
```
|
||||
### ii. Run the Script
|
||||
Let's run the script using `uv`
|
||||
```bash
|
||||
uv run python rag_agent.py
|
||||
```
|
||||
|
||||
```{dropdown} 👋 Click here to see the sample output
|
||||
user> what is torchtune
|
||||
inference> [knowledge_search(query='TorchTune')]
|
||||
tool_execution> Tool:knowledge_search Args:{'query': 'TorchTune'}
|
||||
tool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text='Result 1:\nDocument_id:num-1\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. ..., type='text'), TextContentItem(text='END of knowledge_search tool results.\n', type='text')]
|
||||
inference> Here is a high-level overview of the text:
|
||||
|
||||
**LoRA Finetuning with PyTorch Tune**
|
||||
|
||||
PyTorch Tune provides a recipe for LoRA (Low-Rank Adaptation) finetuning, which is a technique to adapt pre-trained models to new tasks. The recipe uses the `lora_finetune_distributed` command.
|
||||
...
|
||||
Overall, DORA is a powerful reinforcement learning algorithm that can learn complex tasks from human demonstrations. However, it requires careful consideration of the challenges and limitations to achieve optimal results.
|
||||
```
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
**You're Ready to Build Your Own Apps!**
|
||||
|
||||
Congrats! 🥳 Now you're ready to [build your own Llama Stack applications](../building_applications/index)! 🚀
|
|
@ -1,121 +1,303 @@
|
|||
# Quickstart
|
||||
# Quick Start
|
||||
|
||||
Get started with Llama Stack in minutes!
|
||||
In this guide, we'll walk through how you can use the Llama Stack (server and client SDK) to test a simple RAG agent.
|
||||
|
||||
Llama Stack is a stateful service with REST APIs to support the seamless transition of AI applications across different
|
||||
environments. You can build and test using a local server first and deploy to a hosted endpoint for production.
|
||||
A Llama Stack agent is a simple integrated system that can perform tasks by combining a Llama model for reasoning with tools (e.g., RAG, web search, code execution, etc.) for taking actions.
|
||||
|
||||
In this guide, we'll walk through how to build a RAG application locally using Llama Stack with [Ollama](https://ollama.com/)
|
||||
as the inference [provider](../providers/index.md#inference) for a Llama Model.
|
||||
In Llama Stack, we provide a server exposing multiple APIs. These APIs are backed by implementations from different providers. For this guide, we will use [Ollama](https://ollama.com/) as the inference provider.
|
||||
|
||||
|
||||
### 1. Start Ollama
|
||||
|
||||
#### Step 1: Install and setup
|
||||
1. Install [uv](https://docs.astral.sh/uv/)
|
||||
2. Run inference on a Llama model with [Ollama](https://ollama.com/download)
|
||||
```bash
|
||||
ollama run llama3.2:3b --keepalive 60m
|
||||
ollama run llama3.2:3b-instruct-fp16 --keepalive 60m
|
||||
```
|
||||
#### Step 2: Run the Llama Stack server
|
||||
We will use `uv` to run the Llama Stack server.
|
||||
|
||||
By default, Ollama keeps the model loaded in memory for 5 minutes, which can be too short. We set the `--keepalive` flag to 60 minutes to ensure the model remains loaded for some time.
|
||||
|
||||
```{admonition} Note
|
||||
:class: tip
|
||||
|
||||
If you do not have ollama, you can install it from [here](https://ollama.com/download).
|
||||
```
|
||||
|
||||
|
||||
### 2. Pick a client environment
|
||||
|
||||
Llama Stack has a service-oriented architecture, so every interaction with the Stack happens through a REST interface. You can interact with the Stack in two ways (both options are sketched after the note below):
|
||||
|
||||
* Install the `llama-stack-client` PyPI package and point `LlamaStackClient` to a local or remote Llama Stack server.
|
||||
* Or, install the `llama-stack` PyPI package and use the Stack as a library using `LlamaStackAsLibraryClient`.
|
||||
|
||||
```{admonition} Note
|
||||
:class: tip
|
||||
|
||||
The API is **exactly identical** for both clients.
|
||||
```
|
||||
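Here is a minimal sketch of both options (assuming a server already running on `localhost:8321` for the first, and a built `ollama` template for the second; model and template names are illustrative):

```python
# Option 1: talk to a running Llama Stack server over HTTP
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Option 2: run the Stack in-process as a library (no server required)
# from llama_stack import LlamaStackAsLibraryClient
# client = LlamaStackAsLibraryClient("ollama")
# client.initialize()

# Either way, the client exposes the same APIs
print([m.identifier for m in client.models.list()])
```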
|
||||
:::{dropdown} Starting up the Llama Stack server
|
||||
The Llama Stack server can be configured flexibly so you can mix-and-match various providers for its individual API components -- beyond Inference, these include Vector IO, Agents, Telemetry, Evals, Post Training, etc.
|
||||
|
||||
To get started quickly, we provide various container images for the server component that work with different inference providers out of the box. For this guide, we will use `llamastack/distribution-ollama` as the container image. If you'd like to build your own image or customize the configurations, please check out [this guide](../references/index.md).
|
||||
|
||||
Let's set up some environment variables that we will use in the rest of the guide.
|
||||
```bash
|
||||
INFERENCE_MODEL=llama3.2:3b uv run --with llama-stack llama stack build --template ollama --image-type venv --run
|
||||
export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct"
|
||||
export LLAMA_STACK_PORT=8321
|
||||
```
|
||||
#### Step 3: Run the demo
|
||||
Now open up a new terminal and copy the following script into a file named `demo_script.py`.
|
||||
|
||||
Next you can create a local directory to mount into the container’s file system.
|
||||
```bash
|
||||
mkdir -p ~/.llama
|
||||
```
|
||||
|
||||
Then you can start the server using the container tool of your choice. For example, if you are running Docker you can use the following command:
|
||||
```bash
|
||||
docker run -it \
|
||||
--pull always \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ~/.llama:/root/.llama \
|
||||
llamastack/distribution-ollama \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||
--env OLLAMA_URL=http://host.docker.internal:11434
|
||||
```
|
||||
|
||||
As another example, to start the container with Podman, you can do the same but replace `docker` at the start of the command with `podman`. If you are using `podman` older than `4.7.0`, please also replace `host.docker.internal` in the `OLLAMA_URL` with `host.containers.internal`.
|
||||
|
||||
Configuration for this is available at `distributions/ollama/run.yaml`.
|
||||
|
||||
```{admonition} Note
|
||||
:class: note
|
||||
|
||||
Docker containers run in their own isolated network namespaces on Linux. To allow the container to communicate with services running on the host via `localhost`, you need `--network=host`. This makes the container use the host’s network directly so it can connect to Ollama running on `localhost:11434`.
|
||||
|
||||
Linux users having issues running the above command should instead try the following:
|
||||
```bash
|
||||
docker run -it \
|
||||
--pull always \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ~/.llama:/root/.llama \
|
||||
--network=host \
|
||||
llamastack/distribution-ollama \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||
--env OLLAMA_URL=http://localhost:11434
|
||||
```
|
||||
|
||||
:::
|
||||
|
||||
|
||||
:::{dropdown} Installing the Llama Stack client CLI and SDK
|
||||
|
||||
You can interact with the Llama Stack server using various client SDKs. Note that you must be using Python 3.10 or newer. We will use the Python SDK which you can install via `conda` or `virtualenv`.
|
||||
|
||||
For `conda`:
|
||||
```bash
|
||||
yes | conda create -n stack-client python=3.10
|
||||
conda activate stack-client
|
||||
pip install llama-stack-client
|
||||
```
|
||||
|
||||
For `virtualenv`:
|
||||
```bash
|
||||
python -m venv stack-client
|
||||
source stack-client/bin/activate
|
||||
pip install llama-stack-client
|
||||
```
|
||||
|
||||
Let's use the `llama-stack-client` CLI to check the connectivity to the server.
|
||||
|
||||
```bash
|
||||
$ llama-stack-client configure --endpoint http://localhost:$LLAMA_STACK_PORT
|
||||
> Enter the API key (leave empty if no key is needed):
|
||||
Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:8321
|
||||
|
||||
$ llama-stack-client models list
|
||||
|
||||
Available Models
|
||||
|
||||
┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━┓
|
||||
┃ model_type ┃ identifier ┃ provider_resource_id ┃ metadata ┃ provider_id ┃
|
||||
┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━┩
|
||||
│ llm │ meta-llama/Llama-3.2-3B-Instruct │ llama3.2:3b-instruct-fp16 │ │ ollama │
|
||||
└──────────────┴──────────────────────────────────────┴──────────────────────────────┴───────────┴─────────────┘
|
||||
|
||||
Total models: 1
|
||||
```
|
||||
|
||||
You can test basic Llama inference completion using the CLI too.
|
||||
```bash
|
||||
llama-stack-client \
|
||||
inference chat-completion \
|
||||
--message "hello, what model are you?"
|
||||
```
|
||||
:::
|
||||
|
||||
|
||||
|
||||
### 3. Run inference with Python SDK
|
||||
|
||||
Here is a simple example to perform chat completions using the SDK.
|
||||
```python
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
def create_http_client():
|
||||
from llama_stack_client import LlamaStackClient
|
||||
|
||||
return LlamaStackClient(
|
||||
base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}"
|
||||
)
|
||||
|
||||
|
||||
def create_library_client(template="ollama"):
|
||||
from llama_stack import LlamaStackAsLibraryClient
|
||||
|
||||
client = LlamaStackAsLibraryClient(template)
|
||||
if not client.initialize():
|
||||
print("llama stack not built properly")
|
||||
sys.exit(1)
|
||||
return client
|
||||
|
||||
|
||||
client = (
|
||||
create_library_client()
|
||||
) # or create_http_client() depending on the environment you picked
|
||||
|
||||
# List available models
|
||||
models = client.models.list()
|
||||
print("--- Available models: ---")
|
||||
for m in models:
|
||||
print(f"- {m.identifier}")
|
||||
print()
|
||||
|
||||
response = client.inference.chat_completion(
|
||||
model_id=os.environ["INFERENCE_MODEL"],
|
||||
messages=[
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": "Write a haiku about coding"},
|
||||
],
|
||||
)
|
||||
print(response.completion_message.content)
|
||||
```
|
||||
|
||||
To run the above example, put the code in a file called `inference.py`, ensure your `conda` or `virtualenv` environment is active, and run the following:
|
||||
```bash
|
||||
pip install llama_stack
|
||||
llama stack build --template ollama --image-type <conda|venv>
|
||||
python inference.py
|
||||
```
|
||||
|
||||
### 4. Your first RAG agent
|
||||
|
||||
Here is an example of a simple RAG (Retrieval Augmented Generation) chatbot agent which can answer questions about TorchTune documentation.
|
||||
|
||||
```python
|
||||
from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient
|
||||
import os
|
||||
import uuid
|
||||
from termcolor import cprint
|
||||
|
||||
vector_db_id = "my_demo_vector_db"
|
||||
client = LlamaStackClient(base_url="http://localhost:8321")
|
||||
from llama_stack_client import Agent, AgentEventLogger, RAGDocument
|
||||
|
||||
models = client.models.list()
|
||||
|
||||
# Select the first LLM and first embedding models
|
||||
model_id = next(m for m in models if m.model_type == "llm").identifier
|
||||
embedding_model_id = (
|
||||
em := next(m for m in models if m.model_type == "embedding")
|
||||
).identifier
|
||||
embedding_dimension = em.metadata["embedding_dimension"]
|
||||
def create_http_client():
|
||||
from llama_stack_client import LlamaStackClient
|
||||
|
||||
_ = client.vector_dbs.register(
|
||||
return LlamaStackClient(
|
||||
base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}"
|
||||
)
|
||||
|
||||
|
||||
def create_library_client(template="ollama"):
|
||||
from llama_stack import LlamaStackAsLibraryClient
|
||||
|
||||
client = LlamaStackAsLibraryClient(template)
|
||||
client.initialize()
|
||||
return client
|
||||
|
||||
|
||||
client = (
|
||||
create_library_client()
|
||||
) # or create_http_client() depending on the environment you picked
|
||||
|
||||
# Documents to be used for RAG
|
||||
urls = ["chat.rst", "llama3.rst", "memory_optimizations.rst", "lora_finetune.rst"]
|
||||
documents = [
|
||||
RAGDocument(
|
||||
document_id=f"num-{i}",
|
||||
content=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}",
|
||||
mime_type="text/plain",
|
||||
metadata={},
|
||||
)
|
||||
for i, url in enumerate(urls)
|
||||
]
|
||||
|
||||
vector_providers = [
|
||||
provider for provider in client.providers.list() if provider.api == "vector_io"
|
||||
]
|
||||
provider_id = vector_providers[0].provider_id # Use the first available vector provider
|
||||
|
||||
# Register a vector database
|
||||
vector_db_id = f"test-vector-db-{uuid.uuid4().hex}"
|
||||
client.vector_dbs.register(
|
||||
vector_db_id=vector_db_id,
|
||||
embedding_model=embedding_model_id,
|
||||
embedding_dimension=embedding_dimension,
|
||||
provider_id="faiss",
|
||||
)
|
||||
source = "https://www.paulgraham.com/greatwork.html"
|
||||
print("rag_tool> Ingesting document:", source)
|
||||
document = RAGDocument(
|
||||
document_id="document_1",
|
||||
content=source,
|
||||
mime_type="text/html",
|
||||
metadata={},
|
||||
provider_id=provider_id,
|
||||
embedding_model="all-MiniLM-L6-v2",
|
||||
embedding_dimension=384,
|
||||
)
|
||||
|
||||
# Insert the documents into the vector database
|
||||
client.tool_runtime.rag_tool.insert(
|
||||
documents=[document],
|
||||
documents=documents,
|
||||
vector_db_id=vector_db_id,
|
||||
chunk_size_in_tokens=50,
|
||||
chunk_size_in_tokens=512,
|
||||
)
|
||||
agent = Agent(
|
||||
|
||||
rag_agent = Agent(
|
||||
client,
|
||||
model=model_id,
|
||||
model=os.environ["INFERENCE_MODEL"],
|
||||
# Define instructions for the agent ( aka system prompt)
|
||||
instructions="You are a helpful assistant",
|
||||
enable_session_persistence=False,
|
||||
# Define tools available to the agent
|
||||
tools=[
|
||||
{
|
||||
"name": "builtin::rag/knowledge_search",
|
||||
"args": {"vector_db_ids": [vector_db_id]},
|
||||
"args": {
|
||||
"vector_db_ids": [vector_db_id],
|
||||
},
|
||||
}
|
||||
],
|
||||
)
|
||||
session_id = rag_agent.create_session("test-session")
|
||||
|
||||
prompt = "How do you do great work?"
|
||||
print("prompt>", prompt)
|
||||
user_prompts = [
|
||||
"How to optimize memory usage in torchtune? use the knowledge_search tool to get information.",
|
||||
]
|
||||
|
||||
response = agent.create_turn(
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
session_id=agent.create_session("rag_session"),
|
||||
stream=True,
|
||||
)
|
||||
|
||||
for log in AgentEventLogger().log(response):
|
||||
log.print()
|
||||
# Run the agent loop by calling the `create_turn` method
|
||||
for prompt in user_prompts:
|
||||
cprint(f"User> {prompt}", "green")
|
||||
response = rag_agent.create_turn(
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
session_id=session_id,
|
||||
)
|
||||
for log in AgentEventLogger().log(response):
|
||||
log.print()
|
||||
```
|
||||
We will use `uv` to run the script
|
||||
|
||||
To run the above example, put the code in a file called `rag.py`, ensure your `conda` or `virtualenv` environment is active, and run the following:
|
||||
```bash
|
||||
pip install llama_stack
|
||||
llama stack build --template ollama --image-type <conda|venv>
|
||||
python rag.py
|
||||
```
|
||||
uv run --with llama-stack-client demo_script.py
|
||||
```
|
||||
And you should see output like below.
|
||||
```
|
||||
rag_tool> Ingesting document: https://www.paulgraham.com/greatwork.html
|
||||
|
||||
prompt> How do you do great work?
|
||||
|
||||
inference> [knowledge_search(query="What is the key to doing great work")]
|
||||
|
||||
tool_execution> Tool:knowledge_search Args:{'query': 'What is the key to doing great work'}
|
||||
|
||||
tool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 2:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 3:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 4:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 5:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text='END of knowledge_search tool results.\n', type='text')]
|
||||
|
||||
inference> Based on the search results, it seems that doing great work means doing something important so well that you expand people's ideas of what's possible. However, there is no clear threshold for importance, and it can be difficult to judge at the time.
|
||||
|
||||
To further clarify, I would suggest that doing great work involves:
|
||||
|
||||
* Completing tasks with high quality and attention to detail
|
||||
* Expanding on existing knowledge or ideas
|
||||
* Making a positive impact on others through your work
|
||||
* Striving for excellence and continuous improvement
|
||||
|
||||
Ultimately, great work is about making a meaningful contribution and leaving a lasting impression.
|
||||
```
|
||||
Congratulations! You've successfully built your first RAG application using Llama Stack! 🎉🥳
|
||||
|
||||
## Next Steps
|
||||
|
||||
Now you're ready to dive deeper into Llama Stack!
|
||||
- Explore the [Detailed Tutorial](./detailed_tutorial.md).
|
||||
- Try the [Getting Started Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb).
|
||||
- Browse more [Notebooks on GitHub](https://github.com/meta-llama/llama-stack/tree/main/docs/notebooks).
|
||||
- Learn about Llama Stack [Concepts](../concepts/index.md).
|
||||
- Discover how to [Build Llama Stacks](../distributions/index.md).
|
||||
- Refer to our [References](../references/index.md) for details on the Llama CLI and Python SDK.
|
||||
- Check out the [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repository for example applications and tutorials.
|
||||
- Learn more about Llama Stack [Concepts](../concepts/index.md)
|
||||
- Learn how to [Build Llama Stacks](../distributions/index.md)
|
||||
- See [References](../references/index.md) for more details about the llama CLI and Python SDK
|
||||
- For example applications and more detailed tutorials, visit our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repository.
|
||||
|
|
|
@ -1,18 +1,11 @@
|
|||
# Llama Stack
|
||||
Welcome to Llama Stack, the open-source framework for building generative AI applications.
|
||||
```{admonition} Llama 4 is here!
|
||||
:class: tip
|
||||
|
||||
Check out [Getting Started with Llama 4](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/getting_started_llama4.ipynb)
|
||||
```
|
||||
```{admonition} News
|
||||
:class: tip
|
||||
|
||||
Llama Stack {{ llama_stack_version }} is now available! See the {{ llama_stack_version_link }} for more details.
|
||||
```
|
||||
|
||||
# Llama Stack
|
||||
|
||||
## What is Llama Stack?
|
||||
|
||||
Llama Stack defines and standardizes the core building blocks needed to bring generative AI applications to market. It provides a unified set of APIs with implementations from leading service providers, enabling seamless transitions between development and production environments. More specifically, it provides
|
||||
|
||||
|
@ -29,18 +22,14 @@ Llama Stack defines and standardizes the core building blocks needed to bring ge
|
|||
|
||||
Our goal is to provide pre-packaged implementations (aka "distributions") which can be run in a variety of deployment environments. LlamaStack can assist you in your entire app development lifecycle - start iterating on local, mobile or desktop and seamlessly transition to on-prem or public cloud deployments. At every point in this transition, the same set of APIs and the same developer experience is available.
|
||||
|
||||
## How does Llama Stack work?
|
||||
Llama Stack consists of a [server](./distributions/index.md) (with multiple pluggable API [providers](./providers/index.md)) and Client SDKs (see below) meant to
|
||||
be used in your applications. The server can be run in a variety of environments, including local (inline)
|
||||
development, on-premises, and cloud. The client SDKs are available for Python, Swift, Node, and
|
||||
Kotlin.
|
||||
|
||||
## Quick Links
|
||||
|
||||
- New to Llama Stack? Start with the [Introduction](introduction/index) to understand our motivation and vision.
|
||||
- Ready to build? Check out the [Quick Start](getting_started/index) to get started.
|
||||
- Need specific providers? Browse [Distributions](distributions/selection) to see all the options available.
|
||||
- Want to contribute? See the [Contributing](contributing/index) guide.
|
||||
|
||||
## Client SDKs
|
||||
## Available SDKs
|
||||
|
||||
We have a number of client-side SDKs available for different languages.
|
||||
|
||||
|
@ -99,9 +88,8 @@ A number of "adapters" are available for some popular Inference and Vector Store
|
|||
:maxdepth: 3
|
||||
|
||||
self
|
||||
getting_started/index
|
||||
getting_started/detailed_tutorial
|
||||
introduction/index
|
||||
getting_started/index
|
||||
concepts/index
|
||||
providers/index
|
||||
distributions/index
|
||||
|
|
|
@ -103,5 +103,7 @@ llama stack run together
|
|||
|
||||
2. Start Streamlit UI
|
||||
```bash
|
||||
uv run --with ".[ui]" streamlit run llama_stack/distribution/ui/app.py
|
||||
cd llama_stack/distribution/ui
|
||||
pip install -r requirements.txt
|
||||
streamlit run app.py
|
||||
```
|
||||
|
|
|
@ -1,237 +0,0 @@
|
|||
# External Providers
|
||||
|
||||
Llama Stack supports external providers that live outside of the main codebase. This allows you to:
|
||||
- Create and maintain your own providers independently
|
||||
- Share providers with others without contributing to the main codebase
|
||||
- Keep provider-specific code separate from the core Llama Stack code
|
||||
|
||||
## Configuration
|
||||
|
||||
To enable external providers, you need to configure the `external_providers_dir` in your Llama Stack configuration. This directory should contain your external provider specifications:
|
||||
|
||||
```yaml
|
||||
external_providers_dir: ~/.llama/providers.d/
|
||||
```
|
||||
|
||||
## Directory Structure
|
||||
|
||||
The external providers directory should follow this structure:
|
||||
|
||||
```
|
||||
providers.d/
|
||||
remote/
|
||||
inference/
|
||||
custom_ollama.yaml
|
||||
vllm.yaml
|
||||
vector_io/
|
||||
qdrant.yaml
|
||||
safety/
|
||||
llama-guard.yaml
|
||||
inline/
|
||||
inference/
|
||||
custom_ollama.yaml
|
||||
vllm.yaml
|
||||
vector_io/
|
||||
qdrant.yaml
|
||||
safety/
|
||||
llama-guard.yaml
|
||||
```
|
||||
|
||||
Each YAML file in these directories defines a provider specification for that particular API.
|
||||
|
||||
## Provider Types
|
||||
|
||||
Llama Stack supports two types of external providers:
|
||||
|
||||
1. **Remote Providers**: Providers that communicate with external services (e.g., cloud APIs)
|
||||
2. **Inline Providers**: Providers that run locally within the Llama Stack process
|
||||
|
||||
## Known External Providers
|
||||
|
||||
Here's a list of known external providers that you can use with Llama Stack:
|
||||
|
||||
| Name | Description | API | Type | Repository |
|
||||
|------|-------------|-----|------|------------|
|
||||
| KubeFlow Training | Train models with KubeFlow | Post Training | Remote | [llama-stack-provider-kft](https://github.com/opendatahub-io/llama-stack-provider-kft) |
|
||||
| KubeFlow Pipelines | Train models with KubeFlow Pipelines | Post Training | Inline **and** Remote | [llama-stack-provider-kfp-trainer](https://github.com/opendatahub-io/llama-stack-provider-kfp-trainer) |
|
||||
| RamaLama | Inference models with RamaLama | Inference | Remote | [ramalama-stack](https://github.com/containers/ramalama-stack) |
|
||||
| TrustyAI LM-Eval | Evaluate models with TrustyAI LM-Eval | Eval | Remote | [llama-stack-provider-lmeval](https://github.com/trustyai-explainability/llama-stack-provider-lmeval) |
|
||||
|
||||
### Remote Provider Specification
|
||||
|
||||
Remote providers are used when you need to communicate with external services. Here's an example for a custom Ollama provider:
|
||||
|
||||
```yaml
|
||||
adapter:
|
||||
adapter_type: custom_ollama
|
||||
pip_packages:
|
||||
- ollama
|
||||
- aiohttp
|
||||
config_class: llama_stack_ollama_provider.config.OllamaImplConfig
|
||||
module: llama_stack_ollama_provider
|
||||
api_dependencies: []
|
||||
optional_api_dependencies: []
|
||||
```
|
||||
|
||||
#### Adapter Configuration
|
||||
|
||||
The `adapter` section defines how to load and configure the provider:
|
||||
|
||||
- `adapter_type`: A unique identifier for this adapter
|
||||
- `pip_packages`: List of Python packages required by the provider
|
||||
- `config_class`: The full path to the configuration class
|
||||
- `module`: The Python module containing the provider implementation
|
||||
|
||||
### Inline Provider Specification
|
||||
|
||||
Inline providers run locally within the Llama Stack process. Here's an example for a custom vector store provider:
|
||||
|
||||
```yaml
|
||||
module: llama_stack_vector_provider
|
||||
config_class: llama_stack_vector_provider.config.VectorStoreConfig
|
||||
pip_packages:
|
||||
- faiss-cpu
|
||||
- numpy
|
||||
api_dependencies:
|
||||
- inference
|
||||
optional_api_dependencies:
|
||||
- vector_io
|
||||
provider_data_validator: llama_stack_vector_provider.validator.VectorStoreValidator
|
||||
container_image: custom-vector-store:latest # optional
|
||||
```
|
||||
|
||||
#### Inline Provider Fields
|
||||
|
||||
- `module`: The Python module containing the provider implementation
|
||||
- `config_class`: The full path to the configuration class
|
||||
- `pip_packages`: List of Python packages required by the provider
|
||||
- `api_dependencies`: List of Llama Stack APIs that this provider depends on
|
||||
- `optional_api_dependencies`: List of optional Llama Stack APIs that this provider can use
|
||||
- `provider_data_validator`: Optional validator for provider data
|
||||
- `container_image`: Optional container image to use instead of pip packages
|
||||
|
||||
## Required Implementation
|
||||
|
||||
### Remote Providers
|
||||
|
||||
Remote providers must expose a `get_adapter_impl()` function in their module that takes two arguments:
|
||||
1. `config`: An instance of the provider's config class
|
||||
2. `deps`: A dictionary of API dependencies
|
||||
|
||||
This function must return an instance of the provider's adapter class that implements the required protocol for the API.
|
||||
|
||||
Example:
|
||||
```python
from typing import Any, Dict


async def get_adapter_impl(
    config: OllamaImplConfig, deps: Dict[Api, Any]
) -> OllamaInferenceAdapter:
    # Llama Stack calls this with the parsed provider config and the resolved API
    # dependencies; return the adapter instance that implements the API protocol.
    return OllamaInferenceAdapter(config)
```
|
||||
|
||||
### Inline Providers
|
||||
|
||||
Inline providers must expose a `get_provider_impl()` function in their module that takes two arguments:
|
||||
1. `config`: An instance of the provider's config class
|
||||
2. `deps`: A dictionary of API dependencies
|
||||
|
||||
Example:
|
||||
```python
|
||||
async def get_provider_impl(
|
||||
config: VectorStoreConfig, deps: Dict[Api, Any]
|
||||
) -> VectorStoreImpl:
|
||||
impl = VectorStoreImpl(config, deps[Api.inference])
|
||||
await impl.initialize()
|
||||
return impl
|
||||
```
|
||||
|
||||
## Dependencies
|
||||
|
||||
The provider package must be installed on the system. For example:
|
||||
|
||||
```bash
|
||||
$ uv pip show llama-stack-ollama-provider
|
||||
Name: llama-stack-ollama-provider
|
||||
Version: 0.1.0
|
||||
Location: /path/to/venv/lib/python3.10/site-packages
|
||||
```
|
||||
|
||||
## Example: Custom Ollama Provider

Here's a complete example of creating and using a custom Ollama provider:

1. First, create the provider package:

```bash
mkdir -p llama-stack-provider-ollama
cd llama-stack-provider-ollama
git init
uv init
```

2. Edit `pyproject.toml`:

```toml
[project]
name = "llama-stack-provider-ollama"
version = "0.1.0"
description = "Ollama provider for Llama Stack"
requires-python = ">=3.10"
dependencies = ["llama-stack", "pydantic", "ollama", "aiohttp"]
```

3. Create the provider specification:

```yaml
# ~/.llama/providers.d/remote/inference/custom_ollama.yaml
adapter:
  adapter_type: custom_ollama
  pip_packages: ["ollama", "aiohttp"]
  config_class: llama_stack_provider_ollama.config.OllamaImplConfig
  module: llama_stack_provider_ollama
api_dependencies: []
optional_api_dependencies: []
```

4. Install the provider:

```bash
uv pip install -e .
```

5. Configure Llama Stack to use external providers:

```yaml
external_providers_dir: ~/.llama/providers.d/
```

The provider will now be available in Llama Stack with the type `remote::custom_ollama`.

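For illustration, the provider can then be referenced from a run configuration like any built-in provider. The `config` contents depend on your config class; the `url` field here is an assumption:

```yaml
providers:
  inference:
  - provider_id: custom_ollama
    provider_type: remote::custom_ollama
    config:
      url: http://localhost:11434  # illustrative; match your config class
```
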
## Best Practices

1. **Package Naming**: Use the prefix `llama-stack-provider-` for your provider packages to make them easily identifiable.

2. **Version Management**: Keep your provider package versioned and compatible with the Llama Stack version you're using.

3. **Dependencies**: Only include the minimum required dependencies in your provider package.

4. **Documentation**: Include clear documentation in your provider package about:
   - Installation requirements
   - Configuration options
   - Usage examples
   - Any limitations or known issues

5. **Testing**: Include tests in your provider package to ensure it works correctly with Llama Stack.
   Refer to the [integration tests guide](https://github.com/meta-llama/llama-stack/blob/main/tests/integration/README.md) for more information, and run the tests for the provider type you are developing (see the example after this list).

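A hypothetical invocation, assuming you are developing an inference provider; adjust the test path and any stack-specific options to your environment:

```bash
# Run the integration tests that exercise the Inference API against your stack
uv run pytest -sv tests/integration/inference
```
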
## Troubleshooting

If your external provider isn't being loaded:

1. Check that the `external_providers_dir` path is correct and accessible.
2. Verify that the YAML files are properly formatted.
3. Ensure all required Python packages are installed.
4. Check the Llama Stack server logs for any error messages; turn on debug logging with `LLAMA_STACK_LOGGING=all=debug` for more detail (see the example after this list).
5. Verify that the provider package is installed in your Python environment.

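For example, assuming a venv-based distribution built from the `ollama` template:

```bash
# Start the server with debug logging enabled for all categories
LLAMA_STACK_LOGGING=all=debug llama stack run --image-type venv ~/.llama/distributions/ollama/ollama-run.yaml
```
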
@ -1,8 +1,8 @@
# Providers Overview

The goal of Llama Stack is to build an ecosystem where users can easily swap out different implementations for the same API. Examples for these include:
- LLM inference providers (e.g., Ollama, Fireworks, Together, AWS Bedrock, Groq, Cerebras, SambaNova, vLLM, etc.),
- Vector databases (e.g., ChromaDB, Weaviate, Qdrant, Milvus, FAISS, PGVector, SQLite-Vec, etc.),
- LLM inference providers (e.g., Fireworks, Together, AWS Bedrock, Groq, Cerebras, SambaNova, vLLM, etc.),
- Vector databases (e.g., ChromaDB, Weaviate, Qdrant, Milvus, FAISS, PGVector, etc.),
- Safety providers (e.g., Meta's Llama Guard, AWS Bedrock Guardrails, etc.)

Providers come in two flavors:
@ -11,10 +11,6 @@ Providers come in two flavors:

Importantly, Llama Stack always strives to provide at least one fully inline provider for each API so you can iterate on a fully featured environment locally.

## External Providers

Llama Stack supports external providers that live outside of the main codebase. This allows you to create and maintain your own providers independently. See the [External Providers Guide](external) for details.

## Agents
Run multi-step agentic workflows with LLMs with tool usage, memory (RAG), etc.

@ -30,18 +26,6 @@ Runs inference with an LLM.
## Post Training
Fine-tunes a model.

#### Post Training Providers
The following providers are available for Post Training:

```{toctree}
:maxdepth: 1

external
post_training/huggingface
post_training/torchtune
post_training/nvidia_nemo
```

## Safety
Applies safety policies to the output at a Systems (not only model) level.

@ -66,7 +50,6 @@ The following providers (i.e., databases) are available for Vector IO:
```{toctree}
:maxdepth: 1

external
vector_io/faiss
vector_io/sqlite-vec
vector_io/chromadb

@ -1,122 +0,0 @@
---
orphan: true
---
# HuggingFace SFTTrainer

[HuggingFace SFTTrainer](https://huggingface.co/docs/trl/en/sft_trainer) is an inline post training provider for Llama Stack. It allows you to run supervised fine-tuning on a variety of models using many datasets.

## Features

- Simple access through the post_training API
- Fully integrated with Llama Stack
- GPU support, CPU support, and MPS support (macOS Metal Performance Shaders)

## Usage

To use the HF SFTTrainer in your Llama Stack project, follow these steps:

1. Configure your Llama Stack project to use this provider.
2. Kick off an SFT job using the Llama Stack post_training API.

## Setup

You can access the HuggingFace trainer via the `ollama` distribution:

```bash
llama stack build --template ollama --image-type venv
llama stack run --image-type venv ~/.llama/distributions/ollama/ollama-run.yaml
```

## Run Training

You can access the provider and the `supervised_fine_tune` method via the post_training API:

```python
import time
import uuid

from llama_stack_client.types import (
    post_training_supervised_fine_tune_params,
    algorithm_config_param,
)


def create_http_client():
    from llama_stack_client import LlamaStackClient

    return LlamaStackClient(base_url="http://localhost:8321")


client = create_http_client()

# Example Dataset
client.datasets.register(
    purpose="post-training/messages",
    source={
        "type": "uri",
        "uri": "huggingface://datasets/llamastack/simpleqa?split=train",
    },
    dataset_id="simpleqa",
)

training_config = post_training_supervised_fine_tune_params.TrainingConfig(
    data_config=post_training_supervised_fine_tune_params.TrainingConfigDataConfig(
        batch_size=32,
        data_format="instruct",
        dataset_id="simpleqa",
        shuffle=True,
    ),
    gradient_accumulation_steps=1,
    max_steps_per_epoch=0,
    max_validation_steps=1,
    n_epochs=4,
)

algorithm_config = algorithm_config_param.LoraFinetuningConfig(  # this config is also currently mandatory but should not be
    alpha=1,
    apply_lora_to_mlp=True,
    apply_lora_to_output=False,
    lora_attn_modules=["q_proj"],
    rank=1,
    type="LoRA",
)

job_uuid = f"test-job{uuid.uuid4()}"

# Example Model
training_model = "ibm-granite/granite-3.3-8b-instruct"

start_time = time.time()
response = client.post_training.supervised_fine_tune(
    job_uuid=job_uuid,
    logger_config={},
    model=training_model,
    hyperparam_search_config={},
    training_config=training_config,
    algorithm_config=algorithm_config,
    checkpoint_dir="output",
)
print("Job: ", job_uuid)


# Wait for the job to complete!
while True:
    status = client.post_training.job.status(job_uuid=job_uuid)
    if not status:
        print("Job not found")
        break

    print(status)
    if status.status == "completed":
        break

    print("Waiting for job to complete...")
    time.sleep(5)

end_time = time.time()
print("Job completed in", end_time - start_time, "seconds!")

print("Artifacts:")
print(client.post_training.job.artifacts(job_uuid=job_uuid))
```

@ -1,163 +0,0 @@
---
orphan: true
---
# NVIDIA NEMO

[NVIDIA NEMO](https://developer.nvidia.com/nemo-framework) is a remote post training provider for Llama Stack. It provides enterprise-grade fine-tuning capabilities through NVIDIA's NeMo Customizer service.

## Features

- Enterprise-grade fine-tuning capabilities
- Support for LoRA and SFT fine-tuning
- Integration with NVIDIA's NeMo Customizer service
- Support for various NVIDIA-optimized models
- Efficient training with NVIDIA hardware acceleration

## Usage

To use NVIDIA NEMO in your Llama Stack project, follow these steps:

1. Configure your Llama Stack project to use this provider.
2. Set up your NVIDIA API credentials.
3. Kick off a fine-tuning job using the Llama Stack post_training API.

## Setup

You'll need to set the following environment variables:

```bash
export NVIDIA_API_KEY="your-api-key"
export NVIDIA_DATASET_NAMESPACE="default"
export NVIDIA_CUSTOMIZER_URL="your-customizer-url"
export NVIDIA_PROJECT_ID="your-project-id"
export NVIDIA_OUTPUT_MODEL_DIR="your-output-model-dir"
```

## Run Training

You can access the provider and the `supervised_fine_tune` method via the post_training API:

```python
import time
import uuid

from llama_stack_client.types import (
    post_training_supervised_fine_tune_params,
    algorithm_config_param,
)


def create_http_client():
    from llama_stack_client import LlamaStackClient

    return LlamaStackClient(base_url="http://localhost:8321")


client = create_http_client()

# Example Dataset
client.datasets.register(
    purpose="post-training/messages",
    source={
        "type": "uri",
        "uri": "huggingface://datasets/llamastack/simpleqa?split=train",
    },
    dataset_id="simpleqa",
)

training_config = post_training_supervised_fine_tune_params.TrainingConfig(
    data_config=post_training_supervised_fine_tune_params.TrainingConfigDataConfig(
        batch_size=8,  # Default batch size for NEMO
        data_format="instruct",
        dataset_id="simpleqa",
        shuffle=True,
    ),
    n_epochs=50,  # Default epochs for NEMO
    optimizer_config=post_training_supervised_fine_tune_params.TrainingConfigOptimizerConfig(
        lr=0.0001,  # Default learning rate
        weight_decay=0.01,  # NEMO-specific parameter
    ),
    # NEMO-specific parameters
    log_every_n_steps=None,
    val_check_interval=0.25,
    sequence_packing_enabled=False,
    hidden_dropout=None,
    attention_dropout=None,
    ffn_dropout=None,
)

algorithm_config = algorithm_config_param.LoraFinetuningConfig(
    alpha=16,  # Default alpha for NEMO
    type="LoRA",
)

job_uuid = f"test-job{uuid.uuid4()}"

# Example Model - must be a supported NEMO model
training_model = "meta/llama-3.1-8b-instruct"

start_time = time.time()
response = client.post_training.supervised_fine_tune(
    job_uuid=job_uuid,
    logger_config={},
    model=training_model,
    hyperparam_search_config={},
    training_config=training_config,
    algorithm_config=algorithm_config,
    checkpoint_dir="output",
)
print("Job: ", job_uuid)

# Wait for the job to complete!
while True:
    status = client.post_training.job.status(job_uuid=job_uuid)
    if not status:
        print("Job not found")
        break

    print(status)
    if status.status == "completed":
        break

    print("Waiting for job to complete...")
    time.sleep(5)

end_time = time.time()
print("Job completed in", end_time - start_time, "seconds!")

print("Artifacts:")
print(client.post_training.job.artifacts(job_uuid=job_uuid))
```

## Supported Models

Currently supports the following models:
- meta/llama-3.1-8b-instruct
- meta/llama-3.2-1b-instruct

## Supported Parameters

### TrainingConfig
- n_epochs (default: 50)
- data_config
- optimizer_config
- log_every_n_steps
- val_check_interval (default: 0.25)
- sequence_packing_enabled (default: False)
- hidden_dropout (0.0-1.0)
- attention_dropout (0.0-1.0)
- ffn_dropout (0.0-1.0)

### DataConfig
- dataset_id
- batch_size (default: 8)

### OptimizerConfig
- lr (default: 0.0001)
- weight_decay (default: 0.01)

### LoRA Config
- alpha (default: 16)
- type (must be "LoRA")

Note: Some parameters from the standard Llama Stack API are not supported and will be ignored with a warning.

@ -1,125 +0,0 @@
---
orphan: true
---
# TorchTune

[TorchTune](https://github.com/pytorch/torchtune) is an inline post training provider for Llama Stack. It provides a simple and efficient way to fine-tune language models using PyTorch.

## Features

- Simple access through the post_training API
- Fully integrated with Llama Stack
- GPU support and single-device capabilities
- Support for LoRA

## Usage

To use TorchTune in your Llama Stack project, follow these steps:

1. Configure your Llama Stack project to use this provider.
2. Kick off a fine-tuning job using the Llama Stack post_training API.

## Setup

You can access the TorchTune trainer by writing your own YAML pointing to the provider:

```yaml
post_training:
  - provider_id: torchtune
    provider_type: inline::torchtune
    config: {}
```

You can then build and run your own stack with this provider.

## Run Training

You can access the provider and the `supervised_fine_tune` method via the post_training API:

```python
import time
import uuid

from llama_stack_client.types import (
    post_training_supervised_fine_tune_params,
    algorithm_config_param,
)


def create_http_client():
    from llama_stack_client import LlamaStackClient

    return LlamaStackClient(base_url="http://localhost:8321")


client = create_http_client()

# Example Dataset
client.datasets.register(
    purpose="post-training/messages",
    source={
        "type": "uri",
        "uri": "huggingface://datasets/llamastack/simpleqa?split=train",
    },
    dataset_id="simpleqa",
)

training_config = post_training_supervised_fine_tune_params.TrainingConfig(
    data_config=post_training_supervised_fine_tune_params.TrainingConfigDataConfig(
        batch_size=32,
        data_format="instruct",
        dataset_id="simpleqa",
        shuffle=True,
    ),
    gradient_accumulation_steps=1,
    max_steps_per_epoch=0,
    max_validation_steps=1,
    n_epochs=4,
)

algorithm_config = algorithm_config_param.LoraFinetuningConfig(
    alpha=1,
    apply_lora_to_mlp=True,
    apply_lora_to_output=False,
    lora_attn_modules=["q_proj"],
    rank=1,
    type="LoRA",
)

job_uuid = f"test-job{uuid.uuid4()}"

# Example Model
training_model = "meta-llama/Llama-2-7b-hf"

start_time = time.time()
response = client.post_training.supervised_fine_tune(
    job_uuid=job_uuid,
    logger_config={},
    model=training_model,
    hyperparam_search_config={},
    training_config=training_config,
    algorithm_config=algorithm_config,
    checkpoint_dir="output",
)
print("Job: ", job_uuid)

# Wait for the job to complete!
while True:
    status = client.post_training.job.status(job_uuid=job_uuid)
    if not status:
        print("Job not found")
        break

    print(status)
    if status.status == "completed":
        break

    print("Waiting for job to complete...")
    time.sleep(5)

end_time = time.time()
print("Job completed in", end_time - start_time, "seconds!")

print("Artifacts:")
print(client.post_training.job.artifacts(job_uuid=job_uuid))
```

@ -1,107 +0,0 @@
---
orphan: true
---
# Milvus

[Milvus](https://milvus.io/) is an inline and remote vector database provider for Llama Stack. It
allows you to store and query vectors directly within a Milvus database.
That means you're not limited to storing vectors in memory or in a separate service.

## Features

- Easy to use
- Fully integrated with Llama Stack

## Usage

To use Milvus in your Llama Stack project, follow these steps:

1. Install the necessary dependencies.
2. Configure your Llama Stack project to use Milvus.
3. Start storing and querying vectors.

## Installation

You can install Milvus using pymilvus:

```bash
pip install pymilvus
```

## Configuration

In Llama Stack, Milvus can be configured in two ways:
- **Inline (Local) Configuration** - Uses Milvus-Lite for local storage
- **Remote Configuration** - Connects to a remote Milvus server

### Inline (Local) Configuration

The simplest method is local configuration, which requires setting `db_path`, a path for locally storing Milvus-Lite files:

```yaml
vector_io:
  - provider_id: milvus
    provider_type: inline::milvus
    config:
      db_path: ~/.llama/distributions/together/milvus_store.db
```

### Remote Configuration

Remote configuration is suitable for larger data storage requirements:

#### Standard Remote Connection

```yaml
vector_io:
  - provider_id: milvus
    provider_type: remote::milvus
    config:
      uri: "http://<host>:<port>"
      token: "<user>:<password>"
```

#### TLS-Enabled Remote Connection (One-way TLS)

For connections to Milvus instances with one-way TLS enabled:

```yaml
vector_io:
  - provider_id: milvus
    provider_type: remote::milvus
    config:
      uri: "https://<host>:<port>"
      token: "<user>:<password>"
      secure: True
      server_pem_path: "/path/to/server.pem"
```

#### Mutual TLS (mTLS) Remote Connection

For connections to Milvus instances with mutual TLS (mTLS) enabled:

```yaml
vector_io:
  - provider_id: milvus
    provider_type: remote::milvus
    config:
      uri: "https://<host>:<port>"
      token: "<user>:<password>"
      secure: True
      ca_pem_path: "/path/to/ca.pem"
      client_pem_path: "/path/to/client.pem"
      client_key_path: "/path/to/client.key"
```

#### Key Parameters for TLS Configuration

- **`secure`**: Enables TLS encryption when set to `true`. Defaults to `false`.
- **`server_pem_path`**: Path to the **server certificate** for verifying the server’s identity (used in one-way TLS).
- **`ca_pem_path`**: Path to the **Certificate Authority (CA) certificate** for validating the server certificate (required in mTLS).
- **`client_pem_path`**: Path to the **client certificate** file (required for mTLS).
- **`client_key_path`**: Path to the **client private key** file (required for mTLS).
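
As a quick sanity check outside of Llama Stack, you can exercise the same TLS settings with pymilvus directly. This is a minimal sketch; the keyword arguments mirror the config fields above, and all paths and credentials are placeholders:

```python
from pymilvus import connections

# One-way TLS connection check; for mTLS, pass ca_pem_path, client_pem_path and
# client_key_path instead of server_pem_path.
connections.connect(
    alias="default",
    uri="https://<host>:<port>",
    token="<user>:<password>",
    secure=True,
    server_pem_path="/path/to/server.pem",
)
```
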
## Documentation

See the [Milvus documentation](https://milvus.io/docs/install-overview.md) for more details about Milvus in general.

For more details on TLS configuration, refer to the [TLS setup guide](https://milvus.io/docs/tls.md).

31 docs/source/providers/vector_io/mivus.md Normal file
@ -0,0 +1,31 @@
---
orphan: true
---
# Milvus

[Milvus](https://milvus.io/) is an inline and remote vector database provider for Llama Stack. It
allows you to store and query vectors directly within a Milvus database.
That means you're not limited to storing vectors in memory or in a separate service.

## Features

- Easy to use
- Fully integrated with Llama Stack

## Usage

To use Milvus in your Llama Stack project, follow these steps:

1. Install the necessary dependencies.
2. Configure your Llama Stack project to use Milvus.
3. Start storing and querying vectors.

## Installation

You can install Milvus using pymilvus:

```bash
pip install pymilvus
```

## Documentation

See the [Milvus documentation](https://milvus.io/docs/install-overview.md) for more details about Milvus in general.

@ -66,25 +66,6 @@ To use sqlite-vec in your Llama Stack project, follow these steps:
2. Configure your Llama Stack project to use SQLite-Vec.
3. Start storing and querying vectors.

## Supported Search Modes

The sqlite-vec provider supports both vector-based and keyword-based (full-text) search modes.

When using the RAGTool interface, you can specify the desired search behavior via the `mode` parameter in
`RAGQueryConfig`. For example:

```python
from llama_stack.apis.tool_runtime.rag import RAGQueryConfig

query_config = RAGQueryConfig(max_chunks=6, mode="vector")

results = client.tool_runtime.rag_tool.query(
    vector_db_ids=[vector_db_id],
    content="what is torchtune",
    query_config=query_config,
)
```
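
A keyword-based query follows the same pattern; only the `mode` value changes (assuming the keyword mode is spelled "keyword"):

```python
from llama_stack.apis.tool_runtime.rag import RAGQueryConfig

# Same query shape, but using keyword-based (full-text) search instead of vector search
keyword_config = RAGQueryConfig(max_chunks=6, mode="keyword")
```
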
## Installation

You can install SQLite-Vec using pip:

@ -253,6 +253,8 @@ llama-stack-client toolgroups list
+---------------------------+------------------+------+---------------+
| identifier                | provider_id      | args | mcp_endpoint  |
+===========================+==================+======+===============+
| builtin::code_interpreter | code-interpreter | None | None          |
+---------------------------+------------------+------+---------------+
| builtin::rag              | rag-runtime      | None | None          |
+---------------------------+------------------+------+---------------+
| builtin::websearch        | tavily-search    | None | None          |

@ -389,7 +389,5 @@
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
}

@ -256,7 +256,5 @@
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
}

@ -301,7 +301,5 @@
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
}

@ -200,7 +200,5 @@
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
}

@ -355,7 +355,5 @@
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
}

@ -398,7 +398,5 @@
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
}

@ -132,7 +132,5 @@
"pygments_lexer": "ipython3",
"version": "3.11.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
}

@ -188,7 +188,5 @@
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
}

Some files were not shown because too many files have changed in this diff.