# What does this PR do?
Adds a new OpenAI-compatible endpoint for the embeddings API:
`/openai/v1/embeddings`
Added providers for OpenAI, LiteLLM and SentenceTransformer.
## Test Plan
```
LLAMA_STACK_CONFIG=http://localhost:8321 pytest -sv tests/integration/inference/test_openai_embeddings.py --embedding-model all-MiniLM-L6-v2,text-embedding-3-small,gemini/text-embedding-004
```
# What does this PR do?
This adds a check to ensure we don't attempt to concatenate `None + str`
or `str + None` when building up our arguments for streaming tool calls
in the Responses API.
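For illustration, a minimal sketch of the kind of guard this adds (hypothetical helper, not the actual Responses implementation):
```python
def append_arguments(buffered: str | None, delta: str | None) -> str:
    # Treat a missing value on either side as an empty string so we never
    # attempt `None + str` or `str + None` while accumulating tool call args.
    return (buffered or "") + (delta or "")
```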
## Test Plan
All existing tests pass with this change.
Unit tests:
```
python -m pytest -s -v \
tests/unit/providers/agents/meta_reference/test_openai_responses.py
```
Integration tests:
```
llama stack run llama_stack/templates/together/run.yaml
LLAMA_STACK_CONFIG=http://localhost:8321 \
python -m pytest -s -v \
tests/integration/agents/test_openai_responses.py \
--text-model meta-llama/Llama-4-Scout-17B-16E-Instruct
```
Verification tests:
```
llama stack run llama_stack/templates/together/run.yaml
pytest -s -v 'tests/verifications/openai_api/test_responses.py' \
--base-url=http://localhost:8321/v1/openai/v1 \
--model meta-llama/Llama-4-Scout-17B-16E-Instruct
```
Additionally, the manual example using Codex CLI from #2325 now succeeds
instead of throwing a 500 error.
Closes #2325
Signed-off-by: Ben Browning <bbrownin@redhat.com>
# What does this PR do?
* Added support for a PostgreSQL inference store
* Added an 'oracle' template that demos how to configure PostgreSQL stores
(except for telemetry, which is not currently supported)
## Test Plan
```
llama stack build --template oracle --image-type conda --run
LLAMA_STACK_CONFIG=http://localhost:8321 pytest -s -v tests/integration/ \
  --text-model accounts/fireworks/models/llama-v3p3-70b-instruct -k 'inference_store'
```
# What does this PR do?
Updates SambaNova inference to set `strict: false` in `json_schema`
structured output.
## Test Plan
```
pytest -s -v tests/integration/inference/test_text_inference.py \
  --stack-config=sambanova \
  --text-model=sambanova/Meta-Llama-3.3-70B-Instruct
```
# What does this PR do?
Adds an import for all of the template modules before the executor to
prevent deadlock
Closes #2278
## Test Plan
```
# Run the pre-commit multiple times and verify the deadlock doesn't occur
for i in {1..10}; do pre-commit run --all-files; done
```
We must store the full (re-hydrated) input, not just the original input,
in the Response object. Of course, this is not very space efficient and
we should likely find a better storage scheme so that we can only store
unique entries in the database and then re-hydrate them efficiently
later. But that can be done safely later.
Closes https://github.com/meta-llama/llama-stack/issues/2299
## Test Plan
Unit test
# What does this PR do?
Previously prompt guard was hard-coded to require CUDA, which prevented
it from being used on an instance without CUDA support.
This PR allows prompt guard to be configured to use either cpu or cuda.
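For context, the device-selection logic is roughly of this shape (hedged sketch assuming PyTorch; the actual config field names may differ):
```python
import torch

def resolve_device(configured_device: str = "auto") -> str:
    # Use CUDA only when it is actually available; otherwise fall back to CPU
    # so prompt guard can run on GPU-less instances.
    if configured_device in ("auto", "cuda"):
        return "cuda" if torch.cuda.is_available() else "cpu"
    return configured_device
```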
Closes [#2133](https://github.com/meta-llama/llama-stack/issues/2133)
## Test Plan (Edited after incorporating suggestion)
1) Started a stack configured with prompt guard on a system without a GPU
and validated that prompt guard could be used through the APIs.
2) Validated on a system with a GPU (but without llama stack) that the
Python code selecting between CPU and CUDA support returned the right value
when a CUDA device was available.
3) Ran the unit tests as per
https://github.com/meta-llama/llama-stack/blob/main/tests/unit/README.md
---------
Signed-off-by: Michael Dawson <mdawson@devrus.com>
# What does this PR do?
Use a more common pattern and known terminology from the ecosystem,
where Route is more established than Endpoint.
Signed-off-by: Sébastien Han <seb@redhat.com>
# What does this PR do?
Followup of https://github.com/meta-llama/llama-stack/pull/2287. We must
use `--group` when running commands with uv.
Signed-off-by: Sébastien Han <seb@redhat.com>
# What does this PR do?
The providers list is missing post_training. Add that column and
`HuggingFace`, `TorchTune`, and `NVIDIA NEMO` as supported providers.
Also point to these providers in docs/source/providers/index.md and
describe their basic functionality.
There are other missing provider types here as well, but this is a start.
Signed-off-by: Charlie Doern <cdoern@redhat.com>
Co-authored-by: Francisco Arceo <arceofrancisco@gmail.com>
# What does this PR do?
Fix a bug in openai_compat where choices are not indexed correctly.
## Test Plan
Added a new test.
Rerun the failed inference_store tests:
```
llama stack run fireworks --image-type conda
pytest -s -v tests/integration/ --stack-config http://localhost:8321 -k \
  'test_inference_store' --text-model meta-llama/Llama-3.3-70B-Instruct --count 10
```
# What does this PR do?
Changed the test so it no longer requires a tool_call in the output, while
keeping the tools params there as a smoke test.
## Test Plan
Used llama3.3 from fireworks (same as CI)
<img width="1433" alt="image"
src="https://github.com/user-attachments/assets/1e5fca98-9b4f-402e-a0bc-d9f910f2c207"
/>
Also ran with the ollama distro and a 3b model.
# What does this PR do?
The previous `[project.optional-dependencies]` was misrepresenting what
the packages were. They were NOT optional dependencies to the project
but development dependencies. Unlike optional dependencies, development
dependencies are local-only and will not be included in the project
requirements when published to PyPI or other indexes. As such,
development dependencies are not included in the [project] table.
Additionally, the dev group is synced by default.
Source:
https://docs.astral.sh/uv/concepts/projects/dependencies/#development-dependencies
Signed-off-by: Sébastien Han <seb@redhat.com>
This adds initial streaming support to the Responses API.
This PR makes sure that the _first_ inference call made to chat
completions streams out.
There's more to be done:
- tool call output tokens need to stream out when possible
- we need to loop through multiple rounds of inference and they all need
to stream out.
## Test Plan
Added a test. Executed as:
```
FIREWORKS_API_KEY=... \
pytest -s -v 'tests/verifications/openai_api/test_responses.py' \
--provider=stack:fireworks --model meta-llama/Llama-4-Scout-17B-16E-Instruct
```
Then, started a llama stack fireworks distro and tested against it like
this:
```
OPENAI_API_KEY=blah \
pytest -s -v 'tests/verifications/openai_api/test_responses.py' \
--base-url http://localhost:8321/v1/openai/v1 \
--model meta-llama/Llama-4-Scout-17B-16E-Instruct
```
# What does this PR do?
Handles the case where the vllm config `tls_verify` is set to `false` or
`true`.
Closes: https://github.com/meta-llama/llama-stack/issues/2283
Signed-off-by: Sébastien Han <seb@redhat.com>
# What does this PR do?
It's not used anywhere in the build process. It's an ancient artifact from
an old attempt at using sub-packages to build distros.
## Test Plan
N/A
Signed-off-by: Sébastien Han <seb@redhat.com>
# What does this PR do?
This is not a core dependency of the distro server. It's only necessary
when using `inline::rag-runtime` or `inline::meta-reference` providers.
Signed-off-by: Sébastien Han <seb@redhat.com>
# What does this PR do?
Fixes an issue where running `llama stack build --template ollama
--image-type venv --run` fails with a TypeError when validating external
providers directory paths.
The error occurs because `os.path.exists()` is called with `Path(None)`
instead of converting it to a string first. This change ensures
consistent handling of `None` values for `external_providers_dir` across
both build and
[run](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/cli/stack/run.py#L134)
commands by using `str()` conversion before path validation.
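A minimal sketch of the kind of guard involved (illustrative only, not the exact CLI code):
```python
import os

def providers_dir_exists(external_providers_dir) -> bool:
    # Guard against None and convert to str before the filesystem check,
    # mirroring how the run command already handles this value.
    return external_providers_dir is not None and os.path.exists(str(external_providers_dir))
```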
## Test Plan
```bash
INFERENCE_MODEL=llama3.2:3b uv run --with llama-stack llama stack build --template ollama --image-type venv --run
```
Command completes successfully without TypeError
Two somewhat annoying fixes:
- we are going to always index tools for non-MCP toolgroups (like we used
to do), because there are random assumptions in our tests, etc., and I
don't want to fix them right now
- we need to handle the funny case of toolgroups like
`builtin::rag/knowledge_search` where we added the tool name to use in
the toolgroup itself.
# What does this PR do?
The `tls_verify` can now receive a path to a certificate file if the
endpoint requires it.
Signed-off-by: Sébastien Han <seb@redhat.com>
When registering a MCP endpoint, we cannot list tools (like we used to)
since the MCP endpoint may be behind an auth wall. Registration can
happen much sooner (via run.yaml).
Instead, we do listing only when the _user_ actually calls listing.
Furthermore, we cache the list in-memory in the server. Currently, the
cache is not invalidated -- we may want to periodically re-list for MCP
servers. Note that they must call `list_tools` before calling
`invoke_tool` -- we use this critically.
This will enable us to list MCP servers in run.yaml
## Test Plan
Existing tests, updated tests accordingly.
Getting this error from pypi of late
```
'python-requests/2.32.3 User-Agents are currently blocked from accessing JSON release resources. A cluster is apparently crawling all project/release resources resulting in excess cache misses. Please contact admin@pypi.org if you have information regarding what this software may be.'
```
The test depends on llama's tool calling ability. In the CI, we run with
a small ollama model.
The fix might be to check for either message or function_call because
the model is flaky and we aren't really testing that behavior?
# What does this PR do?
This fixes a high vulnerable CVE in `setuptools`:
https://github.com/advisories/GHSA-5rjg-fvgr-3xxf
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
Co-authored-by: Francisco Arceo <arceofrancisco@gmail.com>
# What does this PR do?
TSIA
`--enable-ui` to enable
## Test Plan
`llama stack run dev --image-type conda --enable-ui`
`localhost:8322` shows UI
`llama stack run dev --image-type conda`
`localhost:8322` does not work
# What does this PR do?
This is not part of the official OpenAI API, but we'll use this for the
logs UI.
In order to support more filtering options, I'm adopting the newly
introduced sql store in place of the kv store.
## Test Plan
Added integration/unit tests.
Having to run (and re-run) a server while running verifications can be
annoying while you are iterating on code. This makes it so you can use
the library client -- and because it is OpenAI client compatible, it all
works.
## Test Plan
```
pytest -s -v tests/verifications/openai_api/test_responses.py \
--provider=stack:together \
--model meta-llama/Llama-4-Scout-17B-16E-Instruct
```
The most interesting MCP servers are those with an authorization wall in
front of them. This PR uses the existing `provider_data` mechanism, used
for passing provider API keys, to pass MCP access tokens (in fact,
arbitrary headers in the style of the OpenAI Responses API) from the
client through to the MCP server.
```
class MCPProviderDataValidator(BaseModel):
    # mcp_endpoint => list of headers to send
    mcp_headers: dict[str, list[str]] | None = None
```
Note how we must stuff the headers for all MCP endpoints into a single
"MCPProviderDataValidator". Unlike existing providers (e.g., Together
and Fireworks for inference) where we could name the provider api keys
clearly (`together_api_key`, `fireworks_api_key`), we cannot name these
keys for MCP. We have a single generic MCP provider which can serve
multiple "toolgroups". So we use a dict to combine all the headers for
all MCP endpoints you may want to use in an agentic call.
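As a hedged usage sketch (the endpoint URL and token are placeholders, and the provider-data header name follows the existing convention but may differ in detail), a client could pass the headers like this:
```python
import json

import httpx

# Map each MCP endpoint to the headers that should be forwarded to it.
provider_data = {
    "mcp_headers": {
        "https://mcp.example.com/sse": ["Authorization: Bearer <mcp-access-token>"],
    }
}

resp = httpx.post(
    "http://localhost:8321/v1/openai/v1/responses",
    headers={"X-LlamaStack-Provider-Data": json.dumps(provider_data)},
    json={"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", "input": "hello"},
)
```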
## Test Plan
See the added integration test for usage.
# What does this PR do?
Since https://github.com/meta-llama/llama-stack/pull/2193 switched to the
openai sdk, we need to strip the 'openai/' prefix from the model_id.
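In other words, roughly (sketch, not the provider's exact code):
```python
def strip_openai_prefix(model_id: str) -> str:
    # e.g. "openai/gpt-4o-mini" -> "gpt-4o-mini" before calling the OpenAI SDK
    return model_id.removeprefix("openai/")
```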
## Test Plan
start server with openai provider and send a chat completion call
# What does this PR do?
* Provide sqlite implementation of the APIs introduced in
https://github.com/meta-llama/llama-stack/pull/2145.
* Introduced a SqlStore API: llama_stack/providers/utils/sqlstore/api.py
and the first Sqlite implementation
* Pagination support will be added in a future PR.
## Test Plan
Unit test on sql store:
<img width="1005" alt="image"
src="https://github.com/user-attachments/assets/9b8b7ec8-632b-4667-8127-5583426b2e29"
/>
Integration test:
```
INFERENCE_MODEL="llama3.2:3b-instruct-fp16" llama stack build --template ollama --image-type conda --run
```
```
LLAMA_STACK_CONFIG=http://localhost:5001 INFERENCE_MODEL="llama3.2:3b-instruct-fp16" python -m pytest -v tests/integration/inference/test_openai_completion.py --text-model "llama3.2:3b-instruct-fp16" -k 'inference_store and openai'
```
# What does this PR do?
Includes a SambaNova safety adapter that uses the SambaNova cloud-served
Meta-Llama-Guard-3-8B, plus minor updates to the SambaNova docs.
## Test Plan
```
pytest -s -v tests/integration/safety/test_safety.py \
  --stack-config=sambanova --safety-shield=sambanova/Meta-Llama-Guard-3-8B
```
# What does this PR do?
We now only print the 'active' routes, not all the possible routes. This
is based on the distribution server config by looking at enabled APIs
and their respective providers.
Signed-off-by: Sébastien Han <seb@redhat.com>
# What does this PR do?
Not sure why it passed CI earlier...
Strangely, only 24 workflows ran on
https://github.com/meta-llama/llama-stack/pull/2216, so the test never
ran...
Signed-off-by: Sébastien Han <seb@redhat.com>
# What does this PR do?
This PR introduces support for keyword-based FTS5 search with BM25
relevance scoring. It changes the existing EmbeddingIndex base class to
support `search_mode` and `query_str` parameters that can be used for
keyword-based search implementations.
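For illustration, this is roughly what FTS5 keyword search with BM25 ranking looks like at the SQLite level (standalone sketch, not the provider's actual schema; assumes SQLite is compiled with FTS5):
```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE VIRTUAL TABLE chunks USING fts5(chunk_id, content)")
conn.executemany(
    "INSERT INTO chunks VALUES (?, ?)",
    [("c0", "Sentence 0 from document 0"), ("c5", "Sentence 5 from document 0")],
)
# bm25() returns a relevance score where lower is better, so order ascending.
rows = conn.execute(
    "SELECT chunk_id, content, bm25(chunks) AS score "
    "FROM chunks WHERE chunks MATCH ? ORDER BY score LIMIT 5",
    ("sentence",),
).fetchall()
```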
## Test Plan
run
```
pytest llama_stack/providers/tests/vector_io/test_sqlite_vec.py -v -s --tb=short --disable-warnings --asyncio-mode=auto
```
Output:
```
pytest llama_stack/providers/tests/vector_io/test_sqlite_vec.py -v -s --tb=short --disable-warnings --asyncio-mode=auto
/Users/vnarsing/miniconda3/envs/stack-client/lib/python3.10/site-packages/pytest_asyncio/plugin.py:207: PytestDeprecationWarning: The configuration option "asyncio_default_fixture_loop_scope" is unset.
The event loop scope for asynchronous fixtures will default to the fixture caching scope. Future versions of pytest-asyncio will default the loop scope for asynchronous fixtures to function scope. Set the default fixture loop scope explicitly in order to avoid unexpected behavior in the future. Valid fixture loop scopes are: "function", "class", "module", "package", "session"
warnings.warn(PytestDeprecationWarning(_DEFAULT_FIXTURE_LOOP_SCOPE_UNSET))
====================================================== test session starts =======================================================
platform darwin -- Python 3.10.16, pytest-8.3.4, pluggy-1.5.0 -- /Users/vnarsing/miniconda3/envs/stack-client/bin/python
cachedir: .pytest_cache
metadata: {'Python': '3.10.16', 'Platform': 'macOS-14.7.4-arm64-arm-64bit', 'Packages': {'pytest': '8.3.4', 'pluggy': '1.5.0'}, 'Plugins': {'html': '4.1.1', 'metadata': '3.1.1', 'asyncio': '0.25.3', 'anyio': '4.8.0'}}
rootdir: /Users/vnarsing/go/src/github/meta-llama/llama-stack
configfile: pyproject.toml
plugins: html-4.1.1, metadata-3.1.1, asyncio-0.25.3, anyio-4.8.0
asyncio: mode=auto, asyncio_default_fixture_loop_scope=None
collected 7 items
llama_stack/providers/tests/vector_io/test_sqlite_vec.py::test_add_chunks PASSED
llama_stack/providers/tests/vector_io/test_sqlite_vec.py::test_query_chunks_vector PASSED
llama_stack/providers/tests/vector_io/test_sqlite_vec.py::test_query_chunks_fts PASSED
llama_stack/providers/tests/vector_io/test_sqlite_vec.py::test_chunk_id_conflict PASSED
llama_stack/providers/tests/vector_io/test_sqlite_vec.py::test_register_vector_db PASSED
llama_stack/providers/tests/vector_io/test_sqlite_vec.py::test_unregister_vector_db PASSED
llama_stack/providers/tests/vector_io/test_sqlite_vec.py::test_generate_chunk_id PASSED
```
For reference, with the implementation, the fts table looks like below:
```
Chunk ID: 9fbc39ce-c729-64a2-260f-c5ec9bb2a33e, Content: Sentence 0 from document 0
Chunk ID: 94062914-3e23-44cf-1e50-9e25821ba882, Content: Sentence 1 from document 0
Chunk ID: e6cfd559-4641-33ba-6ce1-7038226495eb, Content: Sentence 2 from document 0
Chunk ID: 1383af9b-f1f0-f417-4de5-65fe9456cc20, Content: Sentence 3 from document 0
Chunk ID: 2db19b1a-de14-353b-f4e1-085e8463361c, Content: Sentence 4 from document 0
Chunk ID: 9faf986a-f028-7714-068a-1c795e8f2598, Content: Sentence 5 from document 0
Chunk ID: ef593ead-5a4a-392f-7ad8-471a50f033e8, Content: Sentence 6 from document 0
Chunk ID: e161950f-021f-7300-4d05-3166738b94cf, Content: Sentence 7 from document 0
Chunk ID: 90610fc4-67c1-e740-f043-709c5978867a, Content: Sentence 8 from document 0
Chunk ID: 97712879-6fff-98ad-0558-e9f42e6b81d3, Content: Sentence 9 from document 0
Chunk ID: aea70411-51df-61ba-d2f0-cb2b5972c210, Content: Sentence 0 from document 1
Chunk ID: b678a463-7b84-92b8-abb2-27e9a1977e3c, Content: Sentence 1 from document 1
Chunk ID: 27bd63da-909c-1606-a109-75bdb9479882, Content: Sentence 2 from document 1
Chunk ID: a2ad49ad-f9be-5372-e0c7-7b0221d0b53e, Content: Sentence 3 from document 1
Chunk ID: cac53bcd-1965-082a-c0f4-ceee7323fc70, Content: Sentence 4 from document 1
```
Query results:
```
Result 1: Sentence 5 from document 0
Result 2: Sentence 5 from document 1
Result 3: Sentence 5 from document 2
```
---------
Signed-off-by: Varsha Prasad Narsing <varshaprasad96@gmail.com>
# What does this PR do?
* remove requirements.txt to use pyproject.toml as the source of truth
* update relevant docs
Signed-off-by: Sébastien Han <seb@redhat.com>
# What does this PR do?
Use a composite action to avoid similar steps repetitions and
centralization of the defaults.
Signed-off-by: Sébastien Han <seb@redhat.com>
# What does this PR do?
The cache_ttl config value is not in fact tied to the lifetime of any of
the keys; it represents the time interval between runs of our key cache
refresher.
Signed-off-by: Sébastien Han <seb@redhat.com>
# What does this PR do?
Kubernetes since 1.20 exposes a JWKS endpoint that we can use with our
recent OAuth2 implementation.
The CI test has been kept intact for validation.
Signed-off-by: Sébastien Han <seb@redhat.com>
# What does this PR do?
feat(quota): add server‑side per‑client request quotas (requires auth)
Unrestricted usage can lead to runaway costs and fragmented client-side
workarounds. This commit introduces a native quota mechanism to the
server, giving operators a unified, centrally managed throttle for
per-client requests—without needing extra proxies or custom client
logic. This helps contain cloud-compute expenses, enables fine-grained
usage control, and simplifies deployment and monitoring of Llama Stack
services. Quotas are fully opt-in and have no effect unless explicitly
configured.
Note that quotas are fully opt-in and require authentication to be
enabled. 'sqlite' is the only supported quota `type` at this time; any
other `type` will be rejected. The only supported `period` is 'day'.
Highlights:
- Adds `QuotaMiddleware` to enforce per-client request quotas:
- Uses `Authorization: Bearer <client_id>` (from
AuthenticationMiddleware)
- Tracks usage via a SQLite-based KV store
- Returns 429 when the quota is exceeded
- Extends `ServerConfig` with a `quota` section (type + config)
- Enforces strict coupling: quotas require authentication or the server
will fail to start
Behavior changes:
- Quotas are disabled by default unless explicitly configured
- SQLite defaults to `./quotas.db` if no DB path is set
- The server requires authentication when quotas are enabled
To enable per-client request quotas in `run.yaml`, add:
```
server:
port: 8321
auth:
provider_type: "custom"
config:
endpoint: "https://auth.example.com/validate"
quota:
type: sqlite
config:
db_path: ./quotas.db
limit:
max_requests: 1000
period: day
```
Closes #2093
## Test Plan
Signed-off-by: Wen Liang <wenliang@redhat.com>
Co-authored-by: Wen Liang <wenliang@redhat.com>
# What does this PR do?
```
llama stack rm llamastack-test
```
#225
## Test Plan
# What does this PR do?
This adds an alternative option to the oauth_token auth provider that
can be used with existing authorization services which support token
introspection as defined in RFC 7662. This could be useful where token
revocation needs to be handled or where opaque tokens (or other non-JWT
formatted tokens) are used.
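At its core, RFC 7662 introspection is a POST to the authorization server, roughly like this (sketch; the endpoint URL and client credentials are placeholders, and the provider's config keys may differ):
```python
import httpx

async def introspect_token(token: str) -> bool:
    # RFC 7662: the "active" field of the response says whether the token is
    # currently valid (i.e. not expired or revoked).
    async with httpx.AsyncClient() as client:
        resp = await client.post(
            "https://auth.example.com/realms/demo/protocol/openid-connect/token/introspect",
            data={"token": token},
            auth=("llama-stack", "<client-secret>"),  # placeholder client credentials
        )
        resp.raise_for_status()
        return bool(resp.json().get("active", False))
```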
## Test Plan
Tested against keycloak
Signed-off-by: Gordon Sim <gsim@redhat.com>
# What does this PR do?
This PR adds a lock to coordinate concurrent coroutines passing through
the jwt verification. As _refresh_jwks() was setting _jwks to an empty
dict then repopulating it, having multiple coroutines doing this
concurrently risks losing keys. The PR also builds the updated dict as a
separate object and assigns it to _jwks once completed. This avoids
impacting any coroutines using the key set as it is being updated.
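A minimal sketch of the described pattern (not the exact provider code):
```python
import asyncio

class JWKSCache:
    def __init__(self, fetch_keys):
        self._fetch_keys = fetch_keys  # async callable returning the JWKS key list
        self._jwks: dict[str, dict] = {}
        self._lock = asyncio.Lock()

    async def refresh(self) -> None:
        # Serialize refreshes and build the replacement dict as a separate
        # object, so readers never see a half-populated key set.
        async with self._lock:
            new_jwks = {key["kid"]: key for key in await self._fetch_keys()}
            self._jwks = new_jwks
```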
Signed-off-by: Gordon Sim <gsim@redhat.com>
# What does this PR do?
Add support for "instructions" to the responses API. Instructions
provide a way to swap out system (or developer) messages in new
responses.
## Test Plan
unit tests added
Signed-off-by: Derek Higgins <derekh@redhat.com>
# What does this PR do?
When launching a fine-tuning job, an upcoming version of NeMo Customizer
will expect the `config` name to be formatted as
`namespace/name@version`. Here, `config` is a reference to a model +
additional metadata. There could be multiple `config`s that reference
the same base model.
This PR updates NVIDIA's `supervised_fine_tune` to simply pass the
`model` param as-is to NeMo Customizer. Currently, it expects a
specific, allowlisted llama model (i.e. `meta/Llama3.1-8B-Instruct`) and
converts it to the provider format (`meta/llama-3.1-8b-instruct`).
## Test Plan
From a notebook, I built an image with my changes:
```
!llama stack build --template nvidia --image-type venv
from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
client = LlamaStackAsLibraryClient("nvidia")
client.initialize()
```
And could successfully launch a job:
```
response = client.post_training.supervised_fine_tune(
job_uuid="",
model="meta/llama-3.2-1b-instruct@v1.0.0+A100", # Model passed as-is to Customimzer
...
)
job_id = response.job_uuid
print(f"Created job with ID: {job_id}")
Output:
Created job with ID: cust-Jm4oGmbwcvoufaLU4XkrRU
```
---------
Co-authored-by: Jash Gulabrai <jgulabrai@nvidia.com>
# What does this PR do?
chore: Updated readme
## Test Plan
Signed-off-by: Francisco Javier Arceo <farceo@redhat.com>
# What does this PR do?
If a user has previously serialized data into their vector store without
the `metadata_token_count` in the chunk, the `query` method will fail with
a server error. This fixes that edge case by returning 0 when the key is
not detected. This solution is suboptimal, but I think it's better to
understate the token size rather than recalculate it and add unnecessary
complexity to the retrieval code.
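The fix boils down to a defaulted lookup along these lines (illustrative; the exact field holding the count may differ):
```python
def metadata_tokens(chunk_metadata: dict) -> int:
    # Chunks serialized before this field existed simply contribute 0 tokens,
    # understating rather than recalculating the token count.
    return chunk_metadata.get("metadata_token_count", 0)
```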
## Test Plan
Signed-off-by: Francisco Javier Arceo <farceo@redhat.com>
# What does this PR do?
This extracts the W3C trace context headers (traceparent and tracestate)
from incoming requests, stuffs them as attributes on the spans we
create, and uses them within the tracing provider implementation to
actually wrap our spans in the proper context.
What this means in practice is that when a client (such as an OpenAI
client) is instrumented to create these traces, we'll continue that
distributed trace within Llama Stack as opposed to creating our own root
span that breaks the distributed trace between client and server.
It's slightly awkward to do this in Llama Stack because our Tracing API
knows nothing about opentelemetry, W3C trace headers, etc - that's only
knowledge the specific provider implementation has. So, that's why the
trace headers get extracted in the server code but not actually used
until the provider implementation forms the proper context.
This also centralizes how we were adding the `__root__` and
`__root_span__` attributes, as those two were being added in different
parts of the code instead of from a single place.
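Conceptually, the server-side extraction looks something like this (hedged sketch, not the actual Llama Stack code):
```python
def trace_context_attributes(headers: dict[str, str]) -> dict[str, str]:
    # Pull the W3C trace context headers off the incoming request; the tracing
    # provider later uses them to attach our spans to the caller's trace.
    attrs = {}
    for name in ("traceparent", "tracestate"):
        value = headers.get(name)
        if value:
            attrs[name] = value
    return attrs
```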
Closes #2097
## Test Plan
This was tested manually using the helpful scripts from #2097. I
verified that Llama Stack properly joined the client's span when the
client was instrumented for distributed tracing, and that Llama Stack
properly started its own root span when the incoming request was not
part of an existing trace.
Here's an example of the joined spans:

Signed-off-by: Ben Browning <bbrownin@redhat.com>
# What does this PR do?
The `external_config_dir` configuration parameter is not being passed to
the `BuildConfig` for `LlamaStackAsLibraryClient`.
This prevents _plugin_ providers from being loaded when `llama-stack` is
used as a library.
## Test Plan
I ran `LlamaStackAsLibraryClient` with a configuration file that
contained `external_config_dir` and related configuration.
It does not work without this change: _external_ providers are not
resolved.
It does work with this change 👍
# What does this PR do?
This PR introduces APIs to retrieve past chat completion requests, which
will be used in the LS UI.
Our current `Telemetry` is ill-suited for this purpose as it's untyped
so we'd need to filter by obscure attribute names, making it brittle.
Since these APIs are 'provided by stack' and don't need to be
implemented by inference providers, we introduce a new InferenceProvider
class, containing the existing inference protocol, which is implemented
by inference providers.
The APIs are OpenAI-compliant, with an additional `input_messages`
field.
## Test Plan
This PR just adds the APIs and marks them provided_by_stack.
Start the stack server -> it doesn't crash.
This PR adds a notion of `principal` (aka some kind of persistent
identity) to the authentication infrastructure of the Stack. Until now
we only used access attributes ("claims" in the more standard OAuth /
OIDC setup) but we need the notion of a User fundamentally as well.
(Thanks @rhuss for bringing this up.)
This value is not yet _used_ anywhere downstream but will be used to
segregate access to resources.
In addition, the PR introduces a built-in JWT token validator so the
Stack does not need to contact an authentication provider to validate
the authorization and can merely check the signed token for the represented
claims. Public keys are refreshed via the configured JWKS server. This
Auth Provider should overwhelmingly be considered the default given the
seamless integration it offers with OAuth setups.
Closes #2162
## Test Plan
run `llama stack build --image-name ollama --image-type
<venv/conda/container> --config llama_stack/templates/ollama/build.yaml`
and verify venv | conda | container are built.
# What does this PR do?
Adds an inline HF SFTTrainer provider. Alongside torchtune, this is a
super popular option for running training jobs. The config allows a user
to specify some key fields such as a model, chat_template, device, etc.
The provider comes with one recipe, `finetune_single_device`, which works
both with and without LoRA.
Any model that is a valid HF identifier can be given and the model will
be pulled.
This has been tested so far with CPU and MPS device types, but should be
compatible with CUDA out of the box.
The provider processes the given dataset into the proper format,
establishes the various steps per epoch, steps per save, steps per eval,
sets a sane SFTConfig, and runs n_epochs of training.
If checkpoint_dir is None, no model is saved. If there is a checkpoint
dir, a model is saved every `save_steps` and at the end of training.
## Test Plan
Re-enabled the post_training integration test suite with a singular test
that loads the simpleqa dataset
(https://huggingface.co/datasets/llamastack/simpleqa) and a tiny granite
model (https://huggingface.co/ibm-granite/granite-3.3-2b-instruct). The
test now uses the llama stack client and the proper post_training API, and
runs one step with a batch_size of 1. This test runs on CPU on the
Ubuntu runner so it needs to be a small batch and a single step.
---------
Signed-off-by: Charlie Doern <cdoern@redhat.com>
# What does this PR do?
Fixes #2188
## Test Plan
`INFERENCE_MODEL=meta-llama/Llama-3.3-70B-Instruct llama stack build
--image-name ollama --image-type conda --template ollama --run` without
error
# What does this PR do?
start_stack.sh was using --yaml-config, which is deprecated.
A bunch of distro docs also mentioned --yaml-config. Replaces all
instances and logic for --yaml-config with --config.
Resolves #2189
Signed-off-by: Charlie Doern <cdoern@redhat.com>
# What does this PR do?
It may not always be desirable to listen on all interfaces, which is the
default. As an example, by listening instead only on a loopback
interface, the server cannot be reached except from within the host it
is run on. This PR makes this configurable, through a CLI option, an env
var or an entry on the config file.
## Test Plan
I ran a server with and without the added CLI argument to verify that
the argument is used if provided, but the default is as it was before if
not.
Signed-off-by: Gordon Sim <gsim@redhat.com>
# What does this PR do?
Fixes #2121
This implementation splits responsibility between the litellm and openai
libraries:
| Inference Method | Implementation Source |
|----------------------------|--------------------------|
| completion | LiteLLMOpenAIMixin |
| chat_completion | LiteLLMOpenAIMixin |
| embedding | LiteLLMOpenAIMixin |
| batch_completion | LiteLLMOpenAIMixin |
| batch_chat_completion | LiteLLMOpenAIMixin |
| openai_completion | AsyncOpenAI |
| openai_chat_completion | AsyncOpenAI |
## Test Plan
smoke test with -
```
$ OPENAI_API_KEY=$LLAMA_API_KEY OPENAI_BASE_URL=https://api.llama.com/compat/v1 llama stack build --image-type conda --image-name openai --providers inference=remote::openai --run
$ llama-stack-client models register Llama-4-Scout-17B-16E-Instruct-FP8
$ curl "http://localhost:8321/v1/openai/v1/chat/completions" -H "Content-Type: application/json" \ -d '{
"model": "Llama-4-Scout-17B-16E-Instruct-FP8",
"messages": [
{"role": "user", "content": "Hello Llama! Can you give me a quick intro?"}
]
}'
{"id":"AmPwrrkc5JgVjejPdIPrpT2","choices":[{"finish_reason":"stop","index":0,"logprobs":{"content":null,"refusal":null},"message":{"content":"Hello! I'm Llama, a Meta-designed model that adapts to your conversational style. Whether you need quick answers, deep dives into ideas, or just want to vent, joke, or brainstorm—I'm here for it. What’s on your mind?","refusal":"","role":"assistant","annotations":null,"audio":null,"function_call":null,"tool_calls":null,"id":"AmPwrrkc5JgVjejPdIPrpT2"}}],"created":1747410061,"model":"Llama-4-Scout-17B-16E-Instruct-FP8","object":"chat.completions","service_tier":null,"system_fingerprint":null,"usage":{"completion_tokens":54,"prompt_tokens":22,"total_tokens":76,"completion_tokens_details":null,"prompt_tokens_details":null}}
```
and run full test suite.
# What does this PR do?
## Test Plan
```
LLAMA_STACK_CONFIG=http://localhost:8321 pytest \
  tests/integration/tool_runtime/test_rag_tool.py \
  --embedding-model text-embedding-3-small
```
# What does this PR do?
This PR adds a few enhancements:
- Dark mode
- A dark mode icon
- Adds a link to the API documentation
- Adds prettier and a linter to the code
- Aligning the default text
- Linted the code
## Before:

## After (dark mode):

## Test Plan
Related to https://github.com/meta-llama/llama-stack/issues/2085
---------
Signed-off-by: Francisco Javier Arceo <farceo@redhat.com>
We want this to be a "flagship" distribution we can advertise to a
segment of users to get started quickly. This distro should package a
bunch of remote providers and some cheap inline providers so they get a
solid "AI Platform in a box" setup instantly.
# What does this PR do?
This fixes an issue in how we used the tool_call_buf from streaming tool
calls in the remote-vllm provider where it would end up concatenating
parameters from multiple different tool call results instead of
aggregating the results from each tool call separately.
It also fixes an issue found while digging into that where we were
accidentally mixing the json string form of tool call parameters with
the string representation of the python form, which meant we'd end up
with single quotes in what should be double-quoted json strings.
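The gist of the fix is keeping a separate buffer per tool call index instead of one shared buffer (sketch with hypothetical names):
```python
from collections import defaultdict

# tool call index -> accumulated JSON argument string for that call only
tool_call_bufs: dict[int, str] = defaultdict(str)

def on_tool_call_delta(index: int, arguments_delta: str | None) -> None:
    # Aggregate each tool call's streamed arguments separately so parameters
    # from different tool calls never get concatenated together.
    tool_call_bufs[index] += arguments_delta or ""
```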
Closes #1120
## Test Plan
The following tests are now passing 100% for the remote-vllm provider,
where some of the test_text_inference were failing before this change:
```
VLLM_URL="http://localhost:8000/v1" INFERENCE_MODEL="RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic" LLAMA_STACK_CONFIG=remote-vllm python -m pytest -v tests/integration/inference/test_text_inference.py --text-model "RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic"
VLLM_URL="http://localhost:8000/v1" INFERENCE_MODEL="RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic" LLAMA_STACK_CONFIG=remote-vllm python -m pytest -v tests/integration/inference/test_vision_inference.py --vision-model "RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic"
```
All but one of the agent tests are passing (including the multi-tool
one). See the PR at https://github.com/vllm-project/vllm/pull/17917 and
a gist at
https://gist.github.com/bbrowning/4734240ce96b4264340caa9584e47c9e for
changes needed there, which will have to get made upstream in vLLM.
Agent tests:
```
VLLM_URL="http://localhost:8000/v1" INFERENCE_MODEL="RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic" LLAMA_STACK_CONFIG=remote-vllm python -m pytest -v tests/integration/agents/test_agents.py --text-model "RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic"
```
---------
Signed-off-by: Ben Browning <bbrownin@redhat.com>
# What does this PR do?
We added:
* make sure docstrings are present with 'params' and 'returns'
* fail if someone sets 'returns: None'
* fix the failing APIs
Signed-off-by: Sébastien Han <seb@redhat.com>
# What does this PR do?
currently the "default" dir for external providers is
`/etc/llama-stack/providers.d`
This dir is not used anywhere nor created.
Switch to a more friendly `~/.llama/providers.d/`
This allows external providers to actually create this dir and/or
populate it upon installation, `pip` cannot create directories in `etc`.
If a user does not specify a dir, default to this one
see https://github.com/containers/ramalama-stack/issues/36
Signed-off-by: Charlie Doern <cdoern@redhat.com>
# What does this PR do?
Currently the website only displays the "latest" version. This is
because our config and workflow do not include version information. This
PR adds missing version info.
---------
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
# What does this PR do?
This allows users to print the usage information for this script:
```
📚 Llama-Stack Deployment Script
Description:
This script sets up and deploys Llama-Stack with Ollama integration in containers.
It handles both Docker and Podman runtimes and includes automatic platform detection.
Usage:
install.sh [OPTIONS]
Options:
-p, --port PORT Server port for Llama-Stack (default: 8321)
-o, --ollama-port PORT Ollama service port (default: 11434)
-m, --model MODEL Model alias to use (default: llama3.2:3b)
-i, --image IMAGE Server image (default: llamastack/distribution-ollama:0.2.2)
-t, --timeout SECONDS Service wait timeout in seconds (default: 300)
-h, --help Show this help message
For more information:
Documentation: https://llama-stack.readthedocs.io/
GitHub: https://github.com/meta-llama/llama-stack
Report issues:
https://github.com/meta-llama/llama-stack/issues
```
---------
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
Co-authored-by: Sébastien Han <seb@redhat.com>
# What does this PR do?
This PR allows users to customize the template used for chunks when
inserted into the context. Additionally, this enables metadata injection
into the context of an LLM for RAG. This makes a naive and crude
assumption that each chunk should include the metadata, which is
obviously redundant when multiple chunks are returned from the same
document. In order to remove any sort of duplication of chunks, we'd
have to make much more significant changes so this is a reasonable first
step that unblocks users requesting this enhancement in
https://github.com/meta-llama/llama-stack/issues/1767.
In the future, this can be extended to support citations.
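For instance, a query config might override the template along these lines (hedged sketch; the import path is assumed from the file list below and the exact signature may differ):
```python
from llama_stack.apis.tools.rag_tool import RAGQueryConfig  # path assumed from this PR

# The template must keep the {index} and {chunk.content} placeholders.
config = RAGQueryConfig(chunk_template="Result {index}\nContent: {chunk.content}\n")
```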
List of Changes:
- `llama_stack/apis/tools/rag_tool.py`
- Added `chunk_template` field in `RAGQueryConfig`.
- Added `field_validator` to validate the `chunk_template` field in
`RAGQueryConfig`.
- Ensured the `chunk_template` field includes placeholders `{index}` and
`{chunk.content}`.
- Updated the `query` method to use the `chunk_template` for formatting
chunk text content.
- `llama_stack/providers/inline/tool_runtime/rag/memory.py`
- Modified the `insert` method to pass `doc.metadata` for chunk
creation.
- Enhanced the `query` method to format results using `chunk_template`
and exclude unnecessary metadata fields like `token_count`.
- `llama_stack/providers/utils/memory/vector_store.py`
- Updated `make_overlapped_chunks` to include metadata serialization and
token count for both content and metadata.
- Added error handling for metadata serialization issues.
- `pyproject.toml`
- Added `pydantic.field_validator` as a recognized `classmethod`
decorator in the linting configuration.
- `tests/integration/tool_runtime/test_rag_tool.py`
- Refactored test assertions to separate `assert_valid_chunk_response`
and `assert_valid_text_response`.
- Added integration tests to validate `chunk_template` functionality
with and without metadata inclusion.
- Included a test case to ensure `chunk_template` validation errors are
raised appropriately.
- `tests/unit/rag/test_vector_store.py`
- Added unit tests for `make_overlapped_chunks`, verifying chunk
creation with overlapping tokens and metadata integrity.
- Added tests to handle metadata serialization errors, ensuring proper
exception handling.
- `docs/_static/llama-stack-spec.html`
- Added a new `chunk_template` field of type `string` with a default
template for formatting retrieved chunks in RAGQueryConfig.
- Updated the `required` fields to include `chunk_template`.
- `docs/_static/llama-stack-spec.yaml`
- Introduced `chunk_template` field with a default value for
RAGQueryConfig.
- Updated the required configuration list to include `chunk_template`.
- `docs/source/building_applications/rag.md`
- Documented the `chunk_template` configuration, explaining how to
customize metadata formatting in RAG queries.
- Added examples demonstrating the usage of the `chunk_template` field
in RAG tool queries.
- Highlighted default values for `RAG` agent configurations.
# Resolves https://github.com/meta-llama/llama-stack/issues/1767
## Test Plan
Updated both `test_vector_store.py` and `test_rag_tool.py` and tested
end-to-end with a script.
I also tested the quickstart to enable this and specified this metadata:
```python
document = RAGDocument(
document_id="document_1",
content=source,
mime_type="text/html",
metadata={"author": "Paul Graham", "title": "How to do great work"},
)
```
Which produced the output below:

This highlights the usefulness of the additional metadata. Notice how
the metadata is redundant for different chunks of the same document. I
think we can update that in a subsequent PR.
## Documentation
I've added a brief comment about this in the documentation to outline
this to users and updated the API documentation.
---------
Signed-off-by: Francisco Javier Arceo <farceo@redhat.com>
# What does this PR do?
Introduces scaffolding for Llama Stack's UI. Created with next.js and
https://ui.shadcn.com/.
1. Initialized directory with `npx shadcn@latest init`
2. Added sidebar component `npx shadcn@latest add sidebar` and added
menu items for chat completions and responses.
3. Placeholder pages for each.
## Test Plan
`npm run dev`
<img width="1058" alt="image"
src="https://github.com/user-attachments/assets/5695a53f-e22e-418e-80d1-5bf0ae9b6fe8"
/>
# What does this PR do?
In the Responses API, we convert incoming response requests to chat
completion requests. When streaming the resulting chunks of those chat
completion requests, inference providers that use OpenAI clients will
often return a `type=None` value in the tool call parts of the response.
This causes issues when we try to dump and load that response into our
pydantic model, because type cannot be None in the Responses API model
we're loading these into.
So, strip the "type" field, if present, off those chat completion tool
call results before dumping and loading them as our typed pydantic
models, which will apply our default value for that type field.
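In essence the fix does something like this before re-validating the streamed chunks (hedged sketch with a stand-in pydantic model):
```python
from pydantic import BaseModel

class ToolCallDelta(BaseModel):
    # Stand-in for the typed Responses model the chunk is loaded into.
    type: str = "function"
    arguments: str | None = None

raw = {"type": None, "arguments": '{"city": "Paris"}'}  # some providers stream type=None
if raw.get("type") is None:
    raw.pop("type", None)  # let the model's default value apply instead
validated = ToolCallDelta.model_validate(raw)
```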
## Test Plan
This was found via manual testing of the Responses API with codex, where
I was getting errors in some tool call situations. I added a unit test
to simulate this scenario and verify the fix, as well as manual codex
testing to verify the fix.
Signed-off-by: Ben Browning <bbrownin@redhat.com>
Note: the openai provider exposes the litellm-specific model names to
the user. This change is compatible with that. The litellm names should
be deprecated.
# What does this PR do?
Closes #2113.
Closes #1783.
Fixes a bug in handling the end of tool execution request stream where
no `finish_reason` is provided by the model.
## Test Plan
1. Ran existing unit tests
2. Added a dedicated test verifying correct behavior in this edge case
3. Ran the code snapshot from #2113
# What does this PR do?
Most third-party actions use hashes for pinning, but not all.
Do proper hash pinning on all remaining actions that use tags.
Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
# What does this PR do?
Closes #2111.
Fixes an error causing Llama Stack to just return `<tool_call>` and
complete the turn without actually executing the tool. See the issue
description for more detail.
## Test Plan
1) Ran existing unit tests
2) Added a dedicated test verifying correct behavior in this edge case
3) Ran the code snapshot from #2111
The provider selling point *is* using the same provider for both.
Signed-off-by: Ihar Hrachyshka <ihar.hrachyshka@gmail.com>
Prevent it from returning results about 'LT Wright Maverick Scout'
knives. Ultimately we want the word "model" in the returned results;
putting "llm" in the search term makes this more likely.
Closes: #2150
Signed-off-by: Derek Higgins <derekh@redhat.com>
# What does this PR do?
This is a combination of what was previously 3 separate PRs - #2069,
#2075, and #2083. It turns out all 3 of those are needed to land a
working function calling Responses implementation. The web search
builtin tool was already working, but this wires in support for custom
function calling.
I ended up combining all three into one PR because they all had lots of
merge conflicts, both with each other but also with #1806 that just
landed. And, because landing any of them individually would have only
left a partially working implementation merged.
The new things added here are:
* Storing of input items from previous responses and restoring of those
input items when adding previous responses to the conversation state
* Handling of multiple input item messages roles, not just "user"
messages.
* Support for custom tools passed into the Responses API to enable
function calling outside of just the builtin websearch tool.
Closes #2074. Closes #2080.
## Test Plan
### Unit Tests
Several new unit tests were added, and they all pass. Ran via:
```
python -m pytest -s -v tests/unit/providers/agents/meta_reference/test_openai_responses.py
```
### Responses API Verification Tests
I ran our verification run.yaml against multiple providers to ensure we
were getting a decent pass rate. Specifically, I ensured the new custom
tool verification test passed across multiple providers and that the
multi-turn examples passed across at least some of the providers (some
providers struggle with the multi-turn workflows still).
Running the stack setup for verification testing:
```
llama stack run --image-type venv tests/verifications/openai-api-verification-run.yaml
```
Together, passing 100% as an example:
```
pytest -s -v 'tests/verifications/openai_api/test_responses.py' --provider=together-llama-stack
```
## Documentation
We will need to start documenting the OpenAI APIs, but for now the
Responses stuff is still rapidly evolving so delaying that.
---------
Signed-off-by: Derek Higgins <derekh@redhat.com>
Signed-off-by: Ben Browning <bbrownin@redhat.com>
Co-authored-by: Derek Higgins <derekh@redhat.com>
Co-authored-by: Ashwin Bharambe <ashwin.bharambe@gmail.com>
# What does this PR do?
reduces duplication and centralizes information to be easier to find for
contributors
Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
# What does this PR do?
While adding other tests, I came across this and wasn’t sure how useful
it is. It doesn’t seem to be exercised anywhere in CI, but I figured I’d
fix it anyway. Happy to remove it if preferred. :)
## Test Plan
Run:
```
uv run pytest tests/integration/inference --stack-config=ollama --report=test_report.md -v --text-model="llama3.2:3b" --embedding-model=all-MiniLM-L6-v2
```
Look at the produced `test_report.md`.
Signed-off-by: Sébastien Han <seb@redhat.com>
# What does this PR do?
This adds a config option for a CA to be specified with which client
certs are verified. If specified, client certs are required. This offers
a simple way of securing access to the server.
(Note: at present it is not possible to access the details of the client
certificate using uvicorn (unless it was monkey patched). Though there
is a defined TLS extension for ASGI, this is not implemented in uvicorn
pending a review and likely change to the specification. See
https://github.com/encode/uvicorn/pull/1119 and
https://github.com/django/asgiref/issues/466. Without access to the DN
it isn't possible to set user access attributes for a mutually
authenticated TLS connection, so more fine-grained access control is
not yet possible).
## Test Plan
Used proposed config option to specify a CA and verified that the server
can only be accessed with a valid client certificate.
Signed-off-by: Gordon Sim <gsim@redhat.com>
# What does this PR do?
The ollama provider was using an older variant of the code to convert
incoming parameters from the OpenAI API completions and chat completion
endpoints into requests that get sent to the backend provider over its
own OpenAI client. This updates it to use the common
`prepare_openai_completion_params` method used elsewhere, which takes
care of removing stray `None` values even for nested structures.
Without this, some other parameters, even if they have values of `None`,
make their way to ollama and actually influence its inference output as
opposed to when those parameters are not sent at all.
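The shared helper essentially filters out `None`-valued parameters before they reach the backend client; a simplified sketch of that behavior:
```python
def drop_none(value):
    # Recursively remove None entries (including in nested structures) so
    # unset parameters are omitted from the request rather than sent as nulls.
    if isinstance(value, dict):
        return {k: drop_none(v) for k, v in value.items() if v is not None}
    if isinstance(value, list):
        return [drop_none(v) for v in value if v is not None]
    return value
```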
## Test Plan
This passes tests/integration/inference/test_openai_completion.py and
fixes the issue found in #2098, which was tested via manual curl
requests crafted a particular way.
Closes #2098
Signed-off-by: Ben Browning <bbrownin@redhat.com>
# What does this PR do?
This new check will fail if some webmethods are missing the ellipsis:
```
API Method Return Type Validation Errors:
Method Api.eval.job_result does not contain ellipsis (...) in its implementation
Method Api.agents.create_agent_turn does not contain ellipsis (...) in its implementation
Method Api.agents.create_openai_response does not contain ellipsis (...) in its implementation
Method Api.eval.evaluate_rows does not contain ellipsis (...) in its implementation
Method Api.eval.run_eval does not contain ellipsis (...) in its implementation
```
Unless not implemented.
Signed-off-by: Sébastien Han <seb@redhat.com>
# What does this PR do?
This PR adds stubs to the end of functions create_agent_turn,
create_openai_response and job_result.
## Test Plan
Ran provided unit tests
```
$ INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
CHROMADB_URL=http://localhost:8000 \
llama stack build --image-type conda --image-name llama \
--providers vector_io=remote::chromadb,inference=remote::ollama \
--run
...
File ".../llama_stack/providers/remote/vector_io/chroma/chroma.py", line 31, in <module>
ChromaClientType = chromadb.AsyncHttpClient | chromadb.PersistentClient
TypeError: unsupported operand type(s) for |: 'function' and 'function'
```
Issue: AsyncHttpClient and PersistentClient are functions that return
AsyncClientAPI and ClientAPI types, respectively. `|` cannot be used to
construct a type from functions.
Previously the code was Union[AsyncHttpClient, PersistentClient], which
did not trigger an error.
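One plausible shape of the fix, building the alias from the returned API types instead of the factory functions (exact import location may vary by chromadb version):
```python
from chromadb.api import AsyncClientAPI, ClientAPI

# Union of the client types actually returned by AsyncHttpClient/PersistentClient.
ChromaClientType = AsyncClientAPI | ClientAPI
```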
# What does this PR do?
Closes #2135
# What does this PR do?
As per docs [1], since python 3.11 wait_for() raises TimeoutError. Since
we currently support python 3.10+, we have to catch both.
[1]:
https://docs.python.org/3.12/library/asyncio-task.html#asyncio.wait_for
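The hardening amounts to catching both exception types (on 3.11+ they are aliases, so this is harmless there); a minimal sketch:
```python
import asyncio

async def wait_with_timeout(coro, timeout: float):
    try:
        return await asyncio.wait_for(coro, timeout=timeout)
    except (asyncio.TimeoutError, TimeoutError):
        # Python 3.10 raises asyncio.TimeoutError; 3.11+ raises the builtin.
        return None
```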
## Test Plan
No explicit testing; just code hardening to reflect docs.
Signed-off-by: Ihar Hrachyshka <ihar.hrachyshka@gmail.com>
# What does this PR do?
This PR fixes the behavior of the `/tool-runtime/rag-tool/query`
endpoint when invoked with an empty `vector_db_ids` parameter.
As of now, it simply returns an empty result, which leads to a
misleading error message from the server and makes it difficult and
time-consuming to detect the problem with the input parameter.
The proposed fix is to return an indicative error message in this case.
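The fix amounts to an early guard along these lines (illustrative sketch; the message matches the patched output shown below):
```python
def validate_vector_db_ids(vector_db_ids: list[str]) -> None:
    # Fail fast instead of silently returning an empty result.
    if not vector_db_ids:
        raise ValueError(
            "No vector DBs were provided to the RAG tool. Please provide at least one DB."
        )
```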
## Test Plan
Running the following script:
```
agent = Agent(
client,
model=MODEL_ID,
instructions=SYSTEM_PROMPT,
tools=[
dict(
name="builtin::rag/knowledge_search",
args={
"vector_db_ids": [],
},
)
],
)
response = agent.create_turn(
messages=[
{
"role": "user",
"content": "How to install OpenShift?",
}
],
session_id=agent.create_session(f"rag-session")
)
```
results in the following error message in the non-patched version:
```
{"type": "function", "name": "knowledge_search", "parameters": {"query": "installing OpenShift"}}400: Invalid value: Tool call result (id: 494b8020-90bb-449b-aa76-10960d6b2cc2, name: knowledge_search) does not have any content
```
and in the following one in the patched version:
```
{"type": "function", "name": "knowledge_search", "parameters": {"query": "installing OpenShift"}}400: Invalid value: No vector DBs were provided to the RAG tool. Please provide at least one DB.
```
# What does this PR do?
Adds the API to query metrics from telemetry.
## Test Plan
llama stack run ~/.llama/distributions/fireworks/fireworks-run.yaml
---------
Co-authored-by: Ashwin Bharambe <ashwin.bharambe@gmail.com>
# What does this PR do?
We are dropping configuration via CLI flags almost entirely. If any
server configuration has to be tweaked, it must be done through the server
section in the run.yaml.
This is unfortunately a breaking change for whoever was using:
* `--tls-*`
* `--disable_ipv6`
`--port` stays around and gets special treatment since we believe it's
common for developers to change the port for quick experimentation.
Closes: https://github.com/meta-llama/llama-stack/issues/1076
## Test Plan
Simply do `llama stack run <config>` nothing should break :)
Signed-off-by: Sébastien Han <seb@redhat.com>
# What does this PR do?
In our OpenAI API verification tests, some providers were still calling
tools even when `tool_choice="none"` was passed in the chat completion
requests. Because they aren't all respecting `tool_choice` properly,
this adjusts our routing implementation to remove the `tools` and
`tool_choice` from the request if `tool_choice="none"` is passed in so
that it does not attempt to call any of those tools. Adjusting this in
the router fixes this across all providers.
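Conceptually the routing change is just (simplified sketch):
```python
def adjust_request_params(params: dict) -> dict:
    # Some providers still call tools despite tool_choice="none", so drop both
    # fields from the request before forwarding it.
    if params.get("tool_choice") == "none":
        params.pop("tools", None)
        params.pop("tool_choice", None)
    return params
```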
This also cleans up the non-streaming together.ai responses for tools,
ensuring it returns `None` instead of an empty list when there are no
tool calls, to exactly match the OpenAI API responses in that case.
## Test Plan
I observed existing failures in our OpenAI API verification suite - see
https://github.com/bbrowning/llama-stack-tests/blob/main/openai-api-verification/2025-04-27.md#together-llama-stack
for the failing `test_chat_*_tool_choice_none` tests. All streaming and
non-streaming variants were failing across all 3 tested models.
After this change, all of those 6 failing tests are now passing with no
regression in the other tests.
I verified this via:
```
llama stack run --image-type venv \
tests/verifications/openai-api-verification-run.yaml
```
```
python -m pytest -s -v \
'tests/verifications/openai_api/test_chat_completion.py' \
--provider=together-llama-stack
```
The entire verification suite is not 100% on together.ai yet, but it's
getting closer.
This also increased the pass rate for fireworks.ai, and did not regress
the groq or openai tests at all.
Signed-off-by: Ben Browning <bbrownin@redhat.com>
# What does this PR do?
Switch the SambaNova inference adapter to LiteLLM usage to simplify
integration and solve issues with the current adapter when streaming and
tool calling; models and templates updated.
## Test Plan
```
pytest -s -v tests/integration/inference/test_text_inference.py \
  --stack-config=sambanova \
  --text-model=sambanova/Meta-Llama-3.3-70B-Instruct
pytest -s -v tests/integration/inference/test_vision_inference.py \
  --stack-config=sambanova \
  --vision-model=sambanova/Llama-3.2-11B-Vision-Instruct
```
# What does this PR do?
We don't have GA workflow enabled to proceed with automation so I am
doing this manually again.
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
# What does this PR do?
Checks for RAGDocument of type InterleavedContent
I noticed when stepping through the code that the supported types for
`RAGDocument` included `InterleavedContent` as a content type. This type
is not checked before `doc.content` is regex-matched, which would cause a
runtime error. This change adds an explicit check for the type.
The only other part that I'm unclear on is how to handle the
`ImageContent` type since this would always just return `<image>` which
seems like an undesired behavior. Should the `InterleavedContent` type
be removed from `RAGDocument` and replaced with `URI | str`?
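For illustration, the added check amounts to something like the following sketch (the function name and error message are illustrative, not the exact implementation):
```python
# Sketch: only plain string content can safely be regex matched; anything else
# should fail with a clear error instead of a confusing one at match time.
def content_as_text(content) -> str:
    if isinstance(content, str):
        return content
    raise ValueError(
        f"Unsupported RAGDocument content type {type(content).__name__}; expected a string"
    )


print(content_as_text("OpenShift installation guide"))  # ok
# content_as_text(["interleaved", "content"])           # raises ValueError
```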
## Test Plan
[//]: # (## Documentation)
---------
Signed-off-by: Kevin <kpostlet@redhat.com>
# What does this PR do?
Mainly tried to cover the entire llama_stack/apis directory; we only
have one left. Some excludes were just no-ops.
Signed-off-by: Sébastien Han <seb@redhat.com>
# Issue
Closes#2073
# What does this PR do?
- Removes the `datasets.rst` from the list of document urls as it no
longer exists in torchtune. Referenced PR:
https://github.com/pytorch/torchtune/pull/1781
- Added a step to run `uv sync`. Previously, I would get the following
error:
```
➜ llama-stack git:(remove-deprecated-rst) uv venv --python 3.10
source .venv/bin/activate
Using CPython 3.10.13 interpreter at: /usr/bin/python3.10
Creating virtual environment at: .venv
Activate with: source .venv/bin/activate
(llama-stack) ➜ llama-stack git:(remove-deprecated-rst) INFERENCE_MODEL=llama3.2:3b llama stack build --template ollama --image-type venv --run
zsh: llama: command not found...
```
[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])
## Test Plan
To test: Run through `rag_agent` example in the `detailed_tutorial.md`
file.
[//]: # (## Documentation)
# What does this PR do?
Revert a change that by mistake forced efficiency_config on torchtune
provider
users.
```
fix: Don't require efficiency_config for torchtune
It was enforced by mistake when
0751a960a5 merged.
Other asserts made sense in that the code was written, potentially, to
always expect a non-None value. But not efficiency_config.
```
Signed-off-by: Ihar Hrachyshka <ihar.hrachyshka@gmail.com>
# What does this PR do?
Don't use unicode characters in the codebase. ASCII-only is preferred
for compatibility or readability reasons
Signed-off-by: Sébastien Han <seb@redhat.com>
# What does this PR do?
This PR introduces a reusable GitHub Actions workflow for pulling and
running an Ollama model, with caching to avoid repeated downloads.
[//]: # (If resolving an issue, uncomment and update the line below)
Closes: #1949
## Test Plan
1. Trigger a workflow that uses the Ollama setup. Confirm that:
- The model is pulled successfully.
- It is placed in the correct directory, the official one for now (not
~ollama/.ollama/models as per the comment, so this still needs confirming).
2. Re-run the same workflow to validate that:
- The model is restored from the cache.
- Execution succeeds with the cached model.
[//]: # (## Documentation)
# What does this PR do?
For issue #[2010](https://github.com/meta-llama/llama-stack/issues/2010).
Currently, if we try to connect the Llama Stack server to a remote
Milvus instance that has TLS enabled, the connection fails because TLS
support is not implemented in the Llama Stack codebase. As a result,
users are unable to use secured Milvus deployments out of the box.
After adding this, users will be able to connect to a remote::milvus
instance that has TLS enabled.
If TLS is enabled:
```
vector_io:
- provider_id: milvus
provider_type: remote::milvus
config:
uri: "http://<host>:<port>"
token: "<user>:<password>"
secure: True
server_pem_path: "path/to/server.pem"
```
[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])
## Test Plan
I have already tested it by connecting to a TLS-enabled Milvus instance,
and I was able to start the Llama Stack server.
# What does this PR do?
**Fixes** #1959
HuggingFace provides several loading paths that the datasets library can
use. My theory on why the test previously failed intermittently is that
`load_dataset(...)` may try several options, such as the local cache,
the Hugging Face Hub, or a dataset script, and one of those options
works inconsistently in the CI.
The HuggingFace datasets library relies on the `transformers` package to
load certain datasets such as `llamastack/simpleqa`, and by adding the
package, we can see the dataset is loaded consistently via the Hugging
Face Hub.
Please see PR in my fork demonstrating over 7 consecutive passes:
https://github.com/ChristianZaccaria/llama-stack/pull/1
**Some References:**
- https://github.com/huggingface/transformers/issues/8690
- https://huggingface.co/docs/datasets/en/loading
[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])
## Test Plan
[Describe the tests you ran to verify your changes with result
summaries. *Provide clear instructions so the plan can be easily
re-executed.*]
[//]: # (## Documentation)
Add fixtures for SqliteKVStore, DiskDistributionRegistry and
CachedDiskDistributionRegistry. And use them in tests that had all been
duplicating similar setups.
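The pattern is the usual pytest one: build the store once in a fixture and inject it into each test. A generic, self-contained illustration using only sqlite3 and pytest (not the actual llama-stack fixtures):
```python
import sqlite3

import pytest


@pytest.fixture
def kvstore(tmp_path):
    # One throwaway SQLite-backed store per test instead of repeating this
    # setup in every test module.
    conn = sqlite3.connect(tmp_path / "kvstore.db")
    conn.execute("CREATE TABLE kv (key TEXT PRIMARY KEY, value TEXT)")
    yield conn
    conn.close()


def test_roundtrip(kvstore):
    kvstore.execute("INSERT INTO kv VALUES (?, ?)", ("model:foo", "registered"))
    (value,) = kvstore.execute(
        "SELECT value FROM kv WHERE key = ?", ("model:foo",)
    ).fetchone()
    assert value == "registered"
```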
## Test Plan
unit tests continue to run
Signed-off-by: Derek Higgins <derekh@redhat.com>
# What does this PR do?
We've disabled it for a while given that this hasn't worked as well as
expected given the frequent changes of llama_stack_client and how this
requires both repos to be in sync.
## Test Plan
Co-authored-by: Ashwin Bharambe <ashwin.bharambe@gmail.com>
# What does this PR do?
- Clarified best practices for using `# noqa` and `# type: ignore`,
requiring justification comments
- Improved formatting for readability
Signed-off-by: Sébastien Han <seb@redhat.com>
# What does this PR do?
Several fixes to ensure the script runs properly on macOS & Podman:
- Automates Podman VM startup on macOS
- Fixes host-gateway handling
- Adds explicit ARM64 platform overrides (this also fixes the platform
warning on Docker)
- Switches health checks to in-container exec calls to avoid Podman
timeouts
- Minor formatting nits
# (Closes#2064 )
## Test Plan
- Manual testing on macOS and Podman
# What does this PR do?
When converting OpenAI message content for the "system" and "assistant"
roles to Llama Stack inference APIs (used for some providers when
dealing with Llama models via OpenAI API requests to get proper prompt /
tool handling), we were not properly converting any non-string content.
I discovered this while running the new Responses API verification suite
against the Fireworks provider, but instead of fixing it as part of the
ongoing work there, I split it out into a separate PR.
This fixes that, by using the `openai_content_to_content` helper we used
elsewhere to ensure content parts were mapped properly.
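Conceptually, the fix routes every role through the same content conversion instead of assuming a plain string; a simplified sketch (the part shapes follow OpenAI text content parts, and the helper here is illustrative):
```python
# Sketch: OpenAI message content may be a string or a list of content parts.
# Previously only the plain-string case was handled for system/assistant roles.
def content_to_text(content) -> str:
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    # list of parts, e.g. [{"type": "text", "text": "..."}]
    return "".join(
        part.get("text", "") for part in content if part.get("type") == "text"
    )


parts = [{"type": "text", "text": "You are a "}, {"type": "text", "text": "helpful assistant."}]
print(content_to_text(parts))  # "You are a helpful assistant."
```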
## Test Plan
I added a couple of new tests to `test_openai_compat` to reproduce this
issue and validate its fix. I ran those as below:
```
python -m pytest -s -v tests/unit/providers/utils/inference/test_openai_compat.py
```
Signed-off-by: Ben Browning <bbrownin@redhat.com>
# What does this PR do?
The builtin implementation of code interpreter is not robust and has a
really weak sandboxing shell (the `bubblewrap` container). Given the
availability of better MCP code interpreter servers coming up, we should
use them instead of baking an implementation into the Stack and
expanding the vulnerability surface to the rest of the Stack.
This PR only does the removal. We will add examples with how to
integrate with MCPs in subsequent ones.
## Test Plan
Existing tests.
# What does this PR do?
The goal of this PR is code base modernization.
Schema reflection code needed a minor adjustment to handle UnionTypes
and collections.abc.AsyncIterator. (Both are preferred for latest Python
releases.)
Note to reviewers: almost all changes here are automatically generated
by pyupgrade. Some additional unused imports were cleaned up. The only
change worth noting can be found under `docs/openapi_generator` and
`llama_stack/strong_typing/schema.py` where reflection code was updated
to deal with "newer" types.
Signed-off-by: Ihar Hrachyshka <ihar.hrachyshka@gmail.com>
Nullable param type is not supported, e.g. ['string', 'null'], since it
fails type validation.
Tests:
Run inference with:
```
messages:
  - content: You are a helpful assistant that can use tools to get information.
    role: system
  - content: What's the temperature in San Francisco in celsius?
    role: user
tools:
  - function:
      description: Get current temperature for a given location.
      name: get_weather
      parameters:
        additionalProperties: false
        properties:
          location:
            description: "City and country e.g. Bogot\xE1, Colombia"
            type: string
          unit:
            description: "Unit of temperature, default to celsius"
            type: [string, "null"] # <= nullable type
        required:
          - location
        type: object
    type: function
```
Co-authored-by: Eric Huang <erichuang@fb.com>
# What does this PR do?
Add support for the temperature to the responses API
## Test Plan
Manually tested simple case
unit tests added for simple case and tool calls
Signed-off-by: Derek Higgins <derekh@redhat.com>
All merges produced by github are pushes to main, which makes the check
fail. The check is local by design, not meant for CI.
Signed-off-by: Ihar Hrachyshka <ihar.hrachyshka@gmail.com>
# What does this PR do?
When the result of a ToolCall gets passed back into vLLM for the model
to handle the tool call result (as is often the case in agentic
tool-calling workflows), we forgot to handle the case where BuiltinTool
calls are not string values but instead instances of the BuiltinTool
enum. This fixes that, properly converting those enums to string values
before trying to serialize them into an OpenAI chat completion request
to vLLM.
PR #1931 fixed a bug where we weren't passing these tool calling results
back into vLLM, but as a side-effect it created this serialization bug
when using BuiltinTools.
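The conversion itself is small; a sketch of the idea (the enum below is a stand-in for the real `BuiltinTool`):
```python
from enum import Enum


class BuiltinTool(Enum):  # stand-in for the real enum
    brave_search = "brave_search"
    wolfram_alpha = "wolfram_alpha"


def tool_name_as_str(tool_name) -> str:
    # ToolCall.tool_name may be a plain string or a BuiltinTool member;
    # serialize both the same way before building the request for vLLM.
    return tool_name.value if isinstance(tool_name, Enum) else tool_name


print(tool_name_as_str(BuiltinTool.brave_search))  # "brave_search"
print(tool_name_as_str("get_weather"))             # "get_weather"
```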
Closes#2070
## Test Plan
I added a new unit test to the openai_compat unit tests to cover this
scenario, ensured the new test failed before this fix, and all the
existing tests there plus the new one passed with this fix.
```
python -m pytest -s -v tests/unit/providers/utils/inference/test_openai_compat.py
```
Signed-off-by: Ben Browning <bbrownin@redhat.com>
# What does this PR do?
Add several new pre-commit hooks to improve code quality and security:
- no-commit-to-branch: prevent direct commits to protected branches like
`main`
- check-yaml: validate YAML files
- detect-private-key: prevent accidental commit of private keys
- requirements-txt-fixer: maintain consistent requirements.txt format
and sorting
- mixed-line-ending: enforce LF line endings to avoid mixed line endings
- check-executables-have-shebangs: ensure executable scripts have
shebangs
- check-json: validate JSON files
- check-shebang-scripts-are-executable: verify shebang scripts are
executable
- check-symlinks: validate symlinks and report broken ones
- check-toml: validate TOML files mainly for pyproject.toml
The respective fixes have been included.
Signed-off-by: Sébastien Han <seb@redhat.com>
# What does this PR do?
Partial revert of fa68ded07c.
This commit ensures users know where their new templates are generated
and how to run the newly built distro locally.
Discussion on Discord:
1351652390
## Test Plan
Did a local run - let me know if we want any unit testing covering this

## Documentation
Updated "Zero to Hero" guide with new output
---------
Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
# What does this PR do?
- Added new Ruff lint rules to detect ambiguous or non-ASCII characters:
- Added per-file ignores where Unicode usage is still required.
- Fixed whatever had to be fixed
Signed-off-by: Sébastien Han <seb@redhat.com>
# What does this PR do?
When running a Llama Stack server and invoking the
`/v1/safety/run-shield` endpoint, the NVIDIA Guardrails endpoint in some
cases errors with a `422: Unprocessable Entity` due to malformed input.
For example, given a request body like:
```
{
"model": "test",
"messages": [
{ "role": "user", "content": "You are stupid." }
]
}
```
`convert_pydantic_to_json_value` converts the message to:
```
{ "role": "user", "content": "You are stupid.", "context": null }
```
Which causes NVIDIA Guardrails to return an error `HTTPError: 422 Client
Error: Unprocessable Entity for url:
http://nemo.test/v1/guardrail/checks`, because `context` shouldn't be
included in the body.
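One way to avoid the 422, sketched here with plain dicts rather than the provider's actual conversion code, is to drop `None`-valued fields after converting the Pydantic message:
```python
# Sketch: drop keys whose value is None so optional fields like "context"
# never reach the Guardrails request body.
def strip_none_fields(message: dict) -> dict:
    return {key: value for key, value in message.items() if value is not None}


converted = {"role": "user", "content": "You are stupid.", "context": None}
print(strip_none_fields(converted))  # {'role': 'user', 'content': 'You are stupid.'}
```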
[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])
## Test Plan
I ran the Llama Stack server locally and manually verified that the
endpoint now succeeds.
```
message = {"role": "user", "content": "You are stupid."}
response = client.safety.run_shield(messages=[message], shield_id=shield_id, params={})
```
Server logs:
```
14:29:09.656 [START] /v1/safety/run-shield
INFO: 127.0.0.1:54616 - "POST /v1/safety/run-shield HTTP/1.1" 200 OK
14:29:09.918 [END] /v1/safety/run-shield [StatusCode.OK] (262.26ms
```
[//]: # (## Documentation)
Co-authored-by: Jash Gulabrai <jgulabrai@nvidia.com>
# What does this PR do?
Replaced `${env.OTEL_SERVICE_NAME:\u200B}` and similar variants with
properly formatted `${env.OTEL_SERVICE_NAME:}` across all YAML templates
and TelemetryConfig. This prevents silent parsing issues and ensures
consistent environment variable resolution.
Slipped in https://github.com/meta-llama/llama-stack/pull/2058
Signed-off-by: Sébastien Han <seb@redhat.com>
# What does this PR do?
* pull the embedding model so that it's not pulled during the distro
server startup sequence
* cache the models
* collect logs at the end of the workflow
Signed-off-by: Sébastien Han <seb@redhat.com>
Distribution Template Codegen was broken
# What does this PR do?
[Provide a short summary of what this PR does and why. Link to relevant
issues if applicable.]
[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])
## Test Plan
[Describe the tests you ran to verify your changes with result
summaries. *Provide clear instructions so the plan can be easily
re-executed.*]
[//]: # (## Documentation)
Signed-off-by: Derek Higgins <derekh@redhat.com>
# What does this PR do?
* new workflow job **build-ubi9-container-distribution**
* runs on the default `ubuntu-latest` runner
* uses the existing `dev` template
* invokes `uv run llama stack build` with `.container_base =
"registry.access.redhat.com/ubi9/ubi-minimal:latest"`
* inspects the resulting image to verify its entrypoint
# (Closes#1994)
## Test Plan
- CI now includes the `build-ubi9-container-distribution` job and will
turn green when that job passes on changes to build files
# What does this PR do?
The telemetry provider config is the only one that leverages the env var
`SQLITE_DB_PATH` for pointing to persistent data in the respective
templates, whereas usually `SQLITE_STORE_DIR` is used.
This PR modifies the `sqlite_db_path` in various telemetry configuration
files to use the environment variable `SQLITE_STORE_DIR` instead of
`SQLITE_DB_PATH`. This change ensures that _only_ the SQLITE_STORE_DIR
needs to be set to point to a different persistence location for
providers.
All references to `SQLITE_DB_PATH` have been removed.
Another improvement could be to move `sqlite_db_path` to `db_path` in
the telemetry provider config, to align with the other provider
configurations. That could be done by another PR (if wanted).
# What does this PR do?
This builds on top of
https://github.com/meta-llama/llama-stack/pull/2037 to include some
additional changes to fix integration tests builds.
---------
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
# What does this PR do?
In our OpenAI API verification tests, ollama was still calling tools
even when `tool_choice="none"` was passed in its chat completion
requests. Because ollama isn't respecting `tool_choice` properly, this
adjusts our provider implementation to remove the `tools` from the
request if `tool_choice="none"` is passed in so that it does not attempt
to call any of those tools.
## Test Plan
I tested this with a couple of Llama models, using both our OpenAI
completions integration tests and our verification test suites.
### OpenAI Completions / Chat Completions integration tests
These all passed before, and still do.
```
INFERENCE_MODEL="llama3.2:3b-instruct-fp16" \
llama stack build --template ollama --image-type venv --run
```
```
LLAMA_STACK_CONFIG=http://localhost:8321 \
python -m pytest -v \
tests/integration/inference/test_openai_completion.py \
--text-model "llama3.2:3b-instruct-fp16"
```
### OpenAI API Verification test suite
test_chat_*_tool_choice_none OpenAI API verification tests pass now,
when they failed before.
See
https://github.com/bbrowning/llama-stack-tests/blob/main/openai-api-verification/2025-04-27.md#ollama-llama-stack
for an example of these failures from a recent nightly CI run.
```
INFERENCE_MODEL="llama3.3:70b-instruct-q3_K_M" \
llama stack build --template ollama --image-type venv --run
```
```
cat <<-EOF > tests/verifications/conf/ollama-llama-stack.yaml
base_url: http://localhost:8321/v1/openai/v1
api_key_var: OPENAI_API_KEY
models:
- llama3.3:70b-instruct-q3_K_M
model_display_names:
llama3.3:70b-instruct-q3_K_M: Llama-3.3-70B-Instruct
test_exclusions:
llama3.3:70b-instruct-q3_K_M:
- test_chat_non_streaming_image
- test_chat_streaming_image
- test_chat_multi_turn_multiple_images
EOF
```
```
python -m pytest -s -v \
'tests/verifications/openai_api/test_chat_completion.py' \
--provider=ollama-llama-stack
```
Signed-off-by: Ben Browning <bbrownin@redhat.com>
# What does this PR do?
This PR updates how the `AgentType` gets set using the radio button on
the tools page of the playground. This change is needed because, with
the current implementation, the chat interface resets after every input,
preventing users from having a multi-turn conversation with the agent.
## Test Plan
Run the Playground without these changes:
```bash
streamlit run llama_stack/distribution/ui/app.py
```
Navigate to the tools page and attempt to have a multi-turn
conversation. You should see the conversation reset after asking a
second question.
Repeat the steps above with these changes and you will see that it works
as expected when asking the agent multiple questions.
Signed-off-by: Michael Clifford <mcliffor@redhat.com>
# What does this PR do?
This provides an initial [OpenAI Responses
API](https://platform.openai.com/docs/api-reference/responses)
implementation. The API is not yet complete, and this is more a
proof-of-concept to show how we can store responses in our key-value
stores and use them to support the Responses API concepts like
`previous_response_id`.
## Test Plan
I've added a new
`tests/integration/openai_responses/test_openai_responses.py` as part of
a test-driven development for this new API. I'm only testing this
locally with the remote-vllm provider for now, but it should work with
any of our inference providers since the only API it requires out of the
inference provider is the `openai_chat_completion` endpoint.
```
VLLM_URL="http://localhost:8000/v1" \
INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" \
llama stack build --template remote-vllm --image-type venv --run
```
```
LLAMA_STACK_CONFIG="http://localhost:8321" \
python -m pytest -v \
tests/integration/openai_responses/test_openai_responses.py \
--text-model "meta-llama/Llama-3.2-3B-Instruct"
```
---------
Signed-off-by: Ben Browning <bbrownin@redhat.com>
Co-authored-by: Ashwin Bharambe <ashwin.bharambe@gmail.com>
# What does this PR do?
This commit adds a new authentication system to the Llama Stack server
with support for Kubernetes and custom authentication providers. Key
changes include:
- Implemented KubernetesAuthProvider for validating Kubernetes service
account tokens
- Implemented CustomAuthProvider for validating tokens against external
endpoints - this is the same code that was already present.
- Added test for Kubernetes
- Updated server configuration to support authentication settings
- Added documentation for authentication configuration and usage
The authentication system supports:
- Bearer token validation
- Kubernetes service account token validation
- Custom authentication endpoints
## Test Plan
Setup a Kube cluster using Kind or Minikube.
Run a server with:
```
server:
port: 8321
auth:
provider_type: kubernetes
config:
api_server_url: http://url
ca_cert_path: path/to/cert (optional)
```
Run:
```
curl -s -L -H "Authorization: Bearer $(kubectl create token my-user)" http://127.0.0.1:8321/v1/providers
```
Or replace "my-user" with your service account.
Signed-off-by: Sébastien Han <seb@redhat.com>
# What does this PR do?
Implementation of the NeMo Datastore register and unregister APIs.
Open Issues:
- provider_id gets set to `localfs` in client.datasets.register() as it
is specified in routing_tables.py: DatasetsRoutingTable (see #1860).
Currently I have passed `"provider_id": "nvidia"` in metadata and parsed
that in `DatasetsRoutingTable`.
(Not the best approach, but just a quick workaround to make it work for
now.)
## Test Plan
- Unit test cases: `pytest
tests/unit/providers/nvidia/test_datastore.py`
```bash
========================================================== test session starts ===========================================================
platform linux -- Python 3.10.0, pytest-8.3.5, pluggy-1.5.0
rootdir: /home/ubuntu/llama-stack
configfile: pyproject.toml
plugins: anyio-4.9.0, asyncio-0.26.0, nbval-0.11.0, metadata-3.1.1, html-4.1.1, cov-6.1.0
asyncio: mode=strict, asyncio_default_fixture_loop_scope=None, asyncio_default_test_loop_scope=function
collected 2 items
tests/unit/providers/nvidia/test_datastore.py .. [100%]
============================================================ warnings summary ============================================================
====================================================== 2 passed, 1 warning in 0.84s ======================================================
```
cc: @dglogo, @mattf, @yanxi0830
# What does this PR do?
Add installation script for Llama Stack Meta Reference distro (Docker
only).
# Closes#1374
## Test Plan
./install.sh
---------
Co-authored-by: Sébastien Han <seb@redhat.com>
This resolves a new critical severity on h11. See
https://access.redhat.com/security/cve/cve-2025-43859. We should
consider releasing a new patch with this fix.
This was updated via:
```
uv add "h11>=0.16.0"
uv export --frozen --no-hashes --no-emit-project --output-file=requirements.txt
```
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
# What does this PR do?
There are new changes in the repo that require adding some additional
functions to the inference provider, which this fixes. We also need one
additional param to pass some extra arguments to watsonx.ai.
[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])
## Test Plan
[Describe the tests you ran to verify your changes with result
summaries. *Provide clear instructions so the plan can be easily
re-executed.*]
[//]: # (## Documentation)
---------
Co-authored-by: Sajikumar JS <sajikumar.js@ibm.com>
# What does this PR do?
This addresses 2 bugs I ran into when launching a fine-tuning job with
the NVIDIA Adapter:
1. Session handling in `_make_request` helper function returns an error.
```
INFO: 127.0.0.1:55831 - "POST /v1/post-training/supervised-fine-tune HTTP/1.1" 500 Internal Server Error
16:11:45.643 [END] /v1/post-training/supervised-fine-tune [StatusCode.OK] (270.44ms)
16:11:45.643 [ERROR] Error executing endpoint route='/v1/post-training/supervised-fine-tune' method='post'
Traceback (most recent call last):
File "/Users/jgulabrai/Projects/forks/llama-stack/llama_stack/distribution/server/server.py", line 201, in endpoint
return await maybe_await(value)
File "/Users/jgulabrai/Projects/forks/llama-stack/llama_stack/distribution/server/server.py", line 161, in maybe_await
return await value
File "/Users/jgulabrai/Projects/forks/llama-stack/llama_stack/providers/remote/post_training/nvidia/post_training.py", line 408, in supervised_fine_tune
response = await self._make_request(
File "/Users/jgulabrai/Projects/forks/llama-stack/llama_stack/providers/remote/post_training/nvidia/post_training.py", line 98, in _make_request
async with self.session.request(method, url, params=params, json=json, **kwargs) as response:
File "/Users/jgulabrai/Projects/forks/llama-stack/.venv/lib/python3.10/site-packages/aiohttp/client.py", line 1425, in __aenter__
self._resp: _RetType = await self._coro
File "/Users/jgulabrai/Projects/forks/llama-stack/.venv/lib/python3.10/site-packages/aiohttp/client.py", line 579, in _request
handle = tm.start()
File "/Users/jgulabrai/Projects/forks/llama-stack/.venv/lib/python3.10/site-packages/aiohttp/helpers.py", line 587, in start
return self._loop.call_at(when, self.__call__)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/base_events.py", line 724, in call_at
self._check_closed()
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/base_events.py", line 510, in _check_closed
raise RuntimeError('Event loop is closed')
RuntimeError: Event loop is closed
```
Note: This only occurred when initializing the client like so:
```
client = LlamaStackClient(
base_url="http://0.0.0.0:8321"
)
response = client.post_training.supervised_fine_tune(...) # Returns error
```
I didn't run into this issue when using the library client:
```
client = LlamaStackAsLibraryClient("nvidia")
client.initialize()
response = client.post_training.supervised_fine_tune(...) # Works fine
```
2. The `algorithm_config` param in `supervised_fine_tune` is parsed as a
`dict` when run from unit tests, but a Pydantic model when invoked using
the Llama Stack client. So, the call fails outside of unit tests:
```
INFO: 127.0.0.1:54024 - "POST /v1/post-training/supervised-fine-tune HTTP/1.1" 500 Internal Server Error
21:14:02.315 [END] /v1/post-training/supervised-fine-tune [StatusCode.OK] (71.18ms)
21:14:02.314 [ERROR] Error executing endpoint route='/v1/post-training/supervised-fine-tune' method='post'
Traceback (most recent call last):
File "/Users/jgulabrai/Projects/forks/llama-stack/llama_stack/distribution/server/server.py", line 205, in endpoint
return await maybe_await(value)
File "/Users/jgulabrai/Projects/forks/llama-stack/llama_stack/distribution/server/server.py", line 164, in maybe_await
return await value
File "/Users/jgulabrai/Projects/forks/llama-stack/llama_stack/providers/remote/post_training/nvidia/post_training.py", line 407, in supervised_fine_tune
"adapter_dim": algorithm_config.get("adapter_dim"),
File "/Users/jgulabrai/Projects/forks/llama-stack/.venv/lib/python3.10/site-packages/pydantic/main.py", line 891, in __getattr__
raise AttributeError(f'{type(self).__name__!r} object has no attribute {item!r}')
AttributeError: 'LoraFinetuningConfig' object has no attribute 'get'
```
The code assumes `algorithm_config` should be `dict`, so I just handle
both cases.
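Handling both shapes can be as simple as normalizing to a dict first; a sketch (the config class here is a stand-in named after the one in the traceback, not the real implementation):
```python
from pydantic import BaseModel


class LoraFinetuningConfig(BaseModel):  # stand-in named after the class in the traceback
    adapter_dim: int = 8


def as_dict(algorithm_config) -> dict:
    # Unit tests pass a plain dict; the Llama Stack client passes a Pydantic model.
    if isinstance(algorithm_config, BaseModel):
        return algorithm_config.model_dump()
    return algorithm_config


print(as_dict({"adapter_dim": 16})["adapter_dim"])                   # 16
print(as_dict(LoraFinetuningConfig(adapter_dim=32))["adapter_dim"])  # 32
```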
[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])
## Test Plan
1. I ran a local Llama Stack server with the necessary env vars:
```
llama stack run llama_stack/templates/nvidia/run.yaml --port 8321 --env ...
```
And invoked `supervised_fine_tune` to confirm neither of the errors
above occur.
```
client = LlamaStackClient(
base_url="http://0.0.0.0:8321"
)
response = client.post_training.supervised_fine_tune(...)
```
2. I confirmed the unit tests still pass: `./scripts/unit-tests.sh
tests/unit/providers/nvidia/test_supervised_fine_tuning.py`
[//]: # (## Documentation)
---------
Co-authored-by: Jash Gulabrai <jgulabrai@nvidia.com>
# What does this PR do?
## Test Plan
LLAMA_STACK_CONFIG=http://localhost:5002 pytest -s -v
tests/integration/inference --safety-shield meta-llama/Llama-Guard-3-8B
--vision-model meta-llama/Llama-4-Scout-17B-16E-Instruct --text-model
meta-llama/Llama-4-Scout-17B-16E-Instruct
adding the --gpu all flag to Docker run commands
for meta-reference-gpu distributions ensures models are loaded into GPU
instead of CPU.
Remove docs for meta-reference-quantized-gpu
The distribution was removed in #1887
but these files were left behind.
Fixes: #1798
# What does this PR do?
Fixes doc to add --gpu all command to docker run
[//]: # (If resolving an issue, uncomment and update the line below)
Closes#1798
## Test Plan
[Describe the tests you ran to verify your changes with result
summaries. *Provide clear instructions so the plan can be easily
re-executed.*]
verified in docker documentation but untested
---------
Signed-off-by: Derek Higgins <derekh@redhat.com>
# What does this PR do?
Introduce a `.coveragerc` file to omit:
- test files (*/tests/*)
- provider code (*/llama_stack/providers/*)
- template files (*/llama_stack/templates/*)
- virtual environment (.venv/*)
This ensures coverage reports focus on core application logic (API and
CLI).
Note: I'm opening this for discussion as well - we might decide to
ignore more and/or re-add some directories!
Signed-off-by: Sébastien Han <seb@redhat.com>
# What does this PR do?
IBM watsonx.ai added as an inference provider
([#1741](https://github.com/meta-llama/llama-stack/issues/1741)).
[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])
---------
Co-authored-by: Sajikumar JS <sajikumar.js@ibm.com>
# What does this PR do?
Enhances the user experience in the `llama stack build` command by
adding interactive TAB completion for image type selection. This ensures
the UX consistency with other parts of the CLI that already support tab
completion, such as provider selection, providing a more intuitive and
discoverable interface for users.
<img width="1531" alt="image"
src="https://github.com/user-attachments/assets/12161d45-451d-4820-b34d-7ea4decf810f"
/>
# What does this PR do?
This PR improves the Tools page in the LlamaStack Playground UI by
enhancing the readability of the active tool list shown in the sidebar.
- Previously, active tools were displayed in a flat JSON array with
verbose identifiers (e.g., builtin::code_interpreter:code_interpreter).
- This PR updates the logic to group tools by their toolgroup (e.g.,
builtin::websearch) and renders each tool name in a simplified,
human-readable format (e.g., web_search).
- This change improves usability when working with multiple toolgroups,
especially in configurations involving MCP tools or complex tool
identifiers.
Before and After Comparison:
**Before**

**After**

[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])
## Test Plan
- Followed the [LlamaStack UI Developer Setup
instructions](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/distribution/ui)
- Ran the Streamlit UI via: `uv run --with "[.ui]" streamlit run
llama_stack/distribution/ui/app.py`
- Selected multiple built-in toolgroups (e.g., code_interpreter,
websearch, wolfram_alpha) from the sidebar.
[//]: # (## Documentation)
# What does this PR do?
Adding the nbformat version fixes this issue. Not sure exactly why this
needs to be done, but this version field was rewritten to the bottom of
the notebook file when I changed its name while trying to get to the
bottom of this. When I opened it on GitHub the issue was no longer present.
Closes#1837
## Test Plan
N/A
# What does this PR do?
Adds custom model registration functionality to NVIDIAInferenceAdapter,
which lets inference happen on:
- post-training models
- non-Llama models in the API Catalogue (behind
https://integrate.api.nvidia.com and endpoints compatible with
AsyncOpenAI)
## Example Usage:
```python
from llama_stack.apis.models import Model, ModelType
from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
client = LlamaStackAsLibraryClient("nvidia")
_ = client.initialize()
client.models.register(
model_id=model_name,
model_type=ModelType.llm,
provider_id="nvidia"
)
response = client.inference.chat_completion(
model_id=model_name,
messages=[{"role":"system","content":"You are a helpful assistant."},{"role":"user","content":"Write a limerick about the wonders of GPU computing."}],
)
```
## Test Plan
```bash
pytest tests/unit/providers/nvidia/test_supervised_fine_tuning.py
========================================================== test session starts ===========================================================
platform linux -- Python 3.10.0, pytest-8.3.5, pluggy-1.5.0
rootdir: /home/ubuntu/llama-stack
configfile: pyproject.toml
plugins: anyio-4.9.0
collected 6 items
tests/unit/providers/nvidia/test_supervised_fine_tuning.py ...... [100%]
============================================================ warnings summary ============================================================
../miniconda/envs/nvidia-1/lib/python3.10/site-packages/pydantic/fields.py:1076
/home/ubuntu/miniconda/envs/nvidia-1/lib/python3.10/site-packages/pydantic/fields.py:1076: PydanticDeprecatedSince20: Using extra keyword arguments on `Field` is deprecated and will be removed. Use `json_schema_extra` instead. (Extra keys: 'contentEncoding'). Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
warn(
-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
====================================================== 6 passed, 1 warning in 1.51s ======================================================
```
[//]: # (## Documentation)
Updated Readme.md
cc: @dglogo, @sumitb, @mattf
# What does this PR do?
This PR adds support for NVIDIA's NeMo Evaluator API to the Llama Stack
eval module. The integration enables users to evaluate models via the
Llama Stack interface.
## Test Plan
[Describe the tests you ran to verify your changes with result
summaries. *Provide clear instructions so the plan can be easily
re-executed.*]
1. Added unit tests and successfully ran from root of project:
`./scripts/unit-tests.sh tests/unit/providers/nvidia/test_eval.py`
```
tests/unit/providers/nvidia/test_eval.py::TestNVIDIAEvalImpl::test_job_cancel PASSED
tests/unit/providers/nvidia/test_eval.py::TestNVIDIAEvalImpl::test_job_result PASSED
tests/unit/providers/nvidia/test_eval.py::TestNVIDIAEvalImpl::test_job_status PASSED
tests/unit/providers/nvidia/test_eval.py::TestNVIDIAEvalImpl::test_register_benchmark PASSED
tests/unit/providers/nvidia/test_eval.py::TestNVIDIAEvalImpl::test_run_eval PASSED
```
2. Verified I could build the Llama Stack image: `LLAMA_STACK_DIR=$(pwd)
llama stack build --template nvidia --image-type venv`
Documentation added to
`llama_stack/providers/remote/eval/nvidia/README.md`
---------
Co-authored-by: Jash Gulabrai <jgulabrai@nvidia.com>
# What does this PR do?
This expands the `test_sse` test suite and fixes some edge cases with
bugs in our SSE error handling to ensure streaming clients always get a
proper error response.
First, we handle the case where a client disconnects before we actually
start streaming the response back. Previously we only handled the case
where a client disconnected as we were streaming the response, but there
was an edge case where a client disconnecting before we streamed any
response back did not trigger our logic to cleanly handle that
disconnect.
Second, we handle the case where an error is thrown from the server
before the actual async generator gets created from the provider. This
happens in scenarios like the newly merged OpenAI API input validation,
where we eagerly raise validation errors before returning the async
generator object that streams the responses back.
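In outline, the provider's generator has to be created inside the same try/except that handles streaming, so errors raised before any chunk is produced still become a proper SSE error event; a simplified, framework-free sketch (not the actual server code):
```python
import asyncio
import json


async def sse_generator(create_stream):
    try:
        # Creating the provider stream can itself raise (e.g. input validation),
        # so it must happen inside the try block, not only the iteration.
        stream = await create_stream()
        async for chunk in stream:
            yield f"data: {json.dumps(chunk)}\n\n"
    except asyncio.CancelledError:
        # Client disconnected, possibly before any chunk was sent; just stop.
        return
    except Exception as exc:  # broad on purpose: every failure becomes an SSE error event
        yield f"data: {json.dumps({'error': {'message': str(exc)}})}\n\n"


async def main():
    async def failing_stream():
        raise ValueError("invalid tool_choice")

    async for event in sse_generator(failing_stream):
        print(event, end="")


asyncio.run(main())
```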
## Test Plan
Tested via:
```
python -m pytest -s -v tests/unit/server/test_sse.py
```
Both test cases failed before, and passed afterwards. The test cases
were written based on me experimenting with actual clients that would do
bad things like randomly disconnect or send invalid input in streaming
mode and I hit these two cases, where things were misbehaving in our
error handling.
Signed-off-by: Ben Browning <bbrownin@redhat.com>
Include the tool call details with the chat when doing Rag with Remote
vllm
Fixes: #1929
With this PR the tool call is included in the chat returned to vLLM, and
the model (meta-llama/Llama-3.1-8B-Instruct) then returns the answer as
expected.
Signed-off-by: Derek Higgins <derekh@redhat.com>
# What does this PR do?
Remove `distributions/**` from integration, external provider, and unit
tests
[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])
## Test Plan
N/A
[//]: # (## Documentation)
Signed-off-by: Francisco Javier Arceo <farceo@redhat.com>
# What does this PR do?
Update External Providers CI to not run on changes to docs, rfcs, and
scripts
[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])
## Test Plan
[Describe the tests you ran to verify your changes with result
summaries. *Provide clear instructions so the plan can be easily
re-executed.*]
[//]: # (## Documentation)
---------
Signed-off-by: Francisco Javier Arceo <farceo@redhat.com>
# What does this PR do?
This PR addresses the content dominance problem that frequently arises
with multiple models when executing queries with the RAG tool. When the
retrieved content is too large, it disproportionately influences the
generation process, causing the model to ignore the original question
and to provide meaningless comments on the retrieved information
instead.
This situation is especially common with agentic RAG, which is the
standard way of doing RAG in Llama Stack, since directly manipulating
the prompt combining the query with the retrieved content is not
possible.
This PR appends a grounding message to the results returned by the
knowledge search tool, reminding the model about the original query and
the purpose of the inference call. This makes the problem significantly
less likely to occur.
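Mechanically, the change just appends one extra text item to the tool's results; a sketch of the idea (the exact grounding wording used by the tool is not reproduced here):
```python
# Sketch: after assembling the retrieved chunks, remind the model of the
# original query so the retrieved text does not dominate the generation.
def with_grounding(results: list[str], query: str) -> list[str]:
    grounding = (
        "The above results were retrieved to help answer the user's query: "
        f"{query!r}. Use them to answer that query; do not merely summarize them."
    )
    return [*results, grounding]


chunks = ["OpenShift can be installed with the Assisted Installer...", "..."]
for item in with_grounding(chunks, "How to install OpenShift?"):
    print(item)
```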
## Test Plan
Running the following script before the fix demonstrates the content
dominance problem, where the model insists on commenting on the retrieved
content and refuses to address the question.
Running the script after the fix results in the correct answer.
```
import os
import uuid
from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient
# the server endpoint
LLAMA_STACK_SERVER_URL = "http://localhost:8321"
# inference settings
MODEL_ID = ""meta-llama/Llama-3.1-8B-Instruct"
SYSTEM_PROMPT = "You are a helpful assistant. "
# RAG settings
VECTOR_DB_EMBEDDING_MODEL = "all-MiniLM-L6-v2"
VECTOR_DB_EMBEDDING_DIMENSION = 384
VECTOR_DB_CHUNK_SIZE = 512
# initialize the server connection
client = LlamaStackClient(base_url=os.environ.get("LLAMA_STACK_ENDPOINT", LLAMA_STACK_SERVER_URL))
# init the RAG retrieval parameters
vector_db_id = f"test_vector_db_{uuid.uuid4()}"
vector_providers = [
provider for provider in client.providers.list() if provider.api == "vector_io"
]
vector_provider_to_use = vector_providers[0]
# define and register the document collection to be used
client.vector_dbs.register(
vector_db_id=vector_db_id,
embedding_model=VECTOR_DB_EMBEDDING_MODEL,
embedding_dimension=VECTOR_DB_EMBEDDING_DIMENSION,
provider_id=vector_provider_to_use.provider_id,
)
# ingest the documents into the newly created document collection
urls = [
("https://www.openshift.guide/openshift-guide-screen.pdf", "application/pdf"),
]
documents = [
RAGDocument(
document_id=f"num-{i}",
content=url,
mime_type=url_type,
metadata={},
)
for i, (url, url_type) in enumerate(urls)
]
client.tool_runtime.rag_tool.insert(
documents=documents,
vector_db_id=vector_db_id,
chunk_size_in_tokens=VECTOR_DB_CHUNK_SIZE,
)
queries = [
"How to install OpenShift?",
]
# initializing the agent
agent = Agent(
client,
model=MODEL_ID,
instructions=SYSTEM_PROMPT,
# we make our agent aware of the RAG tool by including builtin::rag/knowledge_search in the list of tools
tools=[
dict(
name="builtin::rag/knowledge_search",
args={
"vector_db_ids": [vector_db_id], # list of IDs of document collections to consider during retrieval
},
)
],
)
for prompt in queries:
print(f"User> {prompt}")
# create a new turn with a new session ID for each prompt
response = agent.create_turn(
messages=[
{
"role": "user",
"content": prompt,
}
],
session_id=agent.create_session(f"rag-session_{uuid.uuid4()}")
)
# print the response, including tool calls output
for log in AgentEventLogger().log(response):
print(log.content, end='')
```
As part of the build process, we now include the generated run.yaml
(based of the provided build configuration file) into the container. We
updated the entrypoint to use this run configuration as well.
Given this simple distribution configuration:
```
# build.yaml
version: '2'
distribution_spec:
description: Use (an external) Ollama server for running LLM inference
providers:
inference:
- remote::ollama
vector_io:
- inline::faiss
safety:
- inline::llama-guard
agents:
- inline::meta-reference
telemetry:
- inline::meta-reference
eval:
- inline::meta-reference
datasetio:
- remote::huggingface
- inline::localfs
scoring:
- inline::basic
- inline::llm-as-judge
- inline::braintrust
tool_runtime:
- remote::brave-search
- remote::tavily-search
- inline::code-interpreter
- inline::rag-runtime
- remote::model-context-protocol
- remote::wolfram-alpha
container_image: "registry.access.redhat.com/ubi9"
image_type: container
image_name: test
```
Build it:
```
llama stack build --config build.yaml
```
Run it:
```
podman run --rm \
-p 8321:8321 \
-e OLLAMA_URL=http://host.containers.internal:11434 \
--name llama-stack-server \
localhost/leseb-test:0.2.2
```
Signed-off-by: Sébastien Han <seb@redhat.com>
# What does this PR do?
the ramalama team has decided to rename their external provider
`ramalama-stack` (more catchy!). Update docs accordingly
Signed-off-by: Charlie Doern <cdoern@redhat.com>
# What does this PR do?
When clients called the OpenAI API with invalid input that wasn't
caught by our own Pydantic API validation but instead only caught by the
backend inference provider, that backend inference provider was
returning a HTTP 400 error. However, we were wrapping that into a HTTP
500 error, obfuscating the actual issue from calling clients and
triggering OpenAI client retry logic.
This change adjusts our existing `translate_exception` method in
`server.py` to wrap `openai.BadRequestError` as HTTP 400 errors, passing
through the string representation of the error message to the calling
user so they can see the actual input validation error and correct it. I
tried changing this in a few other places, but ultimately
`translate_exception` was the only real place to handle this for both
streaming and non-streaming requests across all inference providers that
use the OpenAI server APIs.
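The shape of the change in `translate_exception` is roughly the following sketch, written with stand-in exception classes rather than the real FastAPI/OpenAI types:
```python
# Sketch with stand-ins: map a provider-side bad-request error to HTTP 400
# instead of a generic 500, passing the message through to the caller.
class BadRequestError(Exception):  # stand-in for openai.BadRequestError
    pass


class HTTPError(Exception):
    def __init__(self, status_code: int, detail: str):
        super().__init__(detail)
        self.status_code = status_code
        self.detail = detail


def translate_exception(exc: Exception) -> HTTPError:
    if isinstance(exc, BadRequestError):
        # Surface the provider's validation message so callers can fix their input.
        return HTTPError(status_code=400, detail=str(exc))
    return HTTPError(status_code=500, detail="Internal server error: An unexpected error occurred.")


print(translate_exception(BadRequestError("messages must not be empty")).status_code)  # 400
```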
This also tightens up our validation a bit for the OpenAI chat
completions API, to catch empty `messages` parameters, invalid
`tool_choice` parameters, invalid `tools` items, or passing
`tool_choice` when `tools` isn't given.
Lastly, this extends our OpenAI API chat completions verifications to
also check for consistent input validation across providers. Providers
behind Llama Stack should automatically pass all the new tests due to
the input validation added here, but some of the providers fail this
test when not run behind Llama Stack due to differences in how they
handle input validation and errors.
(Closes#1951)
## Test Plan
To test this, start an OpenAI API verification stack:
```
llama stack run --image-type venv tests/verifications/openai-api-verification-run.yaml
```
Then, run the new verification tests with your provider(s) of choice:
```
python -m pytest -s -v \
tests/verifications/openai_api/test_chat_completion.py \
--provider openai-llama-stack
python -m pytest -s -v \
tests/verifications/openai_api/test_chat_completion.py \
--provider together-llama-stack
```
Signed-off-by: Ben Browning <bbrownin@redhat.com>
# What does this PR do?
This PR adds the name of the tool that is used by the agent on the
"tools" page of the playground. See image below for an example.

## Test Plan
Run the playground and navigate to the tools page. There users can see
that this additional text is present when tools are invoked and absent
when they are not.
```
streamlit run llama_stack/distribution/ui/app.py
```
Signed-off-by: Michael Clifford <mcliffor@redhat.com>
# What does this PR do?
Previously, when a streaming client would disconnect before we were
finished streaming the entire response, an error like the below would
get raised from the `sse_generator` function in
`llama_stack/distribution/server/server.py`:
```
AttributeError: 'coroutine' object has no attribute 'aclose'. Did you mean: 'close'?
```
This was because we were calling `aclose` on a coroutine instead of the
awaited value from that coroutine. This change fixes that, so that we
save off the awaited value and then can call `aclose` on it if we
encounter an `asyncio.CancelledError`, like we see when a client
disconnects before we're finished streaming.
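The core of the fix is to await the coroutine once and keep a handle to the resulting async generator, so `aclose` is called on the right object; a stripped-down sketch (not the actual `sse_generator` implementation):
```python
import asyncio


async def sse_generator(event_gen_coroutine):
    # Await the coroutine first; aclose() exists on the resulting async
    # generator, not on the coroutine object itself.
    event_gen = await event_gen_coroutine
    try:
        async for event in event_gen:
            yield f"data: {event}\n\n"
    except asyncio.CancelledError:
        # Client disconnected mid-stream; close the underlying generator cleanly.
        await event_gen.aclose()
        raise


async def main():
    async def make_events():
        async def events():
            yield "one"
            yield "two"

        return events()

    async for line in sse_generator(make_events()):
        print(line, end="")


asyncio.run(main())
```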
The other changes in here are to add a simple set of tests for the happy
path of our SSE streaming and this client disconnect path.
That unfortunately requires adding one more dependency into our unit
test section of pyproject.toml since `server.py` requires loading some
of the telemetry code for me to test this functionality.
## Test Plan
I wrote the tests in `tests/unit/server/test_sse.py` first, verified the
client disconnected test failed before my change, and that it passed
afterwards.
```
python -m pytest -s -v tests/unit/server/test_sse.py
```
Signed-off-by: Ben Browning <bbrownin@redhat.com>
# What does this PR do?
Add examples for how to define RAGDocuments. Not sure if this is the
best place for these docs. @raghotham Please advise
## Test Plan
None, documentation
[//]: # (## Documentation)
Signed-off-by: Kevin <kpostlet@redhat.com>
# What does this PR do?
Closes#1968.
The asynchronous client in `VLLMInferenceAdapter` is now initialized
directly before first use and not in `VLLMInferenceAdapter.initialize`.
This prevents issues arising due to accessing an expired event loop from
a completed `asyncio.run`.
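The pattern is plain lazy initialization: construct the client on first use, inside whatever event loop is serving the request, rather than in `initialize`. A generic sketch of that pattern (not the adapter's exact code):
```python
class LazyClientAdapter:
    """Sketch: defer client construction until the first request."""

    def __init__(self, base_url: str):
        self.base_url = base_url
        self._client = None

    @property
    def client(self):
        # Created lazily so it is bound to the event loop serving the request,
        # not to a loop that may already have been closed by asyncio.run().
        if self._client is None:
            self._client = self._create_client()
        return self._client

    def _create_client(self):
        # In the real adapter this would be an async OpenAI-compatible client
        # pointed at the vLLM server; a placeholder keeps the sketch runnable.
        return object()


adapter = LazyClientAdapter("http://localhost:8000/v1")
assert adapter.client is adapter.client  # constructed once, then reused
```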
## Test Plan
Ran unit tests, including `test_remote_vllm.py`.
Ran the code snippet mentioned in #1968.
---------
Co-authored-by: Sébastien Han <seb@redhat.com>
# What does this PR do?
Now, tool outputs and retrieved chunks from the vector DB (i.e.,
everything except for the actual model reply) are hidden under an
expander form when presented to the user.
# Test Plan
Navigate to the RAG page in the Playground UI.
# What does this PR do?
The together inference provider was throwing a stack trace every time it
shut down, as it was trying to call a non-existent `close` method on the
AsyncTogether client. While fixing that, I also adjusted its shutdown
logic to close the OpenAI client if we've created one of those, as that
client does have a `close` method.
In testing that, I also realized we were defaulting to treating all
requests as streaming requests instead of defaulting to non-streaming.
So, this flips that default to non-streaming to match how the other
providers work.
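Roughly, the shutdown logic now looks like this (a sketch with illustrative attribute names, not the provider's exact code):
```python
import asyncio


class TogetherLikeProvider:
    """Sketch: only close the client that actually supports close()."""

    def __init__(self):
        self._openai_client = None  # created lazily elsewhere; may stay None

    async def shutdown(self):
        # The AsyncTogether client has no close() method, so nothing to do for
        # it; the OpenAI client does, so close it if we ever created one.
        if self._openai_client is not None:
            await self._openai_client.close()
            self._openai_client = None


asyncio.run(TogetherLikeProvider().shutdown())  # no-op when no client was created
```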
## Test Plan
I tested this by ensuring the together inference provider no longer
spits out a long stack trace when shutting it down and by running the
OpenAI API chat completion verification suite to ensure the change in
default streaming logic didn't mess anything else up.
Signed-off-by: Ben Browning <bbrownin@redhat.com>
# What does this PR do?
This PR cleans up the sidebar on the tools page of the playground in the
following ways:
* created a clearer hierarchy of configuration options and tool
selections.
* Removed the `mcp::` or `builtin::` prefixes from the tool selection
buttons.
[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])
## Test Plan
Run the playground and see the updated sidebar does not cause any new
errors.
```
streamlit run llama_stack/distribution/ui/app.py
```
[//]: # (## Documentation)
Signed-off-by: Michael Clifford <mcliffor@redhat.com>
37da47ef8e (diff-4d7c51b1efe9043e44439a949dfd92e5827321b34082903477fd04876edb7552)
Pydantic was updated from v1 to v2 in this commit, which caused this
breaking change.
# What does this PR do?
Part of #1857
This won't fix the Validation error with the example, but it will
correctly supply user with a proper error rather than a 5xx code.
Signed-off-by: Kevin <kpostlet@redhat.com>
# What does this PR do?
We were passing a dict into the compat mixin for OpenAI Completions when
using Llama models with Fireworks, and that was breaking some strong
typing code that was added in openai_compat.py. We shouldn't have been
converting these params to a dict in that case anyway, so this adjusts
things to pass the params in as their actual original types when calling
the OpenAIChatCompletionToLlamaStackMixin.
## Test Plan
All of the fireworks provider verification tests were failing due to
some OpenAI compatibility cleanup in #1962. The changes in that PR were
good to make, and this just cleans up the fireworks provider code to
stop passing in untyped dicts to some of those `openai_compat.py`
methods since we have the original strongly-typed parameters we can pass
in.
```
llama stack run --image-type venv tests/verifications/openai-api-verification-run.yaml
```
```
python -m pytest -s -v tests/verifications/openai_api/test_chat_completion.py --provider=fireworks-llama-stack
```
Before this PR, all of the fireworks OpenAI verification tests were
failing. Now, most of them are passing.
Signed-off-by: Ben Browning <bbrownin@redhat.com>
# What does this PR do?
- Update NVIDIA documentation links to GA docs
- Remove reference to notebooks until merged
[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])
## Test Plan
[Describe the tests you ran to verify your changes with result
summaries. *Provide clear instructions so the plan can be easily
re-executed.*]
[//]: # (## Documentation)
Co-authored-by: Jash Gulabrai <jgulabrai@nvidia.com>
# What does this PR do?
This is helpful when debugging issues with vLLM + Llama Stack after this
PR https://github.com/vllm-project/vllm/pull/15593
---------
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
# What does this PR do?
NVIDIA Inference provider was using the ModelRegistryHelper to map input
model ids to provider model ids. This updates it to use the model_store.
## Test Plan
`LLAMA_STACK_CONFIG=http://localhost:8321 uv run pytest -v
tests/integration/inference/{test_embedding.py,test_text_inference.py,test_openai_completion.py}
--embedding-model nvidia/llama-3.2-nv-embedqa-1b-v2
--text-model=meta-llama/Llama-3.1-70B-Instruct`
# What does this PR do?
Fixes the UBI 9 container build failure (`error: command 'gcc' failed`
when installing `polyleven`, `faiss`, etc.) by installing the missing
compiler toolchain:
- `python3.11-devel`, `gcc`, and `make` added to the UBI 9 `dnf install` line.
### Closes#1970
## Test Plan
- Build a distro with an UBI image
# What does this PR do?
This was not caught as part of the CI build:
dd62a2388c.
[This PR](https://github.com/meta-llama/llama-stack/pull/1354) was too
old and didn't include the additional CI builds yet.
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
# What does this PR do?
- Adds a note about unexpected Brave Search output appearing even when
Tavily Search is called. This behavior is expected for now and is a work
in progress https://github.com/meta-llama/llama-stack/issues/1229. The
note aims to clear any confusion for new users.
- Adds two example scripts demonstrating how to build an agent using:
1. WebSearch tool
2. WolframAlpha tool
These examples provide new users with an instant understanding of how to
integrate these tools.
[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])
## Test Plan
Tested these example scripts using following steps:
step 1. `ollama run llama3.2:3b-instruct-fp16 --keepalive 60m`
step 2.
```
export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct"
export LLAMA_STACK_PORT=8321
```
step 3: `llama stack run --image-type conda
~/llama-stack/llama_stack/templates/ollama/run.yaml`
step 4: run the example script with your api keys.
expected output:


[//]: # (## Documentation)
Test plan:
python tests/verifications/generate_report.py --providers
fireworks,together,llama_meta_ref,openai
Co-authored-by: Eric Huang <erichuang@fb.com>
# What does this PR do?
Allow users to name an agent and use the name in telemetry instead of
relying on randomly generated agent_ids. This improves the developer
experience by making it easier to find specific agents in telemetry
logs.
Closes#1832
## Test Plan
- Added tests to verify the agent name is properly stored and retrieved
- Ran `uv run -- pytest -v
tests/integration/telemetry/test_telemetry.py::test_agent_name_filtering`
from the root of the project and made sure the tests pass
- Ran `uv run -- pytest -v
tests/integration/telemetry/test_telemetry.py::test_agent_query_spans`
to verify existing code without agent names still works correctly
## Use Example
```
agent = Agent(
llama_stack_client,
model=text_model_id,
name="CustomerSupportAgent", # New parameter
instructions="You are a helpful customer support assistant"
)
session_id = agent.create_session(f"test-session-{uuid4()}")
```
## Implementation Notes
- Agent names are optional string parameters with no additional
validation
- Names are not required to be unique - multiple agents can have the
same name
- The agent_id remains the unique identifier for an agent
---------
Co-authored-by: raghotham <raghotham@gmail.com>
# What does this PR do?
Some of our multi-turn verification tests were failing because I had
accidentally marked content as a required field in the OpenAI chat
completion request assistant messages, but it's actually optional. It is
required for messages from other roles, but assistant is explicitly
allowed to be optional.
Similarly, the assistant message tool_calls field should default to None
instead of an empty list.
These two changes get the openai-llama-stack verification test back to
100% passing, just like it passes 100% when not behind Llama Stack. They
also increase the pass rate of some of the other providers in the
verification test, but don't get them to 100%.
## Test Plan
I started a Llama Stack server setup to run all the verification tests
(requires OPENAI_API_KEY env variable)
```
llama stack run --image-type venv tests/verifications/openai-api-verification-run.yaml
```
Then, I manually ran the verification tests to see which were failing,
fix them, and ran them again after these changes to ensure they were all
passing.
```
python -m pytest -s -v tests/verifications/openai_api/test_chat_completion.py --provider=openai-llama-stack
```
Signed-off-by: Ben Browning <bbrownin@redhat.com>
# What does this PR do?
Add NVIDIA platform docs that serve as a starting point for Llama Stack
users and explains all supported microservices.
[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])
## Test Plan
[Describe the tests you ran to verify your changes with result
summaries. *Provide clear instructions so the plan can be easily
re-executed.*]
[//]: # (## Documentation)
---------
Co-authored-by: Jash Gulabrai <jgulabrai@nvidia.com>
# What does this PR do?
This PR handles the case where a Customization Job's status is
`unknown`. Since we don't map `unknown` to a valid `JobStatus`, the
PostTraining provider throws an exception when fetching/listing a job.
[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])
## Test Plan
[Describe the tests you ran to verify your changes with result
summaries. *Provide clear instructions so the plan can be easily
re-executed.*]
`./scripts/unit-tests.sh
tests/unit/providers/nvidia/test_supervised_fine_tuning.py` succeeds
[//]: # (## Documentation)
Co-authored-by: Jash Gulabrai <jgulabrai@nvidia.com>
# What does this PR do?
Fixes a crash that occurred when building a stack as a container image
via the interactive wizard without supplying --template or --config.
- Root cause: template_or_config was None; only the container path
relies on that parameter, which later reaches subprocess.run() and
triggers
`TypeError: expected str, bytes or os.PathLike object, not NoneType.`
- Change: in `_run_stack_build_command_from_build_config` we now fall
back to the freshly‑written build‑spec file whenever both optional
sources are missing. Also adds a spy‑based unit test that asserts a
valid string path is passed to build_image() for container builds.
### Closes#1976
## Test Plan
- New unit test: test_build_path.py. Monkey‑patches build_image,
captures the fourth argument, and verifies it is a real path
- Manual smoke test:
```
llama stack build --image-type container
# answer wizard prompts
```
Build proceeds into Docker without raising the previous TypeError.
## Future Work
Harmonise `build_image` arguments so every image type receives the same
inputs, eliminating this asymmetric special‑case.
# What does this PR do?
Build failures are hard to read; sometimes we get errors like:
```
Error building stack: 'key'
```
Which are difficult to debug without a proper trace.
## Test Plan
If `llama stack build` fails you get a traceback now.
Signed-off-by: Sébastien Han <seb@redhat.com>
# What does this PR do?
This PR lets users select an existing vector DB to use with their agent on the
tools page of the playground. The drop-down menu that lets users select
a vector DB only appears when the RAG tool is selected. Without this change,
there is no way for a user to specify which vector DB they want their RAG tool
to use on the tools page. I have intentionally left the RAG options
sparse here since the full RAG options are exposed on the RAG page.
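A rough Streamlit sketch of the wiring (the toolgroup id and widget labels are assumptions, and `client.vector_dbs.list()` is the llama-stack-client call I would expect here; adjust to the actual page code):
```python
import streamlit as st
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")
selected_tools = st.multiselect("Tools", ["builtin::rag", "builtin::websearch"])

if "builtin::rag" in selected_tools:
    # Only show the vector DB selector when the RAG tool is enabled.
    vector_dbs = client.vector_dbs.list()
    selected_vector_db = st.selectbox(
        "Select Vector DB", [vdb.identifier for vdb in vector_dbs]
    )
```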
## Test Plan
Without these changes the RAG tool will throw the following error:
`name: knowledge_search) does not have any content `
With these changes the RAG tool works as expected.
Signed-off-by: Michael Clifford <mcliffor@redhat.com>
# What does this PR do?
Adds `meta/llama-3.2-1b-instruct` to list of models that NeMo Customizer
can fine-tune. This is the model our example notebooks typically use for
fine-tuning.
## Test Plan
Co-authored-by: Jash Gulabrai <jgulabrai@nvidia.com>
# What does this PR do?
This PR persists the theme preference across page navigation.
Currently, if the default theme is detected, it is used.
But if a user flips **_the default theme_** and goes to a new page, the
theme will switch back to the default.
This resolves that issue.
## Test Plan
Signed-off-by: Francisco Javier Arceo <farceo@redhat.com>
Fixes: #1955
Since 0.2.0, vLLM gets an empty list (vs `None` in 0.1.9 and
before) when there are no tools configured, which causes the issue
described in #1955. This patch avoids sending the 'tools' param to
vLLM altogether instead of sending an empty list.
It also adds a small unit test to avoid regressions.
The OpenAI
[specification](https://platform.openai.com/docs/api-reference/chat/create)
does not explicitly state that the list cannot be empty, but I found this
out through experimentation and it might depend on the actual remote
vLLM. In any case, as this parameter is Optional, it's best to skip it
altogether if there are no tools configured.
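The gist of the fix, as a hedged sketch (parameter assembly is simplified relative to the actual provider code):
```python
def build_chat_request(model: str, messages: list[dict], tools: list[dict] | None) -> dict:
    params: dict = {"model": model, "messages": messages}
    if tools:  # skip both None and the empty list
        params["tools"] = tools
    return params
```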
Signed-off-by: Daniel Alvarez <dalvarez@redhat.com>
# What does this PR do?
This PR adds a `max_tokens` slider to playground tools page. I have
found that in some instances the llama stack server throws a 500 error
if the max_tokens value is not explicitly set in the agent's
`sampling_params`. This PR uses the same implementation of the
`max_tokens` slider from the chat page and includes it on the tools
page.
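A minimal sketch of the slider wiring (labels and ranges are illustrative):
```python
import streamlit as st

max_tokens = st.slider("Max Tokens", min_value=1, max_value=4096, value=512)

sampling_params = {
    "strategy": {"type": "greedy"},
    "max_tokens": max_tokens,  # set explicitly so the server never sees it unset
}
```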
## Test Plan
1. Attempting to call a tool without these changes results in a `500:
Internal server error: An unexpected error occurred`.
2. Attempting to call a tool with these changes results in the expected
output.
Signed-off-by: Michael Clifford <mcliffor@redhat.com>
# What does this PR do?
This PR adds instructions to set up a vLLM remote endpoint for the
vllm-remote Llama Stack distribution.
## Test Plan
* Verified with manual tests of the configured vllm-remote against vllm
endpoint running on the system with Intel GPU
* Also verified with ci pytests (see cmdline below). Test passes in the
same capacity as it does on the A10 Nvidia setup (some tests do fail
which seems to be known issues with vllm remote llama stack
distribution)
```
pytest -s -v tests/integration/inference/test_text_inference.py \
--stack-config=http://localhost:5001 \
--text-model=meta-llama/Llama-3.2-3B-Instruct
```
CC: @ashwinb
Signed-off-by: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
# What does this PR do?
Allow users to specify only the providers they want in the `llama stack
build` command. If a user wants a non-interactive build but doesn't want
to use a template, `--providers` lets them specify something
like `--providers inference=remote::ollama` for a distro with just
Ollama.
## Test Plan
`llama stack build --providers inference=remote::ollama --image-type
venv`
<img width="1084" alt="Screenshot 2025-03-20 at 9 34 14 AM"
src="https://github.com/user-attachments/assets/502b5fa2-edab-4267-a595-4f987204a6a9"
/>
`llama stack run --image-type venv
/Users/charliedoern/projects/Documents/llama-stack/venv-run.yaml`
<img width="1149" alt="Screenshot 2025-03-20 at 9 35 19 AM"
src="https://github.com/user-attachments/assets/433765f3-6b7f-4383-9241-dad085b69228"
/>
---------
Signed-off-by: Charlie Doern <cdoern@redhat.com>
Signed-off-by: Sébastien Han <seb@redhat.com>
Co-authored-by: Sébastien Han <seb@redhat.com>
# What does this PR do?
## Test Plan
(myenv) ➜ llama-stack python tests/verifications/generate_report.py
--providers fireworks,together,openai --run-tests
f27f617629/tests/verifications/REPORT.md
## What does this PR do?
This PR improves the server's request routing logic by ensuring built-in
FastAPI paths such as `/docs`, `/redoc`, `/openapi.json`,
`/favicon.ico`, and `/static` bypass the custom `TracingMiddleware`.
This prevents unnecessary tracing logic for documentation and static
file requests, ensuring better performance and cleaner logs.
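Conceptually, the bypass looks something like this (a sketch of an ASGI middleware, not the exact server code; the path list mirrors the routes named above):
```python
FASTAPI_BUILTIN_PATHS = ("/docs", "/redoc", "/openapi.json", "/favicon.ico", "/static")

class TracingMiddleware:
    def __init__(self, app):
        self.app = app

    async def __call__(self, scope, receive, send):
        path = scope.get("path", "")
        if scope["type"] != "http" or path.startswith(FASTAPI_BUILTIN_PATHS):
            # Documentation and static asset requests skip tracing entirely.
            return await self.app(scope, receive, send)
        # ... start a trace span, attach x-trace-id, then forward the request ...
        return await self.app(scope, receive, send)
```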
Additionally, it adds proper metadata (`title`, `description`, and
`version`) to the FastAPI application initialization and updates the
requirements document accordingly.
[//]: # (Closes#1822 )
---
## Test Plan
- Ran the server locally with `uvicorn` using the provided `run.yaml`
config
- Verified that:
- FastAPI docs (`/docs`, `/redoc`) load correctly without triggering the
custom tracing middleware
- All other routes still go through the middleware and trace logic
- Application metadata appears as expected in the OpenAPI docs
To reproduce:
1. Start the server with `python server.py --template <template-name>`
2. Navigate to `/docs` and `/redoc`
3. Confirm that no extra trace headers are added for those routes
4. Confirm other API endpoints behave as expected and include
`x-trace-id` in the response headers
---
Froze the requirements file to include many of the other libraries that
have been added in the past few releases to make install easier.
---------
Co-authored-by: Sébastien Han <seb@redhat.com>
# What does this PR do?
Ollama's CLI supports running models via commands such as `ollama run
llama3.2`. This syntax does not work with the INFERENCE_MODEL llamastack
var, as specifying a tag such as 'latest' is currently required.
This commit checks whether the 'latest' model is available and uses
that model if a user passes a model name without a tag but the 'latest' tag
is available in Ollama.
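The fallback logic, sketched (helper name and structure are illustrative):
```python
def resolve_ollama_model(requested: str, available: set[str]) -> str:
    if requested in available:
        return requested
    if ":" not in requested and f"{requested}:latest" in available:
        # e.g. "llama3.2" resolves to "llama3.2:latest"
        return f"{requested}:latest"
    raise ValueError(
        f"Model '{requested}' is not available in Ollama. "
        f"Available models: {', '.join(sorted(available))}"
    )
```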
## Test Plan
Behavior pre-code change
```bash
$ INFERENCE_MODEL=llama3.2 llama stack build --template ollama --image-type venv --run
...
INFO 2025-04-08 13:42:42,842 llama_stack.providers.remote.inference.ollama.ollama:80 inference: checking
connectivity to Ollama at `http://beanlab1.bss.redhat.com:11434`...
Traceback (most recent call last):
File "<frozen runpy>", line 198, in _run_module_as_main
File "<frozen runpy>", line 88, in _run_code
File "/home/nathan/ai/llama-stack/repos/llama-stack/llama_stack/distribution/server/server.py", line 502, in <module>
main()
File "/home/nathan/ai/llama-stack/repos/llama-stack/llama_stack/distribution/server/server.py", line 401, in main
impls = asyncio.run(construct_stack(config))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib64/python3.12/asyncio/runners.py", line 195, in run
return runner.run(main)
^^^^^^^^^^^^^^^^
File "/usr/lib64/python3.12/asyncio/runners.py", line 118, in run
return self._loop.run_until_complete(task)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib64/python3.12/asyncio/base_events.py", line 691, in run_until_complete
return future.result()
^^^^^^^^^^^^^^^
File "/home/nathan/ai/llama-stack/repos/llama-stack/llama_stack/distribution/stack.py", line 222, in construct_stack
await register_resources(run_config, impls)
File "/home/nathan/ai/llama-stack/repos/llama-stack/llama_stack/distribution/stack.py", line 99, in register_resources
await method(**obj.model_dump())
File "/home/nathan/ai/llama-stack/repos/llama-stack/llama_stack/providers/utils/telemetry/trace_protocol.py", line 102, in async_wrapper
result = await method(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/nathan/ai/llama-stack/repos/llama-stack/llama_stack/distribution/routers/routing_tables.py", line 294, in register_model
registered_model = await self.register_object(model)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/nathan/ai/llama-stack/repos/llama-stack/llama_stack/distribution/routers/routing_tables.py", line 228, in register_object
registered_obj = await register_object_with_provider(obj, p)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/nathan/ai/llama-stack/repos/llama-stack/llama_stack/distribution/routers/routing_tables.py", line 77, in register_object_with_provider
return await p.register_model(obj)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/nathan/ai/llama-stack/repos/llama-stack/llama_stack/providers/utils/telemetry/trace_protocol.py", line 102, in async_wrapper
result = await method(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/nathan/ai/llama-stack/repos/llama-stack/llama_stack/providers/remote/inference/ollama/ollama.py", line 315, in register_model
raise ValueError(
ValueError: Model 'llama3.2' is not available in Ollama. Available models: llama3.2:latest
++ error_handler 108
++ echo 'Error occurred in script at line: 108'
Error occurred in script at line: 108
++ exit 1
```
Behavior post-code change
```bash
$ INFERENCE_MODEL=llama3.2 llama stack build --template ollama --image-type venv --run
...
INFO 2025-04-08 13:58:17,365 llama_stack.providers.remote.inference.ollama.ollama:80 inference: checking
connectivity to Ollama at `http://beanlab1.bss.redhat.com:11434`...
WARNING 2025-04-08 13:58:18,190 llama_stack.providers.remote.inference.ollama.ollama:317 inference: Imprecise provider
resource id was used but 'latest' is available in Ollama - using 'llama3.2:latest'
INFO 2025-04-08 13:58:18,191 llama_stack.providers.remote.inference.ollama.ollama:308 inference: Pulling embedding
model `all-minilm:latest` if necessary...
INFO 2025-04-08 13:58:18,799 __main__:478 server: Listening on ['::', '0.0.0.0']:8321
INFO: Started server process [28378]
INFO: Waiting for application startup.
INFO 2025-04-08 13:58:18,803 __main__:148 server: Starting up
INFO: Application startup complete.
INFO: Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit)
...
```
## Documentation
Did not document this anywhere but happy to do so if there is an
appropriate place
Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
# What does this PR do?
Now a separate thread is started to execute training jobs. Training
requests now return job ID before the job completes. (Which fixes API
timeouts for any jobs that take longer than a minute.)
Note: the scheduler code is meant to be spun out in the future into a
common provider service that can be reused for different APIs and
providers. It is also expected to back the /jobs API proposed here:
https://github.com/meta-llama/llama-stack/discussions/1238
Hence its somewhat generalized form which is expected to simplify its
adoption elsewhere in the future.
Note: this patch doesn't attempt to implement missing APIs (e.g. cancel
or job removal). This work will belong to follow-up PRs.
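In spirit, the scheduler does something like the following (a stripped-down sketch; the real module is more general than this):
```python
import threading
import uuid

class Scheduler:
    def __init__(self):
        self.jobs: dict[str, str] = {}  # job_uuid -> status

    def schedule(self, job_fn) -> str:
        job_uuid = f"job-{uuid.uuid4()}"
        self.jobs[job_uuid] = "scheduled"

        def _run():
            self.jobs[job_uuid] = "in_progress"
            try:
                job_fn()
                self.jobs[job_uuid] = "completed"
            except Exception:
                self.jobs[job_uuid] = "failed"

        threading.Thread(target=_run, daemon=True).start()
        return job_uuid  # handed back to the caller before the job finishes
```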
## Test Plan
Added unit tests for the scheduler module. For the API coverage, did
manual testing and was able to run a training cycle on GPU. The initial
call returned job ID before the training completed, as (now) expected.
Artifacts are returned as expected.
```
JobArtifactsResponse(checkpoints=[{'identifier': 'meta-llama/Llama-3.2-3B-Instruct-sft-0', 'created_at': '2025-03-07T22:45:19.892714', 'epoch': 0, 'post_training_job_id': 'test-job2ee77104-2fd3-4a4e-84cf-f83f8b8f1f50', 'path': '/home/ec2-user/.llama/checkpoints/meta-llama/Llama-3.2-3B-Instruct-sft-0', 'training_metrics': None}], job_uuid='test-job2ee77104-2fd3-4a4e-84cf-f83f8b8f1f50')
```
The integration test is currently disabled for the provider. I will look
into how it can be enabled in a different PR / issue context.
Signed-off-by: Ihar Hrachyshka <ihar.hrachyshka@gmail.com>
# What does this PR do?
TLDR: Changes needed to get 100% passing tests for OpenAI API
verification tests when run against Llama Stack with the `together`,
`fireworks`, and `openai` providers. And `groq` is better than before,
at 88% passing.
This cleans up the OpenAI API support for image message types
(specifically `image_url` types) and handling of the `response_format`
chat completion parameter. Both of these required a few more Pydantic
model definitions in our Inference API, just to move from the
not-quite-right stubs I had in place to something fleshed out to match
the actual OpenAI API specs.
As part of testing this, I also found and fixed a bug in the litellm
implementation of openai_completion and openai_chat_completion, so the
providers based on those should actually be working now.
The method `prepare_openai_completion_params` in
`llama_stack/providers/utils/inference/openai_compat.py` was improved to
actually recursively clean up input parameters, including handling of
lists, dicts, and dumping of Pydantic models to dicts. These changes
were required to get to 100% passing tests on the OpenAI API
verification against the `openai` provider.
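The recursive clean-up is roughly this (a simplified sketch of the idea behind `prepare_openai_completion_params`, not the exact implementation):
```python
from pydantic import BaseModel

def _clean(value):
    if isinstance(value, BaseModel):
        return _clean(value.model_dump(exclude_none=True))
    if isinstance(value, dict):
        return {k: _clean(v) for k, v in value.items() if v is not None}
    if isinstance(value, list):
        return [_clean(v) for v in value if v is not None]
    return value

def prepare_params(**kwargs):
    # Drop None values and dump Pydantic models to plain dicts, recursively.
    return {k: _clean(v) for k, v in kwargs.items() if v is not None}
```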
With the above, the together.ai provider was passing as well as it is
without Llama Stack. But, since we have Llama Stack in the middle, I
took the opportunity to clean up the together.ai provider so that it now
also passes the OpenAI API spec tests we have at 100%. That means
together.ai is now passing our verification test better when using an
OpenAI client talking to Llama Stack than it is when hitting together.ai
directly, without Llama Stack in the middle.
And, another round of work for Fireworks to improve translation of
incoming OpenAI chat completion requests to Llama Stack chat completion
requests gets the fireworks provider passing at 100%. The server-side
fireworks.ai tool calling support with OpenAI chat completions and Llama
4 models isn't great yet, but by pointing the OpenAI clients at Llama
Stack's API we can clean things up and get everything working as
expected for Llama 4 models.
## Test Plan
### OpenAI API Verification Tests
I ran the OpenAI API verification tests as below and 100% of the tests
passed.
First, start a Llama Stack server that runs the `openai` provider with
the `gpt-4o` and `gpt-4o-mini` models deployed. There's not a template
setup to do this out of the box, so I added a
`tests/verifications/openai-api-verification-run.yaml` to do this.
First, ensure you have the necessary API key environment variables set:
```
export TOGETHER_API_KEY="..."
export FIREWORKS_API_KEY="..."
export OPENAI_API_KEY="..."
```
Then, run a Llama Stack server that serves up all these providers:
```
llama stack run \
--image-type venv \
tests/verifications/openai-api-verification-run.yaml
```
Finally, generate a new verification report against all these providers,
both with and without the Llama Stack server in the middle.
```
python tests/verifications/generate_report.py \
--run-tests \
--provider \
together \
fireworks \
groq \
openai \
together-llama-stack \
fireworks-llama-stack \
groq-llama-stack \
openai-llama-stack
```
You'll see that most of the configurations with Llama Stack in the
middle now pass at 100%, even though some of them do not pass at 100%
when hitting the backend provider's API directly with an OpenAI client.
### OpenAI Completion Integration Tests with vLLM:
I also ran the smaller `test_openai_completion.py` test suite (that's
not yet merged with the verification tests) on multiple of the
providers, since I had to adjust the method signature of
openai_chat_completion a bit and thus had to touch lots of these
providers to match. Here's the tests I ran there, all passing:
```
VLLM_URL="http://localhost:8000/v1" INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" llama stack build --template remote-vllm --image-type venv --run
```
in another terminal
```
LLAMA_STACK_CONFIG=http://localhost:8321 INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" python -m pytest -v tests/integration/inference/test_openai_completion.py --text-model "meta-llama/Llama-3.2-3B-Instruct"
```
### OpenAI Completion Integration Tests with ollama
```
INFERENCE_MODEL="llama3.2:3b-instruct-q8_0" llama stack build --template ollama --image-type venv --run
```
in another terminal
```
LLAMA_STACK_CONFIG=http://localhost:8321 INFERENCE_MODEL="llama3.2:3b-instruct-q8_0" python -m pytest -v tests/integration/inference/test_openai_completion.py --text-model "llama3.2:3b-instruct-q8_0"
```
### OpenAI Completion Integration Tests with together.ai
```
INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct-Turbo" llama stack build --template together --image-type venv --run
```
in another terminal
```
LLAMA_STACK_CONFIG=http://localhost:8321 INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct-Turbo" python -m pytest -v tests/integration/inference/test_openai_completion.py --text-model "meta-llama/Llama-3.2-3B-Instruct-Turbo"
```
### OpenAI Completion Integration Tests with fireworks.ai
```
INFERENCE_MODEL="meta-llama/Llama-3.1-8B-Instruct" llama stack build --template fireworks --image-type venv --run
```
in another terminal
```
LLAMA_STACK_CONFIG=http://localhost:8321 INFERENCE_MODEL="meta-llama/Llama-3.1-8B-Instruct" python -m pytest -v tests/integration/inference/test_openai_completion.py --text-model "meta-llama/Llama-3.1-8B-Instruct"
```
---------
Signed-off-by: Ben Browning <bbrownin@redhat.com>
Bumps [astral-sh/setup-uv](https://github.com/astral-sh/setup-uv) from
5.4.0 to 5.4.1.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a
href="https://github.com/astral-sh/setup-uv/releases">astral-sh/setup-uv's
releases</a>.</em></p>
<blockquote>
<h2>v5.4.1 🌈 Add support for pep440 version specifiers</h2>
<h2>Changes</h2>
<p>With this release you can also use <a
href="https://peps.python.org/pep-0440/#version-specifiers">pep440
version specifiers</a> as <code>required-version</code> in
files <code>uv.toml</code>, <code>pyproject.toml</code> and in the
<code>version</code> input:</p>
<pre lang="yaml"><code>- name: Install a pep440-specifier-satisfying
version of uv
uses: astral-sh/setup-uv@v5
with:
version: ">=0.4.25,<0.5"
</code></pre>
<h2>🐛 Bug fixes</h2>
<ul>
<li>Add support for pep440 version identifiers <a
href="https://github.com/eifinger"><code>@eifinger</code></a> (<a
href="https://redirect.github.com/astral-sh/setup-uv/issues/353">#353</a>)</li>
</ul>
<h2>🧰 Maintenance</h2>
<ul>
<li>chore: update known checksums for 0.6.10 @<a
href="https://github.com/apps/github-actions">github-actions[bot]</a>
(<a
href="https://redirect.github.com/astral-sh/setup-uv/issues/345">#345</a>)</li>
</ul>
<h2>📚 Documentation</h2>
<ul>
<li>Add pep440 to docs header <a
href="https://github.com/eifinger"><code>@eifinger</code></a> (<a
href="https://redirect.github.com/astral-sh/setup-uv/issues/355">#355</a>)</li>
<li>Fix glob syntax link <a
href="https://github.com/flying-sheep"><code>@flying-sheep</code></a>
(<a
href="https://redirect.github.com/astral-sh/setup-uv/issues/349">#349</a>)</li>
<li>Add link to supported glob patterns <a
href="https://github.com/eifinger"><code>@eifinger</code></a> (<a
href="https://redirect.github.com/astral-sh/setup-uv/issues/348">#348</a>)</li>
</ul>
</blockquote>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a
href="0c5e2b8115"><code>0c5e2b8</code></a>
Add pep440 to docs header (<a
href="https://redirect.github.com/astral-sh/setup-uv/issues/355">#355</a>)</li>
<li><a
href="794ea9455c"><code>794ea94</code></a>
Add support for pep440 version identifiers (<a
href="https://redirect.github.com/astral-sh/setup-uv/issues/353">#353</a>)</li>
<li><a
href="2d49baf2b6"><code>2d49baf</code></a>
chore: update known checksums for 0.6.10 (<a
href="https://redirect.github.com/astral-sh/setup-uv/issues/345">#345</a>)</li>
<li><a
href="4fa25599ce"><code>4fa2559</code></a>
Fix glob syntax link (<a
href="https://redirect.github.com/astral-sh/setup-uv/issues/349">#349</a>)</li>
<li><a
href="224dce1d79"><code>224dce1</code></a>
Add link to supported glob patterns (<a
href="https://redirect.github.com/astral-sh/setup-uv/issues/348">#348</a>)</li>
<li>See full diff in <a
href="22695119d7...0c5e2b8115">compare
view</a></li>
</ul>
</details>
<br />
[](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)
Dependabot will resolve any conflicts with this PR as long as you don't
alter it yourself. You can also trigger a rebase manually by commenting
`@dependabot rebase`.
---
<details>
<summary>Dependabot commands and options</summary>
<br />
You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits
that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after
your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge
and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating
it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all
of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop
Dependabot creating any more for this major version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop
Dependabot creating any more for this minor version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop
Dependabot creating any more for this dependency (unless you reopen the
PR or upgrade to it yourself)
</details>
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
# What does this PR do?
Currently the instructions for Llama 4 take quite some space before
people can see the overview and other sections about Llama Stack. Moving
this to a collapsed section would make it less verbose.
workflow -
0. Checkout
1. Install uv
2. Install Ollama
3. Pull Ollama image
4. Start Ollama in background
5. Set Up Environment and Install Dependencies
6. Wait for Ollama to start
7. Start Llama Stack server in background
8. Wait for Llama Stack server to be ready
9. Run Integration Tests
changes -
(4) starts the loading of the Ollama model; it does not start Ollama.
The model will be loaded when used, so this step is removed.
(6) is handled in (2), so this step is removed.
(2) is renamed to reflect its dual purpose.
* feat: function tools in OpenAI Responses by @bbrowning in https://github.com/meta-llama/llama-stack/pull/2094, getting closer to ready. Streaming is the next missing piece.
* feat: Adding support for customizing chunk context in RAG insertion and querying by @franciscojavierarceo in https://github.com/meta-llama/llama-stack/pull/2134
* feat: scaffolding for Llama Stack UI by @ehhuang in https://github.com/meta-llama/llama-stack/pull/2149, more to come in the coming releases.
---
# v0.2.6
Published on: 2025-05-12T18:06:52Z
---
# v0.2.5
Published on: 2025-05-04T20:16:49Z
---
# v0.2.4
Published on: 2025-04-29T17:26:01Z
## Highlights
* One-liner to install and run Llama Stack yay! by @reluctantfuturist in https://github.com/meta-llama/llama-stack/pull/1383
* support for NVIDIA NeMo datastore by @raspawar in https://github.com/meta-llama/llama-stack/pull/1852
* (yuge!) Kubernetes authentication by @leseb in https://github.com/meta-llama/llama-stack/pull/1778
* (yuge!) OpenAI Responses API by @bbrowning in https://github.com/meta-llama/llama-stack/pull/1989
* add api.llama provider, llama-guard-4 model by @ashwinb in https://github.com/meta-llama/llama-stack/pull/2058
---
# v0.2.3
Published on: 2025-04-25T22:46:21Z
## Highlights
* OpenAI compatible inference endpoints and client-SDK support. `client.chat.completions.create()` now works.
* significant improvements and functionality added to the NVIDIA distribution
* many improvements to the test verification suite.
* new inference providers: Ramalama, IBM WatsonX
* many improvements to the Playground UI
---
# v0.2.2
Published on: 2025-04-13T01:19:49Z
## Main changes
- Bring Your Own Provider (@leseb) - use out-of-tree provider code to execute the distribution server
- OpenAI compatible inference API in progress (@bbrowning)
> [!CAUTION]
> Before pushing your changes, make sure that the pre-commit hooks have passed successfully.
## Running tests
You can find the Llama Stack testing documentation [here](tests/README.md).
## Running unit tests
You can run the unit tests by running:
```bash
source .venv/bin/activate
./scripts/unit-tests.sh
```
If you'd like to run for a non-default version of Python (currently 3.10), pass the `PYTHON_VERSION` variable as follows:
```
source .venv/bin/activate
PYTHON_VERSION=3.13 ./scripts/unit-tests.sh
```
## Running integration tests
You can run integration tests following the instructions [here](tests/integration/README.md).
## Adding a new dependency to the project
## Coding Style
* Comments should provide meaningful insights into the code. Avoid filler comments that simply describe the next step, as they create unnecessary clutter, same goes for docstrings.
* Prefer comments to clarify surprising behavior and/or relationships between parts of the code rather than explain what the next line of code does.
* When catching exceptions, prefer using a specific exception type rather than a broad catch-all like `Exception`.
* Error messages should be prefixed with "Failed to ..."
* 4 spaces for indentation rather than tabs
* When using `# noqa` to suppress a style or linter warning, include a comment explaining the justification for bypassing the check.
* When using `# type: ignore` to suppress a mypy warning, include a comment explaining the justification for bypassing the check.
* Don't use unicode characters in the codebase. ASCII-only is preferred for compatibility or readability reasons.
## Common Tasks
If you are making changes to the documentation at [https://llama-stack.readthedocs.io/en/latest/](https://llama-stack.readthedocs.io/en/latest/), you can use the following command to build the documentation and preview your changes. You will need [Sphinx](https://www.sphinx-doc.org/en/master/) and the readthedocs theme.
```bash
# This rebuilds the documentation pages.
uv run --group docs make -C docs/ html
# This will start a local server (usually at http://127.0.0.1:8000) that automatically rebuilds and refreshes when you make changes to the documentation.
uv run --group docs sphinx-autobuild docs/source docs/build/html --write-all
```
### Update API Documentation
If you modify or add new API endpoints, update the API documentation accordingly. You can do this by running the following command:
```bash
uv run ./docs/openapi_generator/run_openapi_generator.sh
```
The generated API documentation will be available in `docs/_static/`. Make sure to review the changes before committing.
We released [Version 0.2.0](https://github.com/meta-llama/llama-stack/releases/tag/v0.2.0) with support for the Llama 4 herd of models released by Meta.
You can now run Llama 4 models on Llama Stack.
<details>
<summary>👋 Click here to see how to run Llama 4 models on Llama Stack </summary>
\
*Note you need 8xH100 GPU-host to run these models*
As more providers start supporting Llama 4, you can use them in Llama Stack as well. We are adding to the list. Stay tuned!
</details>
### 🚀 One-Line Installer 🚀
To try Llama Stack locally, run:
```bash
curl -LsSf https://github.com/meta-llama/llama-stack/raw/main/install.sh | sh
```
### Overview
Llama Stack standardizes the core building blocks that simplify AI application development. It codifies best practices across the Llama ecosystem. More specifically, it provides
### API Providers
Here is a list of the various API providers and available distributions that can help developers get started easily with Llama Stack.
Here's a collection of comprehensive guides, examples, and resources for building AI applications with Llama Stack. For the complete documentation, visit our [ReadTheDocs page](https://llama-stack.readthedocs.io/en/latest/index.html).
## Render locally
From the llama-stack root directory, run the following command to render the docs locally:
```bash
uv run --group docs sphinx-autobuild docs/source docs/build/html --write-all
```
You can open up the docs in your browser at http://localhost:8000
For more strongly typed interaction, use the typed dicts found [here](https://github.com/meta-llama/llama-stack-client-python/blob/38cd91c9e396f2be0bec1ee96a19771582ba6f17/src/llama_stack_client/types/shared_params/document.py).
The tool requires an API key which can be provided either in the configuration or through the request header `X-LlamaStack-Provider-Data`. The format of the header is `{"<provider_name>_api_key": <your api key>}`.
> **NOTE:** When using Tavily Search and Bing Search, the inference output will still display "Brave Search." This is because Llama models have been trained with Brave Search as a built-in tool. Tavily and Bing are just being used in lieu of Brave Search.
#### Code Interpreter
The Code Interpreter allows execution of Python code within a controlled environment.
- Secure execution environment using `bwrap` sandboxing
- Matplotlib support for generating plots
- Disabled dangerous system operations
- Configurable execution timeouts
> ⚠️ Important: The code interpreter tool can operate in a controlled environment locally or on Podman containers. To ensure proper functionality in containerized environments:
> - The container requires privileged access (e.g., --privileged).
> - Users without sufficient permissions may encounter permission errors. (`bwrap: Can't mount devpts on /newroot/dev/pts: Permission denied`)
> - 🔒 Security Warning: Privileged mode grants elevated access and bypasses security restrictions. Use only in local, isolated, or controlled environments.
#### WolframAlpha
The WolframAlpha tool provides access to computational knowledge through the WolframAlpha API.
Features:
- Context retrieval with token limits
> **Note:** By default, llama stack run.yaml defines toolgroups for web search, wolfram alpha and rag, that are provided by tavily-search, wolfram-alpha and rag providers.
This guide will walk you through the process of adding a new API provider to Llama Stack.
- Begin by reviewing the [core concepts](../concepts/index.md) of Llama Stack and choose the API your provider belongs to (Inference, Safety, VectorIO, etc.)
- Determine the provider type ({repopath}`Remote::llama_stack/providers/remote` or {repopath}`Inline::llama_stack/providers/inline`). Remote providers make requests to external services, while inline providers execute implementation locally.
- Add your provider to the appropriate {repopath}`Registry::llama_stack/providers/registry/`. Specify pip dependencies necessary.
- Update any distribution {repopath}`Templates::llama_stack/templates/` `build.yaml` and `run.yaml` files if they should include your provider by default. Run {repopath}`./scripts/distro_codegen.py` if necessary. Note that `distro_codegen.py` will fail if the new provider causes any distribution template to attempt to import provider-specific dependencies. This usually means the distribution's `get_distribution_template()` code path should only import any necessary Config or model alias definitions from each provider and not the provider's actual implementation.
Here are some example PRs to help you get started:
Unit tests are located in {repopath}`tests/unit`. Provider-specific unit tests are located in {repopath}`tests/unit/providers`. These tests are all run automatically as part of the CI process.
Consult {repopath}`tests/unit/README.md` for more details on how to run the tests manually.
Llama Stack supports external providers that live outside of the main codebase. This allows you to create and maintain your own providers independently or use community-provided providers.
To build a distribution with external providers, you need to:
1. Configure the `external_providers_dir` in your build configuration file:
```yaml
# Example my-external-stack.yaml with external providers
version: '2'
distribution_spec:
  description: Custom distro for CI tests
  providers:
    inference:
    - remote::custom_ollama
  # Add more providers as needed
image_type: container
image_name: ci-test
# Path to external provider implementations
external_providers_dir: ~/.llama/providers.d
```
Here's an example for a custom Ollama provider:
```yaml
adapter:
  adapter_type: custom_ollama
  pip_packages:
  - ollama
  - aiohttp
  - llama-stack-provider-ollama # This is the provider package
```
The `pip_packages` section lists the Python packages required by the provider, as well as the
provider package itself. The package must be available on PyPI or can be provided from a local
directory or a git repository (git must be installed on the build environment).
2. Build your distribution using the config file:
```
llama stack build --config my-external-stack.yaml
```
For more information on external providers, including directory structure, provider types, and implementation requirements, see the [External Providers documentation](../providers/external.md).
:::
:::{tab-item} Building Container
```{admonition} Podman Alternative
INFO: Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit)
INFO: 2401:db00:35c:2d2b:face:0:c9:0:54678 - "GET /models/list HTTP/1.1" 200 OK
```
### Listing Distributions
Using the list command, you can view all existing Llama Stack distributions, including stacks built from templates, from scratch, or using custom configuration files.
```
llama stack list -h
usage: llama stack list [-h]
list the build stacks
options:
-h, --help show this help message and exit
```
Example Usage
```
llama stack list
```
### Removing a Distribution
Use the remove command to delete a distribution you've previously built.
```
llama stack rm -h
usage: llama stack rm [-h] [--all] [name]
Remove the build stack
positional arguments:
name Name of the stack to delete (default: None)
options:
-h, --help show this help message and exit
--all, -a Delete all stacks (use with caution) (default: False)
```
Example
```
llama stack rm llamastack-test
```
To keep your environment organized and avoid clutter, consider using `llama stack list` to review old or unused distributions and `llama stack rm <name>` to delete them when they’re no longer needed.
Let's break this down into the different sections. The first section specifies the set of APIs that the stack server will serve:
What's with the `provider_model_id` field? This is an identifier for the model inside the provider's model catalog. Contrast it with `model_id` which is the identifier for the same model for Llama Stack's purposes. For example, you may want to name "llama3.2:vision-11b" as "image_captioning_model" when you use it in your Stack interactions. When omitted, the server will set `provider_model_id` to be the same as `model_id`.
## Server Configuration
The `server` section configures the HTTP server that serves the Llama Stack APIs:
```yaml
server:
port: 8321 # Port to listen on (default: 8321)
tls_certfile: "/path/to/cert.pem" # Optional: Path to TLS certificate for HTTPS
tls_keyfile: "/path/to/key.pem" # Optional: Path to TLS key for HTTPS
```
### Authentication Configuration
The `auth` section configures authentication for the server. When configured, all API requests must include a valid Bearer token in the Authorization header:
```
Authorization: Bearer <token>
```
The server supports multiple authentication providers:
#### OAuth 2.0/OpenID Connect Provider with Kubernetes
The Kubernetes cluster must be configured to use a service account for authentication.
This will download jar files in your gradle cache in a directory like `~/.gradle/caches/modules-2/files-2.1/com.llama.llamastack/`
Include the ExecuTorch library by:
1. Download the `download-prebuilt-et-lib.sh` script file from the [llama-stack-client-kotlin-client-local](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release/llama-stack-client-kotlin-client-local/download-prebuilt-et-lib.sh) directory to your local machine.
2. Move the script to the top level of your Android app where the `app` directory resides.
3. Run `sh download-prebuilt-et-lib.sh` to create an `app/libs` directory and download the `executorch.aar` in that path. This generates an ExecuTorch library for the XNNPACK delegate.
4. Add the `executorch.aar` dependency in your `build.gradle.kts` file:
```
dependencies {
  ...
}
```
See other dependencies for the local RAG in Android app [README](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release/examples/android_app#quick-start).
## Llama Stack APIs in Your Android App
Breaking down the demo app, this section will show the core pieces that are used to initialize and run inference with Llama Stack using the Kotlin library.
Make sure you have access to a watsonx API Key. You can get one by referring [watsonx.ai](https://www.ibm.com/docs/en/masv-and-l/maximo-manage/continuous-delivery?topic=setup-create-watsonx-api-key).
## Running Llama Stack with watsonx
You can do this via Conda (build code), venv or Docker which has a pre-built image.
### Via Docker
This method allows you to get started quickly without having to build the distribution code.
The only difference vs. the `meta-reference-gpu` distribution is that it has support for more efficient inference -- with fp8, int4 quantization, etc.
Note that you need access to NVIDIA GPUs to run this distribution. This distribution is not compatible with CPU-only machines or machines with AMD GPUs.
### Environment Variables
The following environment variables can be configured:
- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
- `INFERENCE_MODEL`: Inference model loaded into the Meta Reference server (default: `meta-llama/Llama-3.2-3B-Instruct`)
- `INFERENCE_CHECKPOINT_DIR`: Directory containing the Meta Reference model checkpoint (default: `null`)
## Prerequisite: Downloading Models
Please use `llama model list --downloaded` to check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
Make sure you have access to an NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/). Use this key for the `NVIDIA_API_KEY` environment variable.
### Deploy NeMo Microservices Platform
The NVIDIA NeMo microservices platform supports end-to-end microservice deployment of a complete AI flywheel on your Kubernetes cluster through the NeMo Microservices Helm Chart. Please reference the [NVIDIA NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/latest/about/index.html) for platform prerequisites and instructions to install and deploy the platform.
## Supported Services
Each Llama Stack API corresponds to a specific NeMo microservice. The core microservices (Customizer, Evaluator, Guardrails) are exposed by the same endpoint. The platform components (Data Store) are each exposed by separate endpoints.
### Inference: NVIDIA NIM
NVIDIA NIM is used for running inference with registered models. There are two ways to access NVIDIA NIMs:
1. Hosted (default): Preview APIs hosted at https://integrate.api.nvidia.com (Requires an API key)
2. Self-hosted: NVIDIA NIMs that run on your own infrastructure.
The deployed platform includes the NIM Proxy microservice, which is the service that provides access to your NIMs (for example, to run inference on a model). Set the `NVIDIA_BASE_URL` environment variable to use your NVIDIA NIM Proxy deployment.
### Datasetio API: NeMo Data Store
The NeMo Data Store microservice serves as the default file storage solution for the NeMo microservices platform. It exposes APIs compatible with the Hugging Face Hub client (`HfApi`), so you can use the client to interact with Data Store. The `NVIDIA_DATASETS_URL` environment variable should point to your NeMo Data Store endpoint.
See the [NVIDIA Datasetio docs](/llama_stack/providers/remote/datasetio/nvidia/README.md) for supported features and example usage.
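For example, since the Data Store speaks the Hugging Face Hub API, the standard `HfApi` client can talk to it directly (the repo and file names below are placeholders, not required values):
```python
import os
from huggingface_hub import HfApi

hf_api = HfApi(endpoint=os.environ["NVIDIA_DATASETS_URL"], token="")
hf_api.upload_file(
    path_or_fileobj="training_data.jsonl",
    path_in_repo="training/training_data.jsonl",
    repo_id="default/sample-dataset",  # placeholder dataset repo
    repo_type="dataset",
)
```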
### Eval API: NeMo Evaluator
The NeMo Evaluator microservice supports evaluation of LLMs. Launching an Evaluation job with NeMo Evaluator requires an Evaluation Config (an object that contains metadata needed by the job). A Llama Stack Benchmark maps to an Evaluation Config, so registering a Benchmark creates an Evaluation Config in NeMo Evaluator. The `NVIDIA_EVALUATOR_URL` environment variable should point to your NeMo Microservices endpoint.
See the [NVIDIA Eval docs](/llama_stack/providers/remote/eval/nvidia/README.md) for supported features and example usage.
### Post-Training API: NeMo Customizer
The NeMo Customizer microservice supports fine-tuning models. You can reference [this list of supported models](/llama_stack/providers/remote/post_training/nvidia/models.py) that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint.
See the [NVIDIA Post-Training docs](/llama_stack/providers/remote/post_training/nvidia/README.md) for supported features and example usage.
### Safety API: NeMo Guardrails
The NeMo Guardrails microservice sits between your application and the LLM, and adds checks and content moderation to a model. The `GUARDRAILS_SERVICE_URL` environment variable should point to your NeMo Microservices endpoint.
See the NVIDIA Safety docs for supported features and example usage.
## Deploying models
In order to use a registered model with the Llama Stack APIs, ensure the corresponding NIM is deployed to your environment. For example, you can use the NIM Proxy microservice to deploy `meta/llama-3.2-1b-instruct`.
Note: For improved inference speeds, we need to use NIM with `fast_outlines` guided decoding system (specified in the request body). This is the default if you deployed the platform with the NeMo Microservices Helm Chart.
This NIM deployment should take approximately 10 minutes to go live. [See the docs](https://docs.nvidia.com/nemo/microservices/latest/get-started/tutorials/deploy-nims.html) for more information on how to deploy a NIM and verify it's available for inference.
You can also remove a deployed NIM to free up GPU resources, if needed.
## Setting up vLLM server
In the following sections, we'll use AMD, NVIDIA or Intel GPUs to serve as hardware accelerators for the vLLM
server, which acts as both the LLM inference provider and the safety provider. Note that vLLM also
[supports many other hardware accelerators](https://docs.vllm.ai/en/latest/getting_started/installation.html) and
that we only use GPUs here for demonstration purposes. Note that if you run into issues, you can include the environment variable `--env VLLM_DEBUG_LOG_API_SERVER_RESPONSE=true` (available in vLLM v0.8.3 and above) in the `docker run` command to enable log response from API server for debugging.
### Setting up vLLM server on AMD GPU
### Setting up vLLM server on Intel GPU
Refer to [vLLM Documentation for XPU](https://docs.vllm.ai/en/v0.8.2/getting_started/installation/gpu.html?device=xpu) to get a vLLM endpoint. In addition to the vLLM-side setup, which guides you through installing vLLM from source or self-building a vLLM Docker container, Intel provides a prebuilt vLLM container to use on systems with Intel GPUs supported by the PyTorch XPU backend:
If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
Now you are ready to run Llama Stack with vLLM as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
Make sure you have access to a SambaNova API Key. You can get one by visiting [SambaNova.ai](http://cloud.sambanova.ai?utm_source=llamastack&utm_medium=external&utm_campaign=cloud_signup).
## Running Llama Stack with SambaNova
You can do this via Conda (build code) or Docker which has a pre-built image.
### Via Docker
This method allows you to get started quickly without having to build the distribution code.
To enable external providers, you need to configure the `external_providers_dir` in your Llama Stack configuration. This directory should contain your external provider specifications:
Here's a list of known external providers that you can use with Llama Stack:
| Name | Description | API | Type | Repository |
|------|-------------|-----|------|------------|
| KubeFlow Training | Train models with KubeFlow | Post Training | Remote | [llama-stack-provider-kft](https://github.com/opendatahub-io/llama-stack-provider-kft) |
| KubeFlow Pipelines | Train models with KubeFlow Pipelines | Post Training | Inline **and** Remote | [llama-stack-provider-kfp-trainer](https://github.com/opendatahub-io/llama-stack-provider-kfp-trainer) |
[HuggingFace SFTTrainer](https://huggingface.co/docs/trl/en/sft_trainer) is an inline post training provider for Llama Stack. It allows you to run supervised fine tuning on a variety of models using many datasets.
## Features
- Simple access through the post_training API
- Fully integrated with Llama Stack
- GPU support, CPU support, and MPS support (MacOS Metal Performance Shaders)
## Usage
To use the HF SFTTrainer in your Llama Stack project, follow these steps:
1. Configure your Llama Stack project to use this provider.
2. Kick off a SFT job using the Llama Stack post_training API.
## Setup
You can access the HuggingFace trainer via the `ollama` distribution:
[NVIDIA NEMO](https://developer.nvidia.com/nemo-framework) is a remote post training provider for Llama Stack. It provides enterprise-grade fine-tuning capabilities through NVIDIA's NeMo Customizer service.
## Features
- Enterprise-grade fine-tuning capabilities
- Support for LoRA and SFT fine-tuning
- Integration with NVIDIA's NeMo Customizer service
- Support for various NVIDIA-optimized models
- Efficient training with NVIDIA hardware acceleration
## Usage
To use NVIDIA NEMO in your Llama Stack project, follow these steps:
1. Configure your Llama Stack project to use this provider.
2. Set up your NVIDIA API credentials.
3. Kick off a fine-tuning job using the Llama Stack post_training API.
## Setup
You'll need to set the following environment variables:
[TorchTune](https://github.com/pytorch/torchtune) is an inline post training provider for Llama Stack. It provides a simple and efficient way to fine-tune language models using PyTorch.
## Features
- Simple access through the post_training API
- Fully integrated with Llama Stack
- GPU support and single device capabilities.
- Support for LoRA
## Usage
To use TorchTune in your Llama Stack project, follow these steps:
1. Configure your Llama Stack project to use this provider.
2. Kick off a fine-tuning job using the Llama Stack post_training API.
## Setup
You can access the TorchTune trainer by writing your own yaml pointing to the provider:
```yaml
post_training:
- provider_id: torchtune
  provider_type: inline::torchtune
  config: {}
```
You can then build and run your own stack with this provider.
## Run Training
You can access the provider and the `supervised_fine_tune` method via the post_training API:
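For example, a hedged sketch of kicking off a job from the client (argument names and config fields here are indicative only and may differ between client versions; check your client's reference docs):
```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

job = client.post_training.supervised_fine_tune(
    job_uuid="my-sft-job",
    model="meta-llama/Llama-3.2-3B-Instruct",
    training_config={
        # illustrative training settings
        "n_epochs": 1,
        "data_config": {"dataset_id": "my-dataset", "batch_size": 1, "shuffle": False},
        "optimizer_config": {"optimizer_type": "adamw", "lr": 1e-5, "weight_decay": 0.01, "num_warmup_steps": 0},
    },
    algorithm_config={"type": "LoRA", "rank": 8, "alpha": 16},  # illustrative LoRA settings
    hyperparam_search_config={},
    logger_config={},
    checkpoint_dir="",
)
print(job.job_uuid)  # returned before training completes
```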