fix security update

fix install
add requirements
2025-06-03 20:07:06 +02:00 · 2025-06-02 03:02:25 +02:00 · 2025-06-02 03:01:15 +02:00 · 2025-06-02 02:54:55 +02:00 · 2025-06-02 02:49:54 +02:00 · 2025-06-02 02:49:45 +02:00
961 changed files with 299640 additions and 69715 deletions
--- a/.coveragerc
+++ b/.coveragerc
@ -0,0 +1,6 @@
+[run]
+omit =
+    */tests/*
+    */llama_stack/providers/*
+    */llama_stack/templates/*
+    .venv/*
--- a/.cursor/rules/general.mdc
+++ b/.cursor/rules/general.mdc
@ -1,9 +0,0 @@
---
-description: General rules always applicable across the project
-globs:
-alwaysApply: true
---
-# Style
-
- Comments must add value to code. Don't write filler comments explaining what you are doing next; they just add noise.
- Add a comment to clarify surprising behavior which would not be obvious. Good variable naming and clear code organization is more important.
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -2,4 +2,4 @@

 # These owners will be the default owners for everything in
 # the repo. Unless a later match takes precedence,
-* @ashwinb @yanxi0830 @hardikjshah @dltn @raghotham @dineshyv @vladimirivic @sixianyi0721 @ehhuang @terrytangyuan @SLR722
+* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@ -1,10 +1,8 @@
 # What does this PR do?
-[Provide a short summary of what this PR does and why. Link to relevant issues if applicable.]
+<!-- Provide a short summary of what this PR does and why. Link to relevant issues if applicable. -->

-[//]: # (If resolving an issue, uncomment and update the line below)
-[//]: # (Closes #[issue-number])
+<!-- If resolving an issue, uncomment and update the line below -->
+<!-- Closes #[issue-number] -->

 ## Test Plan
-[Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.*]
-
-[//]: # (## Documentation)
+<!-- Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.* -->
--- a/.github/TRIAGERS.md
+++ b/.github/TRIAGERS.md
@ -0,0 +1,2 @@
+# This file documents Triage members in the Llama Stack community
+ @bbrowning @booxter @franciscojavierarceo @leseb
--- a/.github/actions/setup-ollama/action.yml
+++ b/.github/actions/setup-ollama/action.yml
@ -0,0 +1,31 @@
+name: Setup Ollama
+description: Install and start Ollama, then pull the requested models
+inputs:
+  models:
+    description: Comma-separated list of models to pull
+    default: "llama3.2:3b-instruct-fp16,all-minilm:latest"
+runs:
+  using: "composite"
+  steps:
+    - name: Install and start Ollama
+      shell: bash
+      run: |
+        # the ollama installer also starts the ollama service
+        curl -fsSL https://ollama.com/install.sh | sh
+
+    # Do NOT cache models - pulling the cache is actually slower than just pulling the model.
+    # It takes ~45 seconds to pull the models from the cache and unpack it, but only 30 seconds to
+    # pull them directly.
+    # Maybe this is because the cache is being pulled at the same time by all the matrix jobs?
+    - name: Pull requested models
+      if: inputs.models != ''
+      shell: bash
+      env:
+        # Hand the input to the script through the environment instead of expanding the
+        # expression inside the script text, so its value is never parsed as shell code.
+        MODELS: ${{ inputs.models }}
+      run: |
+        # Commas separate the model names; turn them into spaces so the loop sees one word per model.
+        for model in $(echo "$MODELS" | tr ',' ' '); do
+          ollama pull "$model"
+        done
--- a/.github/actions/setup-runner/action.yml
+++ b/.github/actions/setup-runner/action.yml
@ -0,0 +1,22 @@
+name: Setup runner
+description: Prepare a runner for the tests (install uv, python, project dependencies, etc.)
+runs:
+  using: "composite"
+  steps:
+    - name: Install uv
+      uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
+      with:
+        python-version: "3.10"  # quoted: a bare 3.10 would be read as the float 3.1
+        activate-environment: true
+        version: "0.7.6"  # uv release to install; quoted like the other version strings
+
+    - name: Install dependencies
+      shell: bash
+      run: |
+        uv sync --all-groups
+        uv pip install ollama faiss-cpu
+        # always test against the latest version of the client
+        # TODO: this is not necessarily a good idea. we need to test against both published and latest
+        # to find out backwards compatibility issues.
+        uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
+        uv pip install -e .
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@ -5,4 +5,19 @@ updates:
  - package-ecosystem: "github-actions"
    directory: "/" # Will use the default workflow location of `.github/workflows`
    schedule:
-      interval: "daily"
+      interval: "weekly"
+      day: "saturday"
+    commit-message:
+      prefix: chore(github-deps)
+  - package-ecosystem: "uv"
+    directory: "/"
+    schedule:
+      interval: "weekly"
+      day: "saturday"
+    # ignore all non-security updates: https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file#open-pull-requests-limit
+    open-pull-requests-limit: 0
+    labels:
+      - type/dependencies
+      - python
+    commit-message:
+      prefix: chore(python-deps)
--- a/.github/workflows/Dockerfile
+++ b/.github/workflows/Dockerfile
@ -0,0 +1 @@
+FROM localhost:5000/distribution-kvant:dev
--- a/.github/workflows/ci-playground.yaml
+++ b/.github/workflows/ci-playground.yaml
@ -0,0 +1,73 @@
+name: Build and Push playground container
+run-name: Build and Push playground container
+on:
+  workflow_dispatch:
+  #schedule:
+  #  - cron: "0 10 * * *"
+  push:
+    branches:
+      - main
+      - kvant
+    tags:
+      - 'v*'
+  pull_request:
+    branches:
+      - main
+      - kvant
+env:
+  IMAGE: git.kvant.cloud/${{github.repository}}-playground
+jobs:
+  build-playground:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Set current time
+        uses: https://github.com/gerred/actions/current-time@master
+        id: current_time
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Login to git.kvant.cloud registry
+        uses: docker/login-action@v3
+        with:
+          registry: git.kvant.cloud
+          username: ${{ vars.ORG_PACKAGE_WRITER_USERNAME }}  # read from a configuration variable
+          password: ${{ secrets.ORG_PACKAGE_WRITER_TOKEN }}  # read from a stored secret
+
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          # list of Docker images to use as base name for tags
+          images: |
+            ${{env.IMAGE}}
+          # generate Docker tags based on the following events/attributes
+          tags: |
+            type=schedule
+            type=ref,event=branch
+            type=ref,event=pr
+            type=ref,event=tag
+            type=semver,pattern={{version}}
+
+      - name: Build and push to gitea registry
+        uses: docker/build-push-action@v6
+        with:
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          context: .
+          file: llama_stack/distribution/ui/Containerfile
+          provenance: mode=max
+          sbom: true
+          build-args: |
+            BUILD_DATE=${{ steps.current_time.outputs.time }}
+          cache-from: |
+            type=registry,ref=${{ env.IMAGE }}:buildcache
+            type=registry,ref=${{ env.IMAGE }}:${{ github.ref_name }}
+            type=registry,ref=${{ env.IMAGE }}:main
+          cache-to: type=registry,ref=${{ env.IMAGE }}:buildcache,mode=max,image-manifest=true
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@ -0,0 +1,98 @@
+name: Build and Push container
+run-name: Build and Push container
+on:
+  workflow_dispatch:
+  #schedule:
+  #  - cron: "0 10 * * *"
+  push:
+    branches:
+      - main
+      - kvant
+    tags:
+      - 'v*'
+  pull_request:
+    branches:
+      - main
+      - kvant
+env:
+  IMAGE: git.kvant.cloud/${{github.repository}}
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    services:
+      registry:
+        image: registry:2
+        ports:
+          - 5000:5000
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Set current time
+        uses: https://github.com/gerred/actions/current-time@master
+        id: current_time
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+        with:
+          driver-opts: network=host
+
+      - name: Login to git.kvant.cloud registry
+        uses: docker/login-action@v3
+        with:
+          registry: git.kvant.cloud
+          username: ${{ vars.ORG_PACKAGE_WRITER_USERNAME }}  # read from a configuration variable
+          password: ${{ secrets.ORG_PACKAGE_WRITER_TOKEN }}  # read from a stored secret
+
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          # list of Docker images to use as base name for tags
+          images: |
+            ${{env.IMAGE}}
+          # generate Docker tags based on the following events/attributes
+          tags: |
+            type=schedule
+            type=ref,event=branch
+            type=ref,event=pr
+            type=ref,event=tag
+            type=semver,pattern={{version}}
+
+      - name: Install uv
+        uses: https://github.com/astral-sh/setup-uv@v5
+        with:
+          # Install a specific version of uv.
+          version: "0.7.8"
+
+      - name: Build
+        env:
+          USE_COPY_NOT_MOUNT: "true"  # quoted: env values are strings, a bare true is a YAML boolean
+          LLAMA_STACK_DIR: "."  # current directory, i.e. the checked-out sources
+        run: |
+          uvx --from . llama stack build --template kvant --image-type container
+
+          # docker tag distribution-kvant:dev ${{env.IMAGE}}:kvant
+          # docker push ${{env.IMAGE}}:kvant
+
+          docker tag distribution-kvant:dev localhost:5000/distribution-kvant:dev
+          docker push localhost:5000/distribution-kvant:dev
+
+      - name: Build and push to gitea registry
+        uses: docker/build-push-action@v6
+        with:
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          context: .github/workflows
+          provenance: mode=max
+          sbom: true
+          build-args: |
+            BUILD_DATE=${{ steps.current_time.outputs.time }}
+          cache-from: |
+            type=registry,ref=${{ env.IMAGE }}:buildcache
+            type=registry,ref=${{ env.IMAGE }}:${{ github.ref_name }}
+            type=registry,ref=${{ env.IMAGE }}:main
+          cache-to: type=registry,ref=${{ env.IMAGE }}:buildcache,mode=max,image-manifest=true
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@ -1,29 +0,0 @@
-name: Pre-commit
-
-on:
-  pull_request:
-  push:
-    branches: [main]
-
-jobs:
-  pre-commit:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-          cache: pip
-          cache-dependency-path: |
-            **/requirements*.txt
-            .pre-commit-config.yaml
-
-      - uses: pre-commit/action@v3.0.1
-
-      - name: Verify if there are any diff files after pre-commit
-        run: |
-          git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1)
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@ -1,36 +0,0 @@
-name: Unit Tests
-
-on:
-  pull_request:
-    branches: [ main ]
-  workflow_dispatch:
-
-jobs:
-  unit-tests:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.10.16'
-
-      - uses: astral-sh/setup-uv@v5
-        with:
-          python-version: '3.10.16'
-          enable-cache: false
-
-      - name: Run unit tests
-        run: |
-          uv run -p 3.10.16 --with-editable . --with-editable ".[dev]" --with-editable ".[unit]" pytest --cov=llama_stack -s -v tests/unit/ --junitxml=pytest-report.xml
-
-      - name: Upload test results
-        if: always()
-        uses: actions/upload-artifact@v4
-        with:
-          name: test-results
-          path: |
-            .pytest_cache/
-            pytest-report.xml
-          retention-days: 7
--- a/.github/workflows_upstream/changelog.yml
+++ b/.github/workflows_upstream/changelog.yml
@ -0,0 +1,29 @@
+name: Update Changelog
+
+on:
+  release:
+    types: [published, unpublished, created, edited, deleted, released]
+
+permissions:
+  contents: read
+
+jobs:
+  generate_changelog:
+    name: Generate changelog
+    permissions:
+      contents: write  # for peter-evans/create-pull-request to create branch
+      pull-requests: write  # for peter-evans/create-pull-request to create a PR
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          ref: main
+          fetch-depth: 0
+      - run: |
+          python ./scripts/gen-changelog.py
+      - uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # v7.0.8
+        with:
+          title: 'docs: update CHANGELOG.md for ${{ github.ref_name }}'
+          commit-message: 'docs: update CHANGELOG.md for ${{ github.ref_name }}'
+          branch: create-pull-request/changelog
+          signoff: true
--- a/.github/workflows_upstream/gha_workflow_llama_stack_tests.yml
+++ b/.github/workflows_upstream/gha_workflow_llama_stack_tests.yml
@ -140,7 +140,7 @@ jobs:
      #######################
      - name: "Checkout 'meta-llama/llama-stack' repository"
        id: checkout_repo
-        uses: actions/checkout@v4
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          ref: ${{ inputs.branch }}

@ -302,7 +302,7 @@ jobs:
      - name: "PR - Test Summary"
        id: pr_test_summary_create
        if: github.event_name == 'pull_request_target'
-        uses: test-summary/action@v2
+        uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86 # v2.4
        with:
          paths: "${{ github.workspace }}/merged-test-results.xml"
          output: test-summary.md
@ -310,7 +310,7 @@ jobs:
      - name: "PR - Upload Test Summary"
        id: pr_test_summary_upload
        if: github.event_name == 'pull_request_target'
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
          name: test-summary
          path: test-summary.md
@ -320,7 +320,7 @@ jobs:
      - name: "PR - Update comment"
        id: pr_update_comment
        if: github.event_name == 'pull_request_target'
-        uses: thollander/actions-comment-pull-request@v3
+        uses: thollander/actions-comment-pull-request@24bffb9b452ba05a4f3f77933840a6a841d1b32b # v3.0.1
        with:
          filePath: test-summary.md

@ -350,6 +350,6 @@ jobs:
      - name: "Manual - Test Summary"
        id: manual_test_summary
        if: always() && github.event_name == 'workflow_dispatch'
-        uses: test-summary/action@v2
+        uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86 # v2.4
        with:
          paths: "${{ github.workspace }}/merged-test-results.xml"
--- a/.github/workflows_upstream/install-script-ci.yml
+++ b/.github/workflows_upstream/install-script-ci.yml
@ -0,0 +1,26 @@
+name: Installer CI
+
+on:
+  pull_request:
+    paths:
+      - 'install.sh'
+  push:
+    paths:
+      - 'install.sh'
+  schedule:
+    - cron: '0 2 * * *'  # every day at 02:00 UTC
+
+jobs:
+  lint:  # static analysis of install.sh with ShellCheck
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - name: Run ShellCheck on install.sh
+        run: shellcheck install.sh
+  smoke-test:
+    needs: lint  # the installer is only executed after the lint job has succeeded
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - name: Run installer end-to-end
+        run: ./install.sh
--- a/.github/workflows_upstream/integration-auth-tests.yml
+++ b/.github/workflows_upstream/integration-auth-tests.yml
@ -0,0 +1,132 @@
+name: Integration Auth Tests
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+    paths:
+      - 'distributions/**'
+      - 'llama_stack/**'
+      - 'tests/integration/**'
+      - 'uv.lock'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - '.github/workflows/integration-auth-tests.yml' # This workflow
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  test-matrix:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        auth-provider: [oauth2_token]
+      fail-fast: false # we want to run all tests regardless of failure
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Install dependencies
+        uses: ./.github/actions/setup-runner
+
+      - name: Build Llama Stack
+        run: |
+          llama stack build --template ollama --image-type venv
+
+      - name: Install minikube
+        if: ${{ matrix.auth-provider == 'oauth2_token' }}  # same guard as the steps that use minikube
+        uses: medyagh/setup-minikube@cea33675329b799adccc9526aa5daccc26cd5052 # v0.0.19
+
+      - name: Start minikube
+        if: ${{ matrix.auth-provider == 'oauth2_token' }}
+        run: |
+          minikube start
+          kubectl get pods -A
+
+      - name: Configure Kube Auth
+        if: ${{ matrix.auth-provider == 'oauth2_token' }}
+        run: |
+          kubectl create namespace llama-stack
+          kubectl create serviceaccount llama-stack-auth -n llama-stack
+          kubectl create rolebinding llama-stack-auth-rolebinding --clusterrole=admin --serviceaccount=llama-stack:llama-stack-auth -n llama-stack
+          kubectl create token llama-stack-auth -n llama-stack > llama-stack-auth-token
+          cat <<EOF | kubectl apply -f -
+          apiVersion: rbac.authorization.k8s.io/v1
+          kind: ClusterRole
+          metadata:
+            name: allow-anonymous-openid
+          rules:
+          - nonResourceURLs: ["/openid/v1/jwks"]
+            verbs: ["get"]
+          ---
+          apiVersion: rbac.authorization.k8s.io/v1
+          kind: ClusterRoleBinding
+          metadata:
+            name: allow-anonymous-openid
+          roleRef:
+            apiGroup: rbac.authorization.k8s.io
+            kind: ClusterRole
+            name: allow-anonymous-openid
+          subjects:
+          - kind: User
+            name: system:anonymous
+            apiGroup: rbac.authorization.k8s.io
+          EOF
+
+      - name: Set Kubernetes Config
+        if: ${{ matrix.auth-provider == 'oauth2_token' }}
+        run: |
+          echo "KUBERNETES_API_SERVER_URL=$(kubectl get --raw /.well-known/openid-configuration| jq -r .jwks_uri)" >> $GITHUB_ENV
+          echo "KUBERNETES_CA_CERT_PATH=$(kubectl config view --minify -o jsonpath='{.clusters[0].cluster.certificate-authority}')" >> $GITHUB_ENV
+          echo "KUBERNETES_ISSUER=$(kubectl get --raw /.well-known/openid-configuration| jq -r .issuer)" >> $GITHUB_ENV
+          echo "KUBERNETES_AUDIENCE=$(kubectl create token llama-stack-auth -n llama-stack --duration=1h | cut -d. -f2 | base64 -d | jq -r '.aud[0]')" >> $GITHUB_ENV
+
+      - name: Set Kube Auth Config and run server
+        env:
+          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
+        if: ${{ matrix.auth-provider == 'oauth2_token' }}
+        run: |
+          run_dir=$(mktemp -d)
+          cat <<'EOF' > $run_dir/run.yaml
+          version: '2'
+          image_name: kube
+          apis: []
+          providers: {}
+          server:
+            port: 8321
+          EOF
+          yq eval '.server.auth = {"provider_type": "${{ matrix.auth-provider }}"}' -i $run_dir/run.yaml
+          yq eval '.server.auth.config = {"tls_cafile": "${{ env.KUBERNETES_CA_CERT_PATH }}", "issuer": "${{ env.KUBERNETES_ISSUER }}", "audience": "${{ env.KUBERNETES_AUDIENCE }}"}' -i $run_dir/run.yaml
+          yq eval '.server.auth.config.jwks = {"uri": "${{ env.KUBERNETES_API_SERVER_URL }}"}' -i $run_dir/run.yaml
+          cat $run_dir/run.yaml
+
+          nohup uv run llama stack run $run_dir/run.yaml --image-type venv > server.log 2>&1 &
+
+      - name: Wait for Llama Stack server to be ready
+        run: |
+          echo "Waiting for Llama Stack server..."
+          for i in {1..30}; do
+            if curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://localhost:8321/v1/health | grep -q "OK"; then
+              echo "Llama Stack server is up!"
+              if grep -q "Enabling authentication with provider: ${{ matrix.auth-provider }}" server.log; then
+                echo "Llama Stack server is configured to use ${{ matrix.auth-provider }} auth"
+                exit 0
+              else
+                echo "Llama Stack server is not configured to use ${{ matrix.auth-provider }} auth"
+                cat server.log
+                exit 1
+              fi
+            fi
+            sleep 1
+          done
+          echo "Llama Stack server failed to start"
+          cat server.log
+          exit 1
+
+      - name: Test auth
+        run: |
+          curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers|jq
--- a/.github/workflows_upstream/integration-tests.yml
+++ b/.github/workflows_upstream/integration-tests.yml
@ -0,0 +1,116 @@
+name: Integration Tests
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+    paths:
+      - 'llama_stack/**'
+      - 'tests/integration/**'
+      - 'uv.lock'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - '.github/workflows/integration-tests.yml' # This workflow
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  test-matrix:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        # Listing tests manually since some of them currently fail
+        # TODO: generate matrix list from tests/integration when fixed
+        test-type: [agents, inference, datasets, inspect, scoring, post_training, providers, tool_runtime]
+        client-type: [library, http]
+      fail-fast: false # we want to run all tests regardless of failure
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Install dependencies
+        uses: ./.github/actions/setup-runner
+
+      - name: Setup ollama
+        uses: ./.github/actions/setup-ollama
+
+      - name: Build Llama Stack
+        run: |
+          llama stack build --template ollama --image-type venv
+
+      - name: Start Llama Stack server in background
+        if: matrix.client-type == 'http'
+        env:
+          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
+        run: |
+          LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv &
+
+      - name: Wait for Llama Stack server to be ready
+        if: matrix.client-type == 'http'
+        run: |
+          echo "Waiting for Llama Stack server..."
+          for i in {1..30}; do
+            if curl -s http://localhost:8321/v1/health | grep -q "OK"; then
+              echo "Llama Stack server is up!"
+              exit 0
+            fi
+            sleep 1
+          done
+          echo "Llama Stack server failed to start"
+          cat server.log
+          exit 1
+
+      - name: Verify Ollama status is OK
+        if: matrix.client-type == 'http'
+        run: |
+          echo "Verifying Ollama status..."
+          ollama_status=$(curl -s -L http://127.0.0.1:8321/v1/providers/ollama|jq --raw-output .health.status)
+          echo "Ollama status: $ollama_status"
+          if [ "$ollama_status" != "OK" ]; then
+            echo "Ollama health check failed"
+            exit 1
+          fi
+
+      - name: Check Storage and Memory Available Before Tests
+        if: ${{ always() }}
+        run: |
+          free -h
+          df -h
+
+      - name: Run Integration Tests
+        env:
+          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
+        run: |
+          if [ "${{ matrix.client-type }}" == "library" ]; then
+            stack_config="ollama"
+          else
+            stack_config="http://localhost:8321"
+          fi
+          uv run pytest -s -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \
+            -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \
+            --text-model="meta-llama/Llama-3.2-3B-Instruct" \
+            --embedding-model=all-MiniLM-L6-v2
+
+      - name: Check Storage and Memory Available After Tests
+        if: ${{ always() }}
+        run: |
+          free -h
+          df -h
+
+      - name: Write ollama logs to file
+        if: ${{ always() }}
+        run: |
+          sudo journalctl -u ollama.service > ollama.log
+
+      - name: Upload all logs to artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}
+          path: |
+            *.log
+          retention-days: 1
--- a/.github/workflows_upstream/pre-commit.yml
+++ b/.github/workflows_upstream/pre-commit.yml
@ -0,0 +1,45 @@
+name: Pre-commit
+
+on:
+  pull_request:
+  push:
+    branches: [main]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Set up Python
+        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
+        with:
+          python-version: '3.11'
+          cache: pip
+          cache-dependency-path: |
+            **/requirements*.txt
+            .pre-commit-config.yaml
+
+      - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
+        env:
+          SKIP: no-commit-to-branch
+          RUFF_OUTPUT_FORMAT: github
+
+      - name: Verify if there are any diff files after pre-commit
+        run: |
+          git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1)
+
+      - name: Verify if there are any new files after pre-commit
+        run: |
+          unstaged_files=$(git ls-files --others --exclude-standard)
+          if [ -n "$unstaged_files" ]; then
+            echo "There are uncommitted new files, run pre-commit locally and commit again"
+            echo "$unstaged_files"
+            exit 1
+          fi
--- a/.github/workflows_upstream/providers-build.yml
+++ b/.github/workflows_upstream/providers-build.yml
@ -0,0 +1,147 @@
+name: Test Llama Stack Build
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - 'llama_stack/cli/stack/build.py'
+      - 'llama_stack/cli/stack/_build.py'
+      - 'llama_stack/distribution/build.*'
+      - 'llama_stack/distribution/*.sh'
+      - '.github/workflows/providers-build.yml'
+  pull_request:
+    paths:
+      - 'llama_stack/cli/stack/build.py'
+      - 'llama_stack/cli/stack/_build.py'
+      - 'llama_stack/distribution/build.*'
+      - 'llama_stack/distribution/*.sh'
+      - '.github/workflows/providers-build.yml'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  generate-matrix:
+    runs-on: ubuntu-latest
+    outputs:
+      templates: ${{ steps.set-matrix.outputs.templates }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Generate Template List
+        id: set-matrix
+        run: |
+          templates=$(ls llama_stack/templates/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
+          echo "templates=$templates" >> "$GITHUB_OUTPUT"
+
+  build:
+    needs: generate-matrix
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        template: ${{ fromJson(needs.generate-matrix.outputs.templates) }}
+        image-type: [venv, container]
+      fail-fast: false # We want to run all jobs even if some fail
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Install dependencies
+        uses: ./.github/actions/setup-runner
+
+      - name: Print build dependencies
+        run: |
+          uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test --print-deps-only
+
+      - name: Run Llama Stack Build
+        run: |
+          # USE_COPY_NOT_MOUNT is set to true since mounting is not supported by docker buildx, we use COPY instead
+          # LLAMA_STACK_DIR is set to the current directory so we are building from the source
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test
+
+      - name: Print dependencies in the image
+        if: matrix.image-type == 'venv'
+        run: |
+          uv pip list
+
+  build-single-provider:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Install dependencies
+        uses: ./.github/actions/setup-runner
+
+      - name: Build a single provider
+        run: |
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --image-type venv --image-name test --providers inference=remote::ollama
+
+  build-custom-container-distribution:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Install dependencies
+        uses: ./.github/actions/setup-runner
+
+      - name: Build a custom container distribution
+        run: |  # rewrite the starter template to a container build named "test", then build it
+          yq -i '.image_type = "container"' llama_stack/templates/starter/build.yaml
+          yq -i '.image_name = "test"' llama_stack/templates/starter/build.yaml
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config llama_stack/templates/starter/build.yaml
+
+      - name: Inspect the container image entrypoint
+        run: |
+          IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
+          entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
+          echo "Entrypoint: $entrypoint"
+          if [ "$entrypoint" != "[python -m llama_stack.distribution.server.server --config /app/run.yaml]" ]; then
+            echo "Entrypoint is not correct"
+            exit 1
+          fi
+
+  build-ubi9-container-distribution:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Install dependencies
+        uses: ./.github/actions/setup-runner
+
+      - name: Pin template to UBI9 base
+        run: |
+          yq -i '
+            .image_type    = "container" |
+            .image_name    = "ubi9-test" |
+            .distribution_spec.container_image = "registry.access.redhat.com/ubi9:latest"
+          ' llama_stack/templates/starter/build.yaml
+
+      - name: Build dev container (UBI9)
+        env:
+          USE_COPY_NOT_MOUNT: "true"
+          LLAMA_STACK_DIR: "."
+        run: |
+          uv run llama stack build --config llama_stack/templates/starter/build.yaml
+
+      - name: Inspect UBI9 image
+        run: |
+          IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
+          entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
+          echo "Entrypoint: $entrypoint"
+          if [ "$entrypoint" != "[python -m llama_stack.distribution.server.server --config /app/run.yaml]" ]; then
+            echo "Entrypoint is not correct"
+            exit 1
+          fi
+
+          echo "Checking /etc/os-release in $IMAGE_ID"
+          docker run --rm --entrypoint sh "$IMAGE_ID" -c \
+              'source /etc/os-release && echo "$ID"' \
+              | grep -qE '^(rhel|ubi)$' \
+              || { echo "Base image is not UBI 9!"; exit 1; }
--- a/.github/workflows_upstream/semantic-pr.yml
+++ b/.github/workflows_upstream/semantic-pr.yml
@ -8,6 +8,10 @@ on:
      - reopened
      - synchronize

+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 permissions:
  contents: read

@ -16,6 +20,6 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Check PR Title's semantic conformance
-        uses: amannn/action-semantic-pull-request@v5
+        uses: amannn/action-semantic-pull-request@0723387faaf9b38adef4775cd42cfd5155ed6017 # v5.5.3
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows_upstream/stale_bot.yml
+++ b/.github/workflows_upstream/stale_bot.yml
@ -0,0 +1,45 @@
+name: Close stale issues and PRs
+
+on:
+  schedule:
+    - cron: '0 0 * * *' # every day at midnight
+
+env:
+  LC_ALL: en_US.UTF-8
+
+defaults:
+  run:
+    shell: bash
+
+permissions:
+  contents: read
+
+jobs:
+  # Labels issues and pull requests as stale after 60 days without activity and
+  # closes them 30 days later, posting the messages configured below.
+  stale:
+    # Write access to issues and pull requests is granted at job level only.
+    permissions:
+      issues: write
+      pull-requests: write
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stale Action
+        uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
+        with:
+          # Issue settings
+          stale-issue-label: 'stale'
+          stale-issue-message: >
+            This issue has been automatically marked as stale because it has not had activity within 60 days.
+            It will be automatically closed if no further activity occurs within 30 days.
+          close-issue-message: >
+            This issue has been automatically closed due to inactivity.
+            Please feel free to reopen if you feel it is still relevant!
+          days-before-issue-stale: 60
+          days-before-issue-close: 30
+          # Pull request settings (same thresholds as issues)
+          stale-pr-label: 'stale'
+          stale-pr-message: >
+            This pull request has been automatically marked as stale because it has not had activity within 60 days.
+            It will be automatically closed if no further activity occurs within 30 days.
+          close-pr-message: >
+            This pull request has been automatically closed due to inactivity.
+            Please feel free to reopen if you intend to continue working on it!
+          days-before-pr-stale: 60
+          days-before-pr-close: 30
+          operations-per-run: 300
--- a/.github/workflows_upstream/test-external-providers.yml
+++ b/.github/workflows_upstream/test-external-providers.yml
@ -0,0 +1,71 @@
+name: Test External Providers
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+    paths:
+      - 'llama_stack/**'
+      - 'tests/integration/**'
+      - 'uv.lock'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - '.github/workflows/test-external-providers.yml' # This workflow
+
+jobs:
+  # Builds a distribution that uses an out-of-tree copy of the Ollama inference provider,
+  # starts the server, and checks the server log for the provider being loaded.
+  test-external-providers:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        image-type: [venv]
+        # We don't do container yet, it's tricky to install a package from the host into the
+        # container and point 'uv pip install' to the correct path...
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Install dependencies
+        uses: ./.github/actions/setup-runner
+
+      # Writes the matrix image type into the custom distro config and prints the result.
+      - name: Apply image type to config file
+        run: |
+          yq -i '.image_type = "${{ matrix.image-type }}"' tests/external-provider/llama-stack-provider-ollama/custom-distro.yaml
+          cat tests/external-provider/llama-stack-provider-ollama/custom-distro.yaml
+
+      # Copies the in-tree Ollama provider sources into the external provider package layout.
+      - name: Setup directory for Ollama custom provider
+        run: |
+          mkdir -p tests/external-provider/llama-stack-provider-ollama/src/
+          cp -a llama_stack/providers/remote/inference/ollama/ tests/external-provider/llama-stack-provider-ollama/src/llama_stack_provider_ollama
+
+      - name: Create provider configuration
+        run: |
+          mkdir -p /home/runner/.llama/providers.d/remote/inference
+          cp tests/external-provider/llama-stack-provider-ollama/custom_ollama.yaml /home/runner/.llama/providers.d/remote/inference/custom_ollama.yaml
+
+      - name: Build distro from config file
+        run: |
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external-provider/llama-stack-provider-ollama/custom-distro.yaml
+
+      - name: Start Llama Stack server in background
+        # Bare expression on purpose: mixing ${{ }} with literal text produces a non-empty
+        # string, which GitHub Actions always treats as true.
+        if: matrix.image-type == 'venv'
+        env:
+          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
+        run: |
+          uv run pip list
+          nohup uv run --active llama stack run tests/external-provider/llama-stack-provider-ollama/run.yaml --image-type ${{ matrix.image-type }} > server.log 2>&1 &
+
+      # Polls server.log once per second, up to 30 times, for the line reporting that the
+      # custom provider was loaded; dumps the log and fails if it never appears.
+      - name: Wait for Llama Stack server to be ready
+        run: |
+          for i in {1..30}; do
+            if ! grep -q "remote::custom_ollama from /home/runner/.llama/providers.d/remote/inference/custom_ollama.yaml" server.log; then
+              echo "Waiting for Llama Stack server to load the provider..."
+              sleep 1
+            else
+              echo "Provider loaded"
+              exit 0
+            fi
+          done
+          echo "Provider failed to load"
+          cat server.log
+          exit 1
--- a/.github/workflows_upstream/tests.yml
+++ b/.github/workflows_upstream/tests.yml
@ -20,7 +20,7 @@ jobs:
      matrix:
        provider: [fireworks, together]
    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          ref: ${{ github.event.inputs.commit_sha }}

--- a/.github/workflows_upstream/unit-tests.yml
+++ b/.github/workflows_upstream/unit-tests.yml
@ -0,0 +1,52 @@
+name: Unit Tests
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+    paths:
+      - 'llama_stack/**'
+      - 'tests/unit/**'
+      - 'uv.lock'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - '.github/workflows/unit-tests.yml' # This workflow
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  # Runs the unit test script once per Python version in the matrix and uploads
+  # the per-version test report and coverage output as an artifact.
+  unit-tests:
+    runs-on: ubuntu-latest
+    strategy:
+      # Let the remaining Python versions finish even if one of them fails.
+      fail-fast: false
+      matrix:
+        python:
+          - "3.10"
+          - "3.11"
+          - "3.12"
+          - "3.13"
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Install dependencies
+        uses: ./.github/actions/setup-runner
+
+      # Report and coverage output names include the Python version so matrix runs do not collide.
+      - name: Run unit tests
+        run: |
+          PYTHON_VERSION=${{ matrix.python }} ./scripts/unit-tests.sh --cov=llama_stack --junitxml=pytest-report-${{ matrix.python }}.xml --cov-report=html:htmlcov-${{ matrix.python }}
+
+      # always() keeps the upload running when the test step fails.
+      - name: Upload test results
+        if: always()
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          name: test-results-${{ matrix.python }}
+          path: |
+            .pytest_cache/
+            pytest-report-${{ matrix.python }}.xml
+            htmlcov-${{ matrix.python }}/
+          retention-days: 7
--- a/.github/workflows_upstream/update-readthedocs.yml
+++ b/.github/workflows_upstream/update-readthedocs.yml
@ -14,6 +14,8 @@ on:
      - 'docs/**'
      - 'pyproject.toml'
      - '.github/workflows/update-readthedocs.yml'
+    tags:
+      - '*'
  pull_request:
    branches:
      - main
@ -22,6 +24,10 @@ on:
      - 'pyproject.toml'
      - '.github/workflows/update-readthedocs.yml'

+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 jobs:
  update-readthedocs:
    runs-on: ubuntu-latest
@ -29,18 +35,10 @@ jobs:
      TOKEN: ${{ secrets.READTHEDOCS_TOKEN }}
    steps:
      - name: Checkout repository
-        uses: actions/checkout@v4
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-
-      - name: Install the latest version of uv
-        uses: astral-sh/setup-uv@v5
-
-      - name: Sync with uv
-        run: uv sync --extra docs
+      - name: Install dependencies
+        uses: ./.github/actions/setup-runner

      - name: Build HTML
        run: |
@ -57,7 +55,10 @@ jobs:

          response=$(curl -X POST \
            -H "Content-Type: application/json" \
-            -d "{\"token\": \"$TOKEN\"}" \
+            -d "{
+              \"token\": \"$TOKEN\",
+              \"version\": \"$GITHUB_REF_NAME\"
+            }" \
            https://readthedocs.org/api/v2/webhook/llama-stack/289768/)

          echo "Response: $response"
--- a/.gitignore
+++ b/.gitignore
@ -6,6 +6,7 @@ dev_requirements.txt
 build
 .DS_Store
 llama_stack/configs/*
+.cursor/
 xcuserdata/
 *.hmap
 .DS_Store
@ -22,3 +23,5 @@ pyrightconfig.json
 venv/
 pytest-report.xml
 .coverage
+.python-version
+data
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -8,12 +8,25 @@ repos:
    rev: v5.0.0  # Latest stable version
    hooks:
    -   id: check-merge-conflict
+        args: ['--assume-in-merge']
    -   id: trailing-whitespace
        exclude: '\.py$'  # Exclude Python files as Ruff already handles them
    -   id: check-added-large-files
        args: ['--maxkb=1000']
    -   id: end-of-file-fixer
        exclude: '^(.*\.svg)$'
+    -   id: no-commit-to-branch
+    -   id: check-yaml
+        args: ["--unsafe"]
+    -   id: detect-private-key
+    -   id: requirements-txt-fixer
+    -   id: mixed-line-ending
+        args: [--fix=lf] # Forces to replace line ending by LF (line feed)
+    -   id: check-executables-have-shebangs
+    -   id: check-json
+    -   id: check-shebang-scripts-are-executable
+    -   id: check-symlinks
+    -   id: check-toml

 -   repo: https://github.com/Lucas-C/pre-commit-hooks
    rev: v1.5.4
@ -40,7 +53,7 @@ repos:
        - black==24.3.0

 -   repo: https://github.com/astral-sh/uv-pre-commit
-    rev: 0.6.3
+    rev: 0.7.8
    hooks:
    -   id: uv-lock
    -   id: uv-export
@ -48,6 +61,7 @@ repos:
            "--frozen",
            "--no-hashes",
            "--no-emit-project",
+            "--no-default-groups",
            "--output-file=requirements.txt"
        ]

@ -75,12 +89,29 @@ repos:
      - id: distro-codegen
        name: Distribution Template Codegen
        additional_dependencies:
-          - uv==0.6.0
-        entry: uv run --extra codegen python -m llama_stack.scripts.distro_codegen
+          - uv==0.7.8
+        entry: uv run --group codegen ./scripts/distro_codegen.py
        language: python
        pass_filenames: false
        require_serial: true
        files: ^llama_stack/templates/.*$|^llama_stack/providers/.*/inference/.*/models\.py$
+      - id: openapi-codegen
+        name: API Spec Codegen
+        additional_dependencies:
+          - uv==0.7.8
+        entry: sh -c 'uv run ./docs/openapi_generator/run_openapi_generator.sh > /dev/null'
+        language: python
+        pass_filenames: false
+        require_serial: true
+        files: ^llama_stack/apis/|^docs/openapi_generator/
+      - id: check-workflows-use-hashes
+        name: Check GitHub Actions use SHA-pinned actions
+        entry: ./scripts/check-workflows-use-hashes.sh
+        language: system
+        pass_filenames: false
+        require_serial: true
+        always_run: true
+        files: ^\.github/workflows/.*\.ya?ml$

 ci:
    autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks
--- a/.python-version
+++ b/.python-version
@ -1 +0,0 @@
-3.10
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@ -5,28 +5,21 @@
 # Required
 version: 2

+# Build documentation in the "docs/" directory with Sphinx
+sphinx:
+  configuration: docs/source/conf.py
+
 # Set the OS, Python version and other tools you might need
 build:
  os: ubuntu-22.04
  tools:
    python: "3.12"
-    # You can also specify other tool versions:
-    # nodejs: "19"
-    # rust: "1.64"
-    # golang: "1.19"
-
-# Build documentation in the "docs/" directory with Sphinx
-sphinx:
-  configuration: docs/source/conf.py
-
-# Optionally build your docs in additional formats such as PDF and ePub
-# formats:
-#    - pdf
-#    - epub
-
-# Optional but recommended, declare the Python requirements required
-# to build your documentation
-# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
-python:
+  jobs:
+    pre_create_environment:
+      - asdf plugin add uv
+      - asdf install uv latest
+      - asdf global uv latest
+    create_environment:
+      - uv venv "${READTHEDOCS_VIRTUALENV_PATH}"
    install:
-   - requirements: docs/requirements.txt
+      - UV_PROJECT_ENVIRONMENT="${READTHEDOCS_VIRTUALENV_PATH}" uv sync --frozen --group docs
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,5 +1,183 @@
 # Changelog

+# v0.2.7
+Published on: 2025-05-16T20:38:10Z
+
+## Highlights
+
+This is a small update, but here are a couple of highlights:
+
+* feat: function tools in OpenAI Responses by @bbrowning in https://github.com/meta-llama/llama-stack/pull/2094, getting closer to ready. Streaming is the next missing piece.
+* feat: Adding support for customizing chunk context in RAG insertion and querying by @franciscojavierarceo in https://github.com/meta-llama/llama-stack/pull/2134
+* feat: scaffolding for Llama Stack UI by @ehhuang in https://github.com/meta-llama/llama-stack/pull/2149, more to come in the coming releases.
+
+
+---
+
+# v0.2.6
+Published on: 2025-05-12T18:06:52Z
+
+
+
+---
+
+# v0.2.5
+Published on: 2025-05-04T20:16:49Z
+
+
+
+---
+
+# v0.2.4
+Published on: 2025-04-29T17:26:01Z
+
+## Highlights
+
+* One-liner to install and run Llama Stack yay! by @reluctantfuturist in https://github.com/meta-llama/llama-stack/pull/1383
+* support for NVIDIA NeMo datastore by @raspawar in https://github.com/meta-llama/llama-stack/pull/1852
+* (yuge!) Kubernetes authentication by @leseb in https://github.com/meta-llama/llama-stack/pull/1778
+* (yuge!) OpenAI Responses API by @bbrowning in https://github.com/meta-llama/llama-stack/pull/1989
+* add api.llama provider, llama-guard-4 model by @ashwinb in https://github.com/meta-llama/llama-stack/pull/2058
+
+
+---
+
+# v0.2.3
+Published on: 2025-04-25T22:46:21Z
+
+## Highlights
+
+* OpenAI compatible inference endpoints and client-SDK support. `client.chat.completions.create()` now works.
+* significant improvements and functionality added to the NVIDIA distribution
+* many improvements to the test verification suite.
+* new inference providers: Ramalama, IBM WatsonX
+* many improvements to the Playground UI
+
+
+---
+
+# v0.2.2
+Published on: 2025-04-13T01:19:49Z
+
+## Main changes
+
+- Bring Your Own Provider (@leseb) - use out-of-tree provider code to execute the distribution server
+- OpenAI compatible inference API in progress (@bbrowning)
+- Provider verifications (@ehhuang)
+- Many updates and fixes to playground
+- Several llama4 related fixes
+
+
+---
+
+# v0.2.1
+Published on: 2025-04-05T23:13:00Z
+
+
+
+---
+
+# v0.2.0
+Published on: 2025-04-05T19:04:29Z
+
+## Llama 4 Support
+
+Check out more at https://www.llama.com
+
+
+
+---
+
+# v0.1.9
+Published on: 2025-03-29T00:52:23Z
+
+### Build and Test Agents
+* Agents: Entire document context with attachments
+* RAG: Documentation with sqlite-vec faiss comparison
+* Getting started: Fixes to getting started notebook.
+
+### Agent Evals and Model Customization
+* (**New**) Post-training: Add nemo customizer
+
+### Better Engineering
+* Moved sqlite-vec to non-blocking calls
+* Don't return a payload on file delete
+
+
+
+---
+
+# v0.1.8
+Published on: 2025-03-24T01:28:50Z
+
+# v0.1.8 Release Notes
+
+### Build and Test Agents
+* Safety: Integrated NVIDIA as a safety provider.
+* VectorDB: Added Qdrant as an inline provider.
+* Agents: Added support for multiple tool groups in agents.
+* Agents: Simplified imports for Agents in client package
+
+
+### Agent Evals and Model Customization
+* Introduced DocVQA and IfEval benchmarks.
+
+### Deploying and Monitoring Agents
+* Introduced a Containerfile and image workflow for the Playground.
+* Implemented support for Bearer (API Key) authentication.
+* Added attribute-based access control for resources.
+* Fixes on docker deployments: use --pull always and standardized the default port to 8321
+* Deprecated: /v1/inspect/providers use /v1/providers/ instead
+
+### Better Engineering
+* Consolidated scripts under the ./scripts directory.
+* Addressed mypy violations in various modules.
+* Added Dependabot scans for Python dependencies.
+* Implemented a scheduled workflow to update the changelog automatically.
+* Enforced concurrency to reduce CI loads.
+
+
+### New Contributors
+* @cmodi-meta made their first contribution in https://github.com/meta-llama/llama-stack/pull/1650
+* @jeffmaury made their first contribution in https://github.com/meta-llama/llama-stack/pull/1671
+* @derekhiggins made their first contribution in https://github.com/meta-llama/llama-stack/pull/1698
+* @Bobbins228 made their first contribution in https://github.com/meta-llama/llama-stack/pull/1745
+
+**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.7...v0.1.8
+
+---
+
+# v0.1.7
+Published on: 2025-03-14T22:30:51Z
+
+## 0.1.7 Release Notes
+
+###  Build and Test Agents
+* Inference: ImageType is now refactored to LlamaStackImageType
+* Inference: Added tests to measure TTFT
+* Inference: Bring back usage metrics
+* Agents: Added endpoint for get agent, list agents and list sessions
+* Agents: Automated conversion of type hints in client tool for lite llm format
+* Agents: Deprecated ToolResponseMessage in agent.resume API
+* Added Provider API for listing and inspecting provider info
+
+### Agent Evals and Model Customization
+* Eval: Added new eval benchmarks Math 500 and BFCL v3
+
+### Deploying and Monitoring Agents
+* Telemetry: Fix tracing to work across coroutines
+
+###  Better Engineering
+* Display code coverage for unit tests
+* Updated call sites (inference, tool calls, agents) to move to async non blocking calls
+* Unit tests also run on Python 3.11, 3.12, and 3.13
+* Added ollama inference to Integration tests CI
+* Improved documentation across examples, testing, CLI, updated providers table
+
+
+
+
+---
+
 # v0.1.6
 Published on: 2025-03-08T04:35:08Z

--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -61,6 +61,7 @@ outlined on that page and do not file a public issue.

 We use [uv](https://github.com/astral-sh/uv) to manage python dependencies and virtual environments.
 You can install `uv` by following this [guide](https://docs.astral.sh/uv/getting-started/installation/).
+
 You can install the dependencies by running:

 ```bash
@ -70,17 +71,24 @@ uv pip install -e .
 source .venv/bin/activate
 ```

+> [!NOTE]
+> You can pin a specific version of Python to use for `uv` by adding a `.python-version` file in the root project directory.
+> Otherwise, `uv` will automatically select a Python version according to the `requires-python` section of the `pyproject.toml`.
+> For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/).
+
 Note that you can create a dotenv file `.env` that includes necessary environment variables:
 ```
 LLAMA_STACK_BASE_URL=http://localhost:8321
 LLAMA_STACK_CLIENT_LOG=debug
 LLAMA_STACK_PORT=8321
-LLAMA_STACK_CONFIG=
+LLAMA_STACK_CONFIG=<provider-name>
+TAVILY_SEARCH_API_KEY=
+BRAVE_SEARCH_API_KEY=
 ```

 And then use this dotenv file when running client SDK tests via the following:
 ```bash
-uv run --env-file .env -- pytest -v tests/api/inference/test_text_inference.py
+uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py --text-model=meta-llama/Llama-3.1-8B-Instruct
 ```

 ## Pre-commit Hooks
@ -102,6 +110,10 @@ uv run pre-commit run --all-files
 > [!CAUTION]
 > Before pushing your changes, make sure that the pre-commit hooks have passed successfully.

+## Running tests
+
+You can find the Llama Stack testing documentation [here](tests/README.md).
+
 ## Adding a new dependency to the project

 To add a new dependency to the project, you can use the `uv` command. For example, to add `foo` to the project, you can run:
@ -113,9 +125,20 @@ uv sync

 ## Coding Style

-* 4 spaces for indentation rather than tabs
-* 80 character line length
-* ...
+* Comments should provide meaningful insights into the code. Avoid filler comments that simply
+  describe the next step, as they create unnecessary clutter. The same goes for docstrings.
+* Prefer comments to clarify surprising behavior and/or relationships between parts of the code
+  rather than explain what the next line of code does.
+* When catching exceptions, prefer using a specific exception type rather than a broad catch-all
+  like `Exception`.
+* Error messages should be prefixed with "Failed to ..."
+* 4 spaces for indentation rather than tabs
+* When using `# noqa` to suppress a style or linter warning, include a comment explaining the
+  justification for bypassing the check.
+* When using `# type: ignore` to suppress a mypy warning, include a comment explaining the
+  justification for bypassing the check.
+* Don't use unicode characters in the codebase. ASCII-only is preferred for compatibility or
+  readability reasons.

 ## Common Tasks

@ -137,21 +160,18 @@ LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama

 ### Updating Provider Configurations

-If you have made changes to a provider's configuration in any form (introducing a new config key, or changing models, etc.), you should run `python llama_stack/scripts/distro_codegen.py` to re-generate various YAML files as well as the documentation. You should not change `docs/source/.../distributions/` files manually as they are auto-generated.
+If you have made changes to a provider's configuration in any form (introducing a new config key, or changing models, etc.), you should run `./scripts/distro_codegen.py` to re-generate various YAML files as well as the documentation. You should not change `docs/source/.../distributions/` files manually as they are auto-generated.

 ### Building the Documentation

 If you are making changes to the documentation at [https://llama-stack.readthedocs.io/en/latest/](https://llama-stack.readthedocs.io/en/latest/), you can use the following command to build the documentation and preview your changes. You will need [Sphinx](https://www.sphinx-doc.org/en/master/) and the readthedocs theme.

 ```bash
-cd llama-stack/docs
-uv sync --extra docs
-
 # This rebuilds the documentation pages.
-uv run make html
+uv run --group docs make -C docs/ html

 # This will start a local server (usually at http://127.0.0.1:8000) that automatically rebuilds and refreshes when you make changes to the documentation.
-uv run sphinx-autobuild source build/html --write-all
+uv run --group docs sphinx-autobuild docs/source docs/build/html --write-all
 ```

 ### Update API Documentation
@ -159,7 +179,6 @@ uv run sphinx-autobuild source build/html --write-all
 If you modify or add new API endpoints, update the API documentation accordingly. You can do this by running the following command:

 ```bash
-uv sync --extra dev
 uv run ./docs/openapi_generator/run_openapi_generator.sh
 ```

--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -1,8 +1,9 @@
 include pyproject.toml
-include distributions/dependencies.json
 include llama_stack/models/llama/llama3/tokenizer.model
+include llama_stack/models/llama/llama4/tokenizer.model
 include llama_stack/distribution/*.sh
 include llama_stack/cli/scripts/*.sh
 include llama_stack/templates/*/*.yaml
 include llama_stack/providers/tests/test_cases/inference/*.json
 include llama_stack/models/llama/*/*.md
+include llama_stack/tests/integration/*.jpg
--- a/README.md
+++ b/README.md
@ -3,9 +3,82 @@
 [![PyPI version](https://img.shields.io/pypi/v/llama_stack.svg)](https://pypi.org/project/llama_stack/)
 [![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-stack)](https://pypi.org/project/llama-stack/)
 [![License](https://img.shields.io/pypi/l/llama_stack.svg)](https://github.com/meta-llama/llama-stack/blob/main/LICENSE)
-[![Discord](https://img.shields.io/discord/1257833999603335178)](https://discord.gg/llama-stack)
+[![Discord](https://img.shields.io/discord/1257833999603335178?color=6A7EC2&logo=discord&logoColor=ffffff)](https://discord.gg/llama-stack)
+[![Unit Tests](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml?query=branch%3Amain)
+[![Integration Tests](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml?query=branch%3Amain)

-[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb)
+[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)
+
+### ✨🎉 Llama 4 Support  🎉✨
+We released [Version 0.2.0](https://github.com/meta-llama/llama-stack/releases/tag/v0.2.0) with support for the Llama 4 herd of models released by Meta.
+
+<details>
+
+<summary>👋 Click here to see how to run Llama 4 models on Llama Stack </summary>
+
+\
+*Note: you need an 8xH100 GPU host to run these models.*
+
+```bash
+pip install -U llama_stack
+
+MODEL="Llama-4-Scout-17B-16E-Instruct"
+# get meta url from llama.com
+llama model download --source meta --model-id $MODEL --meta-url <META_URL>
+
+# start a llama stack server
+INFERENCE_MODEL=meta-llama/$MODEL llama stack build --run --template meta-reference-gpu
+
+# install client to interact with the server
+pip install llama-stack-client
+```
+### CLI
+```bash
+# Run a chat completion
+llama-stack-client --endpoint http://localhost:8321 \
+inference chat-completion \
+--model-id meta-llama/$MODEL \
+--message "write a haiku for meta's llama 4 models"
+
+ChatCompletionResponse(
+    completion_message=CompletionMessage(content="Whispers in code born\nLlama's gentle, wise heartbeat\nFuture's soft unfold", role='assistant', stop_reason='end_of_turn', tool_calls=[]),
+    logprobs=None,
+    metrics=[Metric(metric='prompt_tokens', value=21.0, unit=None), Metric(metric='completion_tokens', value=28.0, unit=None), Metric(metric='total_tokens', value=49.0, unit=None)]
+)
+```
+### Python SDK
+```python
+from llama_stack_client import LlamaStackClient
+
+client = LlamaStackClient(base_url=f"http://localhost:8321")
+
+model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+prompt = "Write a haiku about coding"
+
+print(f"User> {prompt}")
+response = client.inference.chat_completion(
+    model_id=model_id,
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": prompt},
+    ],
+)
+print(f"Assistant> {response.completion_message.content}")
+```
+As more providers start supporting Llama 4, you can use them in Llama Stack as well. We are adding to the list. Stay tuned!
+
+
+</details>
+
+### 🚀 One-Line Installer 🚀
+
+To try Llama Stack locally, run:
+
+```bash
+curl -LsSf https://github.com/meta-llama/llama-stack/raw/main/install.sh | sh
+```
+
+### Overview

 Llama Stack standardizes the core building blocks that simplify AI application development. It codifies best practices across the Llama ecosystem. More specifically, it provides

@ -34,22 +107,30 @@ By reducing friction and complexity, Llama Stack empowers developers to focus on
 ### API Providers
 Here is a list of the various API providers and available distributions that can help developers get started easily with Llama Stack.

-| **API Provider Builder** |    **Environments**    | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** |
-|:------------------------:|:----------------------:|:----------:|:-------------:|:----------:|:----------:|:-------------:|
-|      Meta Reference      |      Single Node       |     ✅      |       ✅       |     ✅      |     ✅      |       ✅       |
-|        SambaNova         |         Hosted         |            |       ✅       |            |            |               |
-|         Cerebras         |         Hosted         |            |       ✅       |            |            |               |
-|        Fireworks         |         Hosted         |     ✅      |       ✅       |     ✅      |            |               |
-|       AWS Bedrock        |         Hosted         |            |       ✅       |            |     ✅      |               |
-|         Together         |         Hosted         |     ✅      |       ✅       |            |     ✅      |               |
-|           Groq           |         Hosted         |            |       ✅       |            |            |               |
-|          Ollama          |      Single Node       |            |       ✅       |            |            |               |
-|           TGI            | Hosted and Single Node |            |       ✅       |            |            |               |
-|        NVIDIA NIM        | Hosted and Single Node |            |       ✅       |            |            |               |
-|          Chroma          |      Single Node       |            |               |     ✅      |            |               |
-|        PG Vector         |      Single Node       |            |               |     ✅      |            |               |
-|    PyTorch ExecuTorch    |     On-device iOS      |     ✅      |       ✅       |            |            |               |
-|           vLLM           | Hosted and Single Node |            |       ✅       |            |            |               |
+| **API Provider Builder** |    **Environments**    | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** | **Post Training** |
+|:------------------------:|:----------------------:|:----------:|:-------------:|:----------:|:----------:|:-------------:|:-----------------:|
+|      Meta Reference      |      Single Node       |     ✅      |       ✅       |     ✅      |     ✅      |       ✅       |               |
+|        SambaNova         |         Hosted         |            |       ✅       |            |     ✅      |               |                  |
+|         Cerebras         |         Hosted         |            |       ✅       |            |            |               |                  |
+|        Fireworks         |         Hosted         |     ✅      |       ✅       |     ✅      |            |               |                |
+|       AWS Bedrock        |         Hosted         |            |       ✅       |            |     ✅      |               |                |
+|         Together         |         Hosted         |     ✅      |       ✅       |            |     ✅      |               |                |
+|           Groq           |         Hosted         |            |       ✅       |            |            |               |                 |
+|          Ollama          |      Single Node       |            |       ✅       |            |            |               |                 |
+|           TGI            | Hosted and Single Node |            |       ✅       |            |            |               |                 |
+|        NVIDIA NIM        | Hosted and Single Node |            |       ✅       |            |            |               |                 |
+|          Chroma          |      Single Node       |            |               |     ✅      |            |               |                 |
+|        PG Vector         |      Single Node       |            |               |     ✅      |            |               |                 |
+|    PyTorch ExecuTorch    |     On-device iOS      |     ✅      |       ✅       |            |            |               |                |
+|           vLLM           | Hosted and Single Node |            |       ✅       |            |            |               |                 |
+|          OpenAI          |         Hosted         |            |       ✅       |            |            |               |                 |
+|        Anthropic         |         Hosted         |            |       ✅       |            |            |               |                 |
+|          Gemini          |         Hosted         |            |       ✅       |            |            |               |                 |
+|          watsonx         |         Hosted         |            |       ✅       |            |            |               |                 |
+|        HuggingFace       |       Single Node      |            |                |            |            |               |       ✅        |
+|         TorchTune        |       Single Node      |            |                |            |            |               |       ✅        |
+|       NVIDIA NEMO        |         Hosted         |            |                |            |            |               |       ✅        |
+

 ### Distributions

@ -58,7 +139,6 @@ A Llama Stack Distribution (or "distro") is a pre-configured bundle of provider
 |               **Distribution**                |                                                                    **Llama Stack Docker**                                                                     |                                                 Start This Distribution                                                  |
 |:---------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------------------------------------:|
 |                Meta Reference                 |           [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general)           |      [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-gpu.html)      |
-|           Meta Reference Quantized            | [llamastack/distribution-meta-reference-quantized-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-quantized-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-quantized-gpu.html) |
 |                   SambaNova                   |                     [llamastack/distribution-sambanova](https://hub.docker.com/repository/docker/llamastack/distribution-sambanova/general)                     |   [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/sambanova.html)   |
 |                   Cerebras                    |                     [llamastack/distribution-cerebras](https://hub.docker.com/repository/docker/llamastack/distribution-cerebras/general)                     |   [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/cerebras.html)   |
 |                    Ollama                     |                       [llamastack/distribution-ollama](https://hub.docker.com/repository/docker/llamastack/distribution-ollama/general)                       |            [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/ollama.html)            |
@ -67,26 +147,6 @@ A Llama Stack Distribution (or "distro") is a pre-configured bundle of provider
 |                   Fireworks                   |                    [llamastack/distribution-fireworks](https://hub.docker.com/repository/docker/llamastack/distribution-fireworks/general)                    |          [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/fireworks.html)           |
 | vLLM |                  [llamastack/distribution-remote-vllm](https://hub.docker.com/repository/docker/llamastack/distribution-remote-vllm/general)                  |         [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/remote-vllm.html)          |

-### Installation
-
-You have two ways to install this repository:
-
-* **Install as a package**:
-   You can install the repository directly from [PyPI](https://pypi.org/project/llama-stack/) by running the following command:
-   ```bash
-   pip install llama-stack
-   ```
-
-* **Install from source**:
-   If you prefer to install from the source code, we recommend using [uv](https://github.com/astral-sh/uv).
-   Then, run the following commands:
-   ```bash
-    git clone git@github.com:meta-llama/llama-stack.git
-    cd llama-stack
-
-    uv sync
-    uv pip install -e .
-   ```

 ### Documentation

--- a/distributions/bedrock/build.yaml
+++ b/distributions/bedrock/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/bedrock/build.yaml
--- a/distributions/bedrock/compose.yaml
+++ b/distributions/bedrock/compose.yaml
@ -1,15 +0,0 @@
-services:
-  llamastack:
-    image: distribution-bedrock
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/llamastack-run-bedrock.yaml
-    ports:
-      - "8321:8321"
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-bedrock.yaml"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
--- a/distributions/bedrock/run.yaml
+++ b/distributions/bedrock/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/bedrock/run.yaml
--- a/distributions/cerebras/build.yaml
+++ b/distributions/cerebras/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/cerebras/build.yaml
--- a/distributions/cerebras/compose.yaml
+++ b/distributions/cerebras/compose.yaml
@ -1,16 +0,0 @@
-services:
-  llamastack:
-    image: llamastack/distribution-cerebras
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/llamastack-run-cerebras.yaml
-    ports:
-      - "8321:8321"
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-cerebras.yaml"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
--- a/distributions/cerebras/run.yaml
+++ b/distributions/cerebras/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/cerebras/run.yaml
--- a/distributions/dell-tgi/compose.yaml
+++ b/distributions/dell-tgi/compose.yaml
@ -1,50 +0,0 @@
-services:
-  text-generation-inference:
-    image: registry.dell.huggingface.co/enterprise-dell-inference-meta-llama-meta-llama-3.1-8b-instruct
-    network_mode: "host"
-    volumes:
-      - $HOME/.cache/huggingface:/data
-    ports:
-      - "5009:5009"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=0,1,2,3,4
-      - NUM_SHARD=4
-      - MAX_BATCH_PREFILL_TOKENS=32768
-      - MAX_INPUT_TOKENS=8000
-      - MAX_TOTAL_TOKENS=8192
-    command: []
-    deploy:
-      resources:
-        reservations:
-          devices:
-          - driver: nvidia
-            # that's the closest analogue to --gpus; provide
-            # an integer amount of devices or 'all'
-            count: all
-            # Devices are reserved using a list of capabilities, making
-            # capabilities the only required field. A device MUST
-            # satisfy all the requested capabilities for a successful
-            # reservation.
-            capabilities: [gpu]
-    runtime: nvidia
-  llamastack:
-    depends_on:
-      text-generation-inference:
-        condition: service_healthy
-    image: llamastack/distribution-tgi
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      # Link to TGI run.yaml file
-      - ./run.yaml:/root/my-run.yaml
-    ports:
-      - "8321:8321"
-    # Hack: wait for TGI server to start before starting docker
-    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
-    restart_policy:
-      condition: on-failure
-      delay: 3s
-      max_attempts: 5
-      window: 60s
--- a/distributions/dell-tgi/run.yaml
+++ b/distributions/dell-tgi/run.yaml
@ -1,44 +0,0 @@
-version: '2'
-image_name: local
-container_image: null
-conda_env: local
-apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
-providers:
-  inference:
-  - provider_id: tgi0
-    provider_type: remote::tgi
-    config:
-      url: http://127.0.0.1:80
-  safety:
-  - provider_id: meta0
-    provider_type: inline::llama-guard
-    config:
-      model: Llama-Guard-3-1B
-      excluded_categories: []
-  - provider_id: meta1
-    provider_type: inline::prompt-guard
-    config:
-      model: Prompt-Guard-86M
-  memory:
-  - provider_id: meta0
-    provider_type: inline::faiss
-    config: {}
-  agents:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        namespace: null
-        type: sqlite
-        db_path: ~/.llama/runtime/kvstore.db
-  telemetry:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
--- a/distributions/fireworks/build.yaml
+++ b/distributions/fireworks/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/fireworks/build.yaml
--- a/distributions/fireworks/compose.yaml
+++ b/distributions/fireworks/compose.yaml
@ -1,14 +0,0 @@
-services:
-  llamastack:
-    image: llamastack/distribution-fireworks
-    ports:
-      - "8321:8321"
-    environment:
-      - FIREWORKS_API_KEY=${FIREWORKS_API_KEY}
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --template fireworks"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
--- a/distributions/fireworks/run.yaml
+++ b/distributions/fireworks/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/fireworks/run.yaml
--- a/distributions/meta-reference-gpu/build.yaml
+++ b/distributions/meta-reference-gpu/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/meta-reference-gpu/build.yaml
--- a/distributions/meta-reference-gpu/compose.yaml
+++ b/distributions/meta-reference-gpu/compose.yaml
@ -1,34 +0,0 @@
-services:
-  llamastack:
-    image: llamastack/distribution-meta-reference-gpu
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/my-run.yaml
-    ports:
-      - "8321:8321"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=0
-    command: []
-    deploy:
-      resources:
-        reservations:
-          devices:
-          - driver: nvidia
-            # that's the closest analogue to --gpus; provide
-            # an integer amount of devices or 'all'
-            count: 1
-            # Devices are reserved using a list of capabilities, making
-            # capabilities the only required field. A device MUST
-            # satisfy all the requested capabilities for a successful
-            # reservation.
-            capabilities: [gpu]
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
-    runtime: nvidia
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
--- a/distributions/meta-reference-gpu/run-with-safety.yaml
+++ b/distributions/meta-reference-gpu/run-with-safety.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/meta-reference-gpu/run-with-safety.yaml
--- a/distributions/meta-reference-gpu/run.yaml
+++ b/distributions/meta-reference-gpu/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/meta-reference-gpu/run.yaml
--- a/distributions/meta-reference-quantized-gpu/build.yaml
+++ b/distributions/meta-reference-quantized-gpu/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/meta-reference-quantized-gpu/build.yaml
--- a/distributions/meta-reference-quantized-gpu/compose.yaml
+++ b/distributions/meta-reference-quantized-gpu/compose.yaml
@ -1,35 +0,0 @@
-services:
-  llamastack:
-    image: llamastack/distribution-meta-reference-quantized-gpu
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/my-run.yaml
-    ports:
-      - "8321:8321"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=0
-    command: []
-    deploy:
-      resources:
-        reservations:
-          devices:
-          - driver: nvidia
-            # that's the closest analogue to --gpus; provide
-            # an integer amount of devices or 'all'
-            count: 1
-            # Devices are reserved using a list of capabilities, making
-            # capabilities the only required field. A device MUST
-            # satisfy all the requested capabilities for a successful
-            # reservation.
-            capabilities: [gpu]
-    runtime: nvidia
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
--- a/distributions/meta-reference-quantized-gpu/run.yaml
+++ b/distributions/meta-reference-quantized-gpu/run.yaml
@ -1,58 +0,0 @@
-version: '2'
-image_name: local
-container_image: null
-conda_env: local
-apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
-providers:
-  inference:
-  - provider_id: meta0
-    provider_type: inline::meta-reference-quantized
-    config:
-      model: Llama3.2-3B-Instruct:int4-qlora-eo8
-      quantization:
-        type: int4
-      torch_seed: null
-      max_seq_len: 2048
-      max_batch_size: 1
-  - provider_id: meta1
-    provider_type: inline::meta-reference-quantized
-    config:
-      # not a quantized model !
-      model: Llama-Guard-3-1B
-      quantization: null
-      torch_seed: null
-      max_seq_len: 2048
-      max_batch_size: 1
-  safety:
-  - provider_id: meta0
-    provider_type: inline::llama-guard
-    config:
-      model: Llama-Guard-3-1B
-      excluded_categories: []
-  - provider_id: meta1
-    provider_type: inline::prompt-guard
-    config:
-      model: Prompt-Guard-86M
-  memory:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
-  agents:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        namespace: null
-        type: sqlite
-        db_path: ~/.llama/runtime/kvstore.db
-  telemetry:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
--- a/distributions/ollama/build.yaml
+++ b/distributions/ollama/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/ollama/build.yaml
--- a/distributions/ollama/compose.yaml
+++ b/distributions/ollama/compose.yaml
@ -1,71 +0,0 @@
-services:
-  ollama:
-    image: ollama/ollama:latest
-    network_mode: ${NETWORK_MODE:-bridge}
-    volumes:
-      - ~/.ollama:/root/.ollama
-    ports:
-      - "11434:11434"
-    environment:
-      OLLAMA_DEBUG: 1
-    command: []
-    deploy:
-      resources:
-        limits:
-          memory: 8G    # Set maximum memory
-        reservations:
-          memory: 8G    # Set minimum memory reservation
-    # healthcheck:
-    #   # ugh, no CURL in ollama image
-    #   test: ["CMD", "curl", "-f", "http://ollama:11434"]
-    #   interval: 10s
-    #   timeout: 5s
-    #   retries: 5
-
-  ollama-init:
-    image: ollama/ollama:latest
-    depends_on:
-      - ollama
-        # condition: service_healthy
-    network_mode: ${NETWORK_MODE:-bridge}
-    environment:
-      - OLLAMA_HOST=ollama
-      - INFERENCE_MODEL=${INFERENCE_MODEL}
-      - SAFETY_MODEL=${SAFETY_MODEL:-}
-    volumes:
-      - ~/.ollama:/root/.ollama
-      - ./pull-models.sh:/pull-models.sh
-    entrypoint: ["/pull-models.sh"]
-
-  llamastack:
-    depends_on:
-      ollama:
-        condition: service_started
-      ollama-init:
-        condition: service_started
-    image: ${LLAMA_STACK_IMAGE:-llamastack/distribution-ollama}
-    network_mode: ${NETWORK_MODE:-bridge}
-    volumes:
-      - ~/.llama:/root/.llama
-      # Link to ollama run.yaml file
-      - ~/local/llama-stack/:/app/llama-stack-source
-      - ./run${SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
-    ports:
-      - "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}"
-    environment:
-      - INFERENCE_MODEL=${INFERENCE_MODEL}
-      - SAFETY_MODEL=${SAFETY_MODEL:-}
-      - OLLAMA_URL=http://ollama:11434
-    entrypoint: >
-        python -m llama_stack.distribution.server.server /root/my-run.yaml \
-        --port ${LLAMA_STACK_PORT:-5001}
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 10s
-        max_attempts: 3
-        window: 60s
-volumes:
-  ollama:
-  ollama-init:
-  llamastack:
--- a/distributions/ollama/pull-models.sh
+++ b/distributions/ollama/pull-models.sh
@ -1,18 +0,0 @@
-#!/bin/sh
-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-echo "Preloading (${INFERENCE_MODEL}, ${SAFETY_MODEL})..."
-for model in ${INFERENCE_MODEL} ${SAFETY_MODEL}; do
-  echo "Preloading $model..."
-  if ! ollama run "$model"; then
-    echo "Failed to pull and run $model"
-    exit 1
-  fi
-done
-
-echo "All models pulled successfully"
--- a/distributions/ollama/run-with-safety.yaml
+++ b/distributions/ollama/run-with-safety.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/ollama/run-with-safety.yaml
--- a/distributions/ollama/run.yaml
+++ b/distributions/ollama/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/ollama/run.yaml
--- a/distributions/remote-nvidia/build.yaml
+++ b/distributions/remote-nvidia/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/nvidia/build.yaml
--- a/distributions/remote-nvidia/compose.yaml
+++ b/distributions/remote-nvidia/compose.yaml
@ -1,19 +0,0 @@
-services:
-  llamastack:
-    image: distribution-nvidia:dev
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/llamastack-run-nvidia.yaml
-    ports:
-      - "8321:8321"
-    environment:
-      - INFERENCE_MODEL=${INFERENCE_MODEL:-Llama3.1-8B-Instruct}
-      - NVIDIA_API_KEY=${NVIDIA_API_KEY:-}
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml-config /root/llamastack-run-nvidia.yaml"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
--- a/distributions/remote-nvidia/run.yaml
+++ b/distributions/remote-nvidia/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/nvidia/run.yaml
--- a/distributions/remote-vllm/build.yaml
+++ b/distributions/remote-vllm/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/remote-vllm/build.yaml
--- a/distributions/remote-vllm/compose.yaml
+++ b/distributions/remote-vllm/compose.yaml
@ -1,100 +0,0 @@
-services:
-  vllm-inference:
-    image: vllm/vllm-openai:latest
-    volumes:
-      - $HOME/.cache/huggingface:/root/.cache/huggingface
-    network_mode: ${NETWORK_MODE:-bridged}
-    ports:
-       - "${VLLM_INFERENCE_PORT:-5100}:${VLLM_INFERENCE_PORT:-5100}"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=${VLLM_INFERENCE_GPU:-0}
-      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
-    command: >
-      --gpu-memory-utilization 0.75
-      --model ${VLLM_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
-      --enforce-eager
-      --max-model-len 8192
-      --max-num-seqs 16
-      --port ${VLLM_INFERENCE_PORT:-5100}
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_INFERENCE_PORT:-5100}/v1/health"]
-      interval: 30s
-      timeout: 10s
-      retries: 5
-    deploy:
-      resources:
-        reservations:
-          devices:
-          - driver: nvidia
-            capabilities: [gpu]
-    runtime: nvidia
-
-  # A little trick:
-  # if VLLM_SAFETY_MODEL is set, we will create a service for the safety model
-  # otherwise, the entry will end in a hyphen which gets ignored by docker compose
-  vllm-${VLLM_SAFETY_MODEL:+safety}:
-    image: vllm/vllm-openai:latest
-    volumes:
-      - $HOME/.cache/huggingface:/root/.cache/huggingface
-    network_mode: ${NETWORK_MODE:-bridged}
-    ports:
-      - "${VLLM_SAFETY_PORT:-5101}:${VLLM_SAFETY_PORT:-5101}"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=${VLLM_SAFETY_GPU:-1}
-      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
-    command: >
-      --gpu-memory-utilization 0.75
-      --model ${VLLM_SAFETY_MODEL}
-      --enforce-eager
-      --max-model-len 8192
-      --max-num-seqs 16
-      --port ${VLLM_SAFETY_PORT:-5101}
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_SAFETY_PORT:-5101}/v1/health"]
-      interval: 30s
-      timeout: 10s
-      retries: 5
-    deploy:
-      resources:
-        reservations:
-          devices:
-          - driver: nvidia
-            capabilities: [gpu]
-    runtime: nvidia
-  llamastack:
-    depends_on:
-      - vllm-inference:
-          condition: service_healthy
-      - vllm-${VLLM_SAFETY_MODEL:+safety}:
-          condition: service_healthy
-    # image: llamastack/distribution-remote-vllm
-    image: llamastack/distribution-remote-vllm:test-0.0.52rc3
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run${VLLM_SAFETY_MODEL:+-with-safety}.yaml:/root/llamastack-run-remote-vllm.yaml
-    network_mode: ${NETWORK_MODE:-bridged}
-    environment:
-      - VLLM_URL=http://vllm-inference:${VLLM_INFERENCE_PORT:-5100}/v1
-      - VLLM_SAFETY_URL=http://vllm-safety:${VLLM_SAFETY_PORT:-5101}/v1
-      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
-      - MAX_TOKENS=${MAX_TOKENS:-4096}
-      - SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm}
-      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
-    ports:
-      - "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}"
-    # Hack: wait for vLLM server to start before starting docker
-    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 5001"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
-volumes:
-  vllm-inference:
-  vllm-safety:
-  llamastack:
--- a/distributions/remote-vllm/run-with-safety.yaml
+++ b/distributions/remote-vllm/run-with-safety.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/remote-vllm/run-with-safety.yaml
--- a/distributions/remote-vllm/run.yaml
+++ b/distributions/remote-vllm/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/remote-vllm/run.yaml
--- a/distributions/runpod/build.yaml
+++ b/distributions/runpod/build.yaml
@ -1,9 +0,0 @@
-name: runpod
-distribution_spec:
-  description: Use Runpod for running LLM inference
-  providers:
-    inference: remote::runpod
-    memory: meta-reference
-    safety: meta-reference
-    agents: meta-reference
-    telemetry: meta-reference
--- a/distributions/sambanova/build.yaml
+++ b/distributions/sambanova/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/sambanova/build.yaml
--- a/distributions/sambanova/compose.yaml
+++ b/distributions/sambanova/compose.yaml
@ -1,16 +0,0 @@
-services:
-  llamastack:
-    image: llamastack/distribution-sambanova
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/llamastack-run-sambanova.yaml
-    ports:
-      - "5000:5000"
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-sambanova.yaml"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
--- a/distributions/sambanova/run.yaml
+++ b/distributions/sambanova/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/sambanova/run.yaml
--- a/distributions/tgi/build.yaml
+++ b/distributions/tgi/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/tgi/build.yaml
--- a/distributions/tgi/compose.yaml
+++ b/distributions/tgi/compose.yaml
@ -1,103 +0,0 @@
-services:
-  tgi-inference:
-    image: ghcr.io/huggingface/text-generation-inference:latest
-    volumes:
-      - $HOME/.cache/huggingface:/data
-    network_mode: ${NETWORK_MODE:-bridged}
-    ports:
-       - "${TGI_INFERENCE_PORT:-8080}:${TGI_INFERENCE_PORT:-8080}"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=${TGI_INFERENCE_GPU:-0}
-      - HF_TOKEN=$HF_TOKEN
-      - HF_HOME=/data
-      - HF_DATASETS_CACHE=/data
-      - HF_MODULES_CACHE=/data
-      - HF_HUB_CACHE=/data
-    command: >
-      --dtype bfloat16
-      --usage-stats off
-      --sharded false
-      --model-id ${TGI_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
-      --port ${TGI_INFERENCE_PORT:-8080}
-      --cuda-memory-fraction 0.75
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://tgi-inference:${TGI_INFERENCE_PORT:-8080}/health"]
-      interval: 5s
-      timeout: 5s
-      retries: 30
-    deploy:
-      resources:
-        reservations:
-          devices:
-          - driver: nvidia
-            capabilities: [gpu]
-    runtime: nvidia
-
-  tgi-${TGI_SAFETY_MODEL:+safety}:
-    image: ghcr.io/huggingface/text-generation-inference:latest
-    volumes:
-      - $HOME/.cache/huggingface:/data
-    network_mode: ${NETWORK_MODE:-bridged}
-    ports:
-       - "${TGI_SAFETY_PORT:-8081}:${TGI_SAFETY_PORT:-8081}"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=${TGI_SAFETY_GPU:-1}
-      - HF_TOKEN=$HF_TOKEN
-      - HF_HOME=/data
-      - HF_DATASETS_CACHE=/data
-      - HF_MODULES_CACHE=/data
-      - HF_HUB_CACHE=/data
-    command: >
-      --dtype bfloat16
-      --usage-stats off
-      --sharded false
-      --model-id ${TGI_SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
-      --port ${TGI_SAFETY_PORT:-8081}
-      --cuda-memory-fraction 0.75
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://tgi-safety:${TGI_SAFETY_PORT:-8081}/health"]
-      interval: 5s
-      timeout: 5s
-      retries: 30
-    deploy:
-      resources:
-        reservations:
-          devices:
-          - driver: nvidia
-            capabilities: [gpu]
-    runtime: nvidia
-
-  llamastack:
-    depends_on:
-      tgi-inference:
-        condition: service_healthy
-      tgi-${TGI_SAFETY_MODEL:+safety}:
-        condition: service_healthy
-    image: llamastack/distribution-tgi:test-0.0.52rc3
-    network_mode: ${NETWORK_MODE:-bridged}
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run${TGI_SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
-    ports:
-      - "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}"
-    # Hack: wait for TGI server to start before starting docker
-    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
-    restart_policy:
-      condition: on-failure
-      delay: 3s
-      max_attempts: 5
-      window: 60s
-    environment:
-      - TGI_URL=http://tgi-inference:${TGI_INFERENCE_PORT:-8080}
-      - SAFETY_TGI_URL=http://tgi-safety:${TGI_SAFETY_PORT:-8081}
-      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
-      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
-
-volumes:
-  tgi-inference:
-  tgi-safety:
-  llamastack:
--- a/distributions/tgi/run-with-safety.yaml
+++ b/distributions/tgi/run-with-safety.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/tgi/run-with-safety.yaml
--- a/distributions/tgi/run.yaml
+++ b/distributions/tgi/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/tgi/run.yaml
--- a/distributions/together/build.yaml
+++ b/distributions/together/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/together/build.yaml
--- a/distributions/together/compose.yaml
+++ b/distributions/together/compose.yaml
@ -1,14 +0,0 @@
-services:
-  llamastack:
-    image: llamastack/distribution-together
-    ports:
-      - "8321:8321"
-    environment:
-      - TOGETHER_API_KEY=${TOGETHER_API_KEY}
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --template together"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
--- a/distributions/together/run.yaml
+++ b/distributions/together/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/together/run.yaml
--- a/distributions/vllm-gpu/build.yaml
+++ b/distributions/vllm-gpu/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/inline-vllm/build.yaml
--- a/distributions/vllm-gpu/compose.yaml
+++ b/distributions/vllm-gpu/compose.yaml
@ -1,35 +0,0 @@
-services:
-  llamastack:
-    image: llamastack/distribution-inline-vllm
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/my-run.yaml
-    ports:
-      - "8321:8321"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=0
-    command: []
-    deploy:
-      resources:
-        reservations:
-          devices:
-          - driver: nvidia
-            # that's the closest analogue to --gpus; provide
-            # an integer amount of devices or 'all'
-            count: 1
-            # Devices are reserved using a list of capabilities, making
-            # capabilities the only required field. A device MUST
-            # satisfy all the requested capabilities for a successful
-            # reservation.
-            capabilities: [gpu]
-    runtime: nvidia
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
--- a/distributions/vllm-gpu/run.yaml
+++ b/distributions/vllm-gpu/run.yaml
@ -1,66 +0,0 @@
-version: '2'
-image_name: local
-container_image: null
-conda_env: local
-apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
-providers:
-  inference:
-  - provider_id: vllm-inference
-    provider_type: inline::vllm
-    config:
-      model: Llama3.2-3B-Instruct
-      tensor_parallel_size: 1
-      gpu_memory_utilization: 0.4
-      enforce_eager: true
-      max_tokens: 4096
-  - provider_id: vllm-inference-safety
-    provider_type: inline::vllm
-    config:
-      model: Llama-Guard-3-1B
-      tensor_parallel_size: 1
-      gpu_memory_utilization: 0.2
-      enforce_eager: true
-      max_tokens: 4096
-  safety:
-  - provider_id: meta0
-    provider_type: inline::llama-guard
-    config:
-      model: Llama-Guard-3-1B
-      excluded_categories: []
-  # Uncomment to use prompt guard
-  # - provider_id: meta1
-  #   provider_type: inline::prompt-guard
-  #   config:
-  #     model: Prompt-Guard-86M
-  memory:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
-  # Uncomment to use pgvector
-  # - provider_id: pgvector
-  #   provider_type: remote::pgvector
-  #   config:
-  #     host: 127.0.0.1
-  #     port: 5432
-  #     db: postgres
-  #     user: postgres
-  #     password: mysecretpassword
-  agents:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        namespace: null
-        type: sqlite
-        db_path: ~/.llama/runtime/agents_store.db
-  telemetry:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
--- a/docs/_static/css/my_theme.css
+++ b/docs/_static/css/my_theme.css
@ -16,3 +16,20 @@
 .hide-title h1 {
    display: none;
 }
+
+h2, h3, h4 {
+    font-weight: normal;
+}
+html[data-theme="dark"] .rst-content div[class^="highlight"] {
+  background-color: #0b0b0b;
+}
+pre {
+    white-space: pre-wrap !important;
+    word-break: break-all;
+}
+
+[data-theme="dark"] .mermaid {
+    background-color: #f4f4f6 !important;
+    border-radius: 6px;
+    padding: 0.5em;
+  }
--- a/docs/_static/js/detect_theme.js
+++ b/docs/_static/js/detect_theme.js
@ -0,0 +1,32 @@
+document.addEventListener("DOMContentLoaded", function () {  // once the DOM is parsed: choose a theme, apply it, then persist later changes
+  const prefersDark = window.matchMedia("(prefers-color-scheme: dark)").matches;  // true when the browser reports a dark color-scheme preference
+  const htmlElement = document.documentElement;
+
+  // Look up a previously stored theme under the "sphinx-rtd-theme" localStorage key
+  const savedTheme = localStorage.getItem("sphinx-rtd-theme");
+
+  if (savedTheme) {
+    // A stored value wins: mirror it on <html data-theme> and on the body's "dark" class
+    htmlElement.setAttribute("data-theme", savedTheme);
+    document.body.classList.toggle("dark", savedTheme === "dark");
+  } else {
+    // Nothing stored yet: derive the theme from the system color-scheme preference
+    const theme = prefersDark ? "dark" : "light";
+    htmlElement.setAttribute("data-theme", theme);
+    document.body.classList.toggle("dark", theme === "dark");
+    // Persist the derived theme, so later page loads take the stored-theme branch above
+    localStorage.setItem("sphinx-rtd-theme", theme);
+  }
+
+  // Persist every later change of <html data-theme> (presumably made by a theme toggle defined elsewhere - confirm)
+  const observer = new MutationObserver(function(mutations) {
+    mutations.forEach(function(mutation) {
+      if (mutation.attributeName === "data-theme") {
+        const currentTheme = htmlElement.getAttribute("data-theme");
+        localStorage.setItem("sphinx-rtd-theme", currentTheme);  // NOTE(review): the body's "dark" class is not re-synced here - confirm that is intended
+      }
+    });
+  });
+
+  observer.observe(htmlElement, { attributes: true });  // watch attribute mutations on the <html> element itself
+});
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
--- a/docs/_static/providers/vector_io/read_time_comparison_sqlite-vec-faiss.png
+++ b/docs/_static/providers/vector_io/read_time_comparison_sqlite-vec-faiss.png
--- a/docs/_static/providers/vector_io/write_time_comparison_sqlite-vec-faiss.png
+++ b/docs/_static/providers/vector_io/write_time_comparison_sqlite-vec-faiss.png
--- a/docs/_static/providers/vector_io/write_time_sequence_sqlite-vec-faiss.png
+++ b/docs/_static/providers/vector_io/write_time_sequence_sqlite-vec-faiss.png
--- a/docs/conftest.py
+++ b/docs/conftest.py
@ -4,6 +4,21 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+import os
+import time
+
+
 def pytest_collection_modifyitems(items):  # pytest collection hook: normalizes the names of the collected items in place
    for item in items:
        item.name = item.name.replace(' ', '_')  # replace every space in the item name with an underscore
+
+
+def pytest_runtest_teardown(item):  # pytest teardown hook: optionally sleeps; `item` itself is not used
+    interval_seconds = os.getenv("LLAMA_STACK_TEST_INTERVAL_SECONDS")  # None when the variable is unset
+    if interval_seconds:  # unset and empty-string values both skip the pause
+        time.sleep(float(interval_seconds))  # value is parsed as seconds; float() raises ValueError if it is not numeric
+
+
+def pytest_configure(config):  # pytest configuration hook: overrides two reporting options on the config object
+    config.option.tbstyle = "short"  # presumably the same setting as the --tb=short command-line flag - confirm
+    config.option.disable_warnings = True  # presumably the same setting as --disable-warnings - confirm
--- a/docs/getting_started.ipynb
+++ b/docs/getting_started.ipynb
--- a/docs/getting_started_llama4.ipynb
+++ b/docs/getting_started_llama4.ipynb
--- a/docs/getting_started_llama_api.ipynb
+++ b/docs/getting_started_llama_api.ipynb
--- a/docs/notebooks/Alpha_Llama_Stack_Post_Training.ipynb
+++ b/docs/notebooks/Alpha_Llama_Stack_Post_Training.ipynb
--- a/docs/notebooks/Llama_Stack_Agent_Workflows.ipynb
+++ b/docs/notebooks/Llama_Stack_Agent_Workflows.ipynb
@ -47,9 +47,8 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "from llama_stack_client import LlamaStackClient\n",
+    "from llama_stack_client import LlamaStackClient, Agent\n",
    "from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
-    "from llama_stack_client.lib.agents.agent import Agent\n",
    "from rich.pretty import pprint\n",
    "import json\n",
    "import uuid\n",
--- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
+++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
--- a/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb
+++ b/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb
@ -22,7 +22,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
@ -34,10 +34,8 @@
    }
   ],
   "source": [
-    "from llama_stack_client import LlamaStackClient\n",
+    "from llama_stack_client import LlamaStackClient, Agent\n",
    "from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
-    "from llama_stack_client.types.agent_create_params import AgentConfig\n",
-    "from llama_stack_client.lib.agents.agent import Agent\n",
    "from rich.pretty import pprint\n",
    "import json\n",
    "import uuid\n",
@ -70,7 +68,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
@ -842,7 +840,6 @@
    "    \"memory_optimizations.rst\",\n",
    "    \"chat.rst\",\n",
    "    \"llama3.rst\",\n",
-    "    \"datasets.rst\",\n",
    "    \"qat_finetune.rst\",\n",
    "    \"lora_finetune.rst\",\n",
    "]\n",
@ -1397,6 +1394,348 @@
    "pprint(session_response.turns)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3.1 Improved RAG with Long Context\n",
+    "\n",
+    "- Instead of using the retrieval tool, we send the documents as attachments to the agent and let it use the entire document context. \n",
+    "- Note how the model is able to understand the entire context from the documentation and answers the questions with better factuality and improved retrieval. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">Question:</span> What precision formats does torchtune support?\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[1;36mQuestion:\u001b[0m What precision formats does torchtune support?\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">Agent Answer:</span> Torchtune supports two precision formats: `fp32` <span style=\"font-weight: bold\">(</span>full-precision<span style=\"font-weight: bold\">)</span> and `bfloat16` <span style=\"font-weight: bold\">(</span>half-precision<span style=\"font-weight: bold\">)</span>. \n",
+       "The `bfloat16` format uses <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2</span> bytes per model parameter, which is half the memory of `fp32`, and also improves \n",
+       "training speed.\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[1;33mAgent Answer:\u001b[0m Torchtune supports two precision formats: `fp32` \u001b[1m(\u001b[0mfull-precision\u001b[1m)\u001b[0m and `bfloat16` \u001b[1m(\u001b[0mhalf-precision\u001b[1m)\u001b[0m. \n",
+       "The `bfloat16` format uses \u001b[1;36m2\u001b[0m bytes per model parameter, which is half the memory of `fp32`, and also improves \n",
+       "training speed.\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">Question:</span> What does DoRA stand for in torchtune?\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[1;36mQuestion:\u001b[0m What does DoRA stand for in torchtune?\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">Agent Answer:</span> DoRA stands for Weight-Decomposed Low-Rank Adaptation. It is a variant of LoRA <span style=\"font-weight: bold\">(</span>Low-Rank Adaptation<span style=\"font-weight: bold\">)</span> \n",
+       "that further decomposes the pre-trained weights into two components: magnitude and direction. The magnitude \n",
+       "component is a scalar vector that adjusts the scale, while the direction component corresponds to the original LoRA\n",
+       "decomposition and updates the orientation of weights. DoRA adds a small overhead to LoRA training due to the \n",
+       "addition of the magnitude parameter, but it has been shown to improve the performance of LoRA, particularly at low \n",
+       "ranks.\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[1;33mAgent Answer:\u001b[0m DoRA stands for Weight-Decomposed Low-Rank Adaptation. It is a variant of LoRA \u001b[1m(\u001b[0mLow-Rank Adaptation\u001b[1m)\u001b[0m \n",
+       "that further decomposes the pre-trained weights into two components: magnitude and direction. The magnitude \n",
+       "component is a scalar vector that adjusts the scale, while the direction component corresponds to the original LoRA\n",
+       "decomposition and updates the orientation of weights. DoRA adds a small overhead to LoRA training due to the \n",
+       "addition of the magnitude parameter, but it has been shown to improve the performance of LoRA, particularly at low \n",
+       "ranks.\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">Question:</span> How does the CPUOffloadOptimizer reduce GPU memory usage?\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[1;36mQuestion:\u001b[0m How does the CPUOffloadOptimizer reduce GPU memory usage?\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">Agent Answer:</span> The CPUOffloadOptimizer reduces GPU memory usage by offloading optimizer states and gradients to the \n",
+       "CPU, and performing optimizer steps on the CPU. This can significantly reduce GPU memory usage at the cost of CPU \n",
+       "RAM and training speed.\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[1;33mAgent Answer:\u001b[0m The CPUOffloadOptimizer reduces GPU memory usage by offloading optimizer states and gradients to the \n",
+       "CPU, and performing optimizer steps on the CPU. This can significantly reduce GPU memory usage at the cost of CPU \n",
+       "RAM and training speed.\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">Question:</span> How do I ensure only LoRA parameters are trainable when fine-tuning?\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[1;36mQuestion:\u001b[0m How do I ensure only LoRA parameters are trainable when fine-tuning?\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">Agent Answer:</span> To ensure only LoRA parameters are trainable when fine-tuning, you can use the `set_trainable_params`\n",
+       "function from `torchtune.modules.peft.peft_utils` to set the `requires_grad` attribute of the LoRA parameters to \n",
+       "`<span style=\"color: #00ff00; text-decoration-color: #00ff00; font-style: italic\">True</span>` and the `requires_grad` attribute of the other parameters to `<span style=\"color: #ff0000; text-decoration-color: #ff0000; font-style: italic\">False</span>`.\n",
+       "\n",
+       "Here is an example:\n",
+       "```python\n",
+       "from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n",
+       "\n",
+       "# Get the LoRA parameters\n",
+       "lora_params = <span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">get_adapter_params</span><span style=\"font-weight: bold\">(</span>model<span style=\"font-weight: bold\">)</span>\n",
+       "\n",
+       "# Set the LoRA parameters to trainable and the other parameters to non-trainable\n",
+       "<span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">set_trainable_params</span><span style=\"font-weight: bold\">(</span>model, lora_params<span style=\"font-weight: bold\">)</span>\n",
+       "```\n",
+       "This will ensure that only the LoRA parameters are updated during fine-tuning, while the other parameters remain \n",
+       "frozen.\n",
+       "\n",
+       "Alternatively, you can also use the `lora_finetune` recipe in torchtune, which automatically sets the LoRA \n",
+       "parameters to trainable and the other parameters to non-trainable. You can run the recipe using the following \n",
+       "command:\n",
+       "```bash\n",
+       "tune run lora_finetune --config llama2/7B_lora\n",
+       "```\n",
+       "This will fine-tune the LoRA parameters of the Llama2 model using the default settings. You can modify the config \n",
+       "file to change the hyperparameters or the model architecture.\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[1;33mAgent Answer:\u001b[0m To ensure only LoRA parameters are trainable when fine-tuning, you can use the `set_trainable_params`\n",
+       "function from `torchtune.modules.peft.peft_utils` to set the `requires_grad` attribute of the LoRA parameters to \n",
+       "`\u001b[3;92mTrue\u001b[0m` and the `requires_grad` attribute of the other parameters to `\u001b[3;91mFalse\u001b[0m`.\n",
+       "\n",
+       "Here is an example:\n",
+       "```python\n",
+       "from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n",
+       "\n",
+       "# Get the LoRA parameters\n",
+       "lora_params = \u001b[1;35mget_adapter_params\u001b[0m\u001b[1m(\u001b[0mmodel\u001b[1m)\u001b[0m\n",
+       "\n",
+       "# Set the LoRA parameters to trainable and the other parameters to non-trainable\n",
+       "\u001b[1;35mset_trainable_params\u001b[0m\u001b[1m(\u001b[0mmodel, lora_params\u001b[1m)\u001b[0m\n",
+       "```\n",
+       "This will ensure that only the LoRA parameters are updated during fine-tuning, while the other parameters remain \n",
+       "frozen.\n",
+       "\n",
+       "Alternatively, you can also use the `lora_finetune` recipe in torchtune, which automatically sets the LoRA \n",
+       "parameters to trainable and the other parameters to non-trainable. You can run the recipe using the following \n",
+       "command:\n",
+       "```bash\n",
+       "tune run lora_finetune --config llama2/7B_lora\n",
+       "```\n",
+       "This will fine-tune the LoRA parameters of the Llama2 model using the default settings. You can modify the config \n",
+       "file to change the hyperparameters or the model architecture.\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "urls = [\n",
+    "    \"memory_optimizations.rst\",\n",
+    "    \"chat.rst\",\n",
+    "    \"llama3.rst\",\n",
+    "    \"qat_finetune.rst\",\n",
+    "    \"lora_finetune.rst\",\n",
+    "]\n",
+    "\n",
+    "attachments = [\n",
+    "    {\n",
+    "        \"content\": {\n",
+    "            \"uri\": f\"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}\",\n",
+    "        },\n",
+    "        \"mime_type\": \"text/plain\",\n",
+    "    }\n",
+    "\n",
+    "    for i, url in enumerate(urls)\n",
+    "]\n",
+    "\n",
+    "rag_attachment_agent = Agent(\n",
+    "    client,\n",
+    "    model=MODEL_ID,\n",
+    "    instructions=\"You are a helpful assistant that can answer questions about the Torchtune project. Use context from attached documentation for Torchtune to answer questions.\",\n",
+    ")\n",
+    "\n",
+    "for example in examples:\n",
+    "    session_id = rag_attachment_agent.create_session(session_name=f\"rag_attachment_session_{uuid.uuid4()}\")\n",
+    "    response = rag_attachment_agent.create_turn(\n",
+    "        messages=[\n",
+    "            {\n",
+    "                \"role\": \"user\",\n",
+    "                \"content\": example[\"input_query\"]\n",
+    "            }\n",
+    "        ],\n",
+    "        session_id=session_id,\n",
+    "        documents=attachments,\n",
+    "        stream=False\n",
+    "    )\n",
+    "    rich.print(f\"[bold cyan]Question:[/bold cyan] {example['input_query']}\")\n",
+    "    rich.print(f\"[bold yellow]Agent Answer:[/bold yellow] {response.output_message.content}\")\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">ScoringScoreResponse</span><span style=\"font-weight: bold\">(</span>\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   </span><span style=\"color: #808000; text-decoration-color: #808000\">results</span>=<span style=\"font-weight: bold\">{</span>\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'braintrust::factuality'</span>: <span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">ScoringResult</span><span style=\"font-weight: bold\">(</span>\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   </span><span style=\"color: #808000; text-decoration-color: #808000\">aggregated_results</span>=<span style=\"font-weight: bold\">{</span><span style=\"color: #008000; text-decoration-color: #008000\">'average'</span>: <span style=\"font-weight: bold\">{</span><span style=\"color: #008000; text-decoration-color: #008000\">'average'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.6</span><span style=\"font-weight: bold\">}}</span>,\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   </span><span style=\"color: #808000; text-decoration-color: #808000\">score_rows</span>=<span style=\"font-weight: bold\">[</span>\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   </span><span style=\"font-weight: bold\">{</span>\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'score'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.6</span>,\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'metadata'</span>: <span style=\"font-weight: bold\">{</span>\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'choice'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'B'</span>,\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'rationale'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'1. Both the expert and the submitted answers mention that Torchtune supports two precision formats: `fp32` (full-precision) and `bfloat16` (half-precision).\\n2. The expert answer specifies that `fp32` uses 4 bytes per model and optimizer parameter, while `bfloat16` uses 2 bytes per model and optimizer parameter.\\n3. The submitted answer also mentions that `bfloat16` uses 2 bytes per model parameter, which is consistent with the expert answer.\\n4. The submitted answer adds that `bfloat16` improves training speed, which is additional information not present in the expert answer.\\n5. There is no conflict between the submitted answer and the expert answer; the submitted answer simply provides more information.\\n\\nBased on this analysis, the submitted answer is a superset of the expert answer and is fully consistent with it.'</span>\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   </span><span style=\"font-weight: bold\">}</span>\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   </span><span style=\"font-weight: bold\">}</span>,\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   </span><span style=\"font-weight: bold\">{</span>\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'score'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.6</span>,\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'metadata'</span>: <span style=\"font-weight: bold\">{</span>\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'choice'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'B'</span>,\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'rationale'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'1. The expert answer provides the definition of DoRA as \"Weight-Decomposed Low-Rank Adaptation.\"\\n2. The submitted answer also states that DoRA stands for \"Weight-Decomposed Low-Rank Adaptation,\" which matches the expert answer.\\n3. The submitted answer includes additional information about DoRA, explaining that it is a variant of LoRA and describing how it decomposes pre-trained weights into magnitude and direction components.\\n4. The submitted answer further explains the role of the magnitude component and the direction component, and mentions the performance improvement and overhead associated with DoRA.\\n5. The additional details in the submitted answer do not contradict the expert answer; instead, they expand upon it.\\n6. Therefore, the submitted answer is a superset of the expert answer and is fully consistent with it.'</span>\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   </span><span style=\"font-weight: bold\">}</span>\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   </span><span style=\"font-weight: bold\">}</span>,\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   </span><span style=\"font-weight: bold\">{</span>\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'score'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.6</span>,\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'metadata'</span>: <span style=\"font-weight: bold\">{</span>\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'choice'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'B'</span>,\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'rationale'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'1. The expert answer states that the CPUOffloadOptimizer reduces GPU memory usage by keeping optimizer states on CPU and performing optimizer steps on CPU. It also mentions the optional offloading of gradients to CPU with the parameter offload_gradients=True.\\n\\n2. The submitted answer states that the CPUOffloadOptimizer reduces GPU memory usage by offloading optimizer states and gradients to the CPU, and performing optimizer steps on the CPU. It adds that this can significantly reduce GPU memory usage at the cost of CPU RAM and training speed.\\n\\n3. Comparing both answers:\\n   - Both answers agree on offloading optimizer states to the CPU and performing optimizer steps on the CPU.\\n   - Both mention the offloading of gradients to the CPU, but the expert answer specifies it as optional with a parameter, while the submission does not specify this detail.\\n   - The submission adds additional information about the trade-off involving CPU RAM and training speed, which is not mentioned in the expert answer.\\n\\n4. The submitted answer includes all the details from the expert answer and adds more information about the trade-offs, making it a superset of the expert answer.\\n\\nTherefore, the correct choice is (B) The submitted answer is a superset of the expert answer and is fully consistent with it.'</span>\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   </span><span style=\"font-weight: bold\">}</span>\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   </span><span style=\"font-weight: bold\">}</span>,\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   </span><span style=\"font-weight: bold\">{</span>\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'score'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.6</span>,\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'metadata'</span>: <span style=\"font-weight: bold\">{</span>\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'choice'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'B'</span>,\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'rationale'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">\"1. **Expert Answer Analysis**: The expert answer provides a method to ensure only LoRA parameters are trainable by using torchtune's utility functions. It mentions fetching LoRA parameters with `get_adapter_params(lora_model)` and setting them as trainable with `set_trainable_params(lora_model, lora_params)`. It also notes that the LoRA recipe handles this automatically.\\n\\n2. **Submitted Answer Analysis**: The submitted answer provides a similar method using `set_trainable_params` to set the `requires_grad` attribute of LoRA parameters to `True` and other parameters to `False`. It includes a code example demonstrating this process. Additionally, it mentions using the `lora_finetune` recipe in torchtune, which automatically sets the LoRA parameters to trainable.\\n\\n3. **Comparison**: The submitted answer includes all the details from the expert answer regarding the use of `get_adapter_params` and `set_trainable_params`. It also provides additional information about setting the `requires_grad` attribute and using the `lora_finetune` recipe, which is not mentioned in the expert answer.\\n\\n4. **Conclusion**: The submitted answer is a superset of the expert answer as it contains all the information from the expert answer and additional details. There is no conflict between the two answers, and the additional information in the submission is consistent with the expert's explanation.\\n\\nTherefore, the correct choice is (B) The submitted answer is a superset of the expert answer and is fully consistent with it.\"</span>\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   </span><span style=\"font-weight: bold\">}</span>\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   </span><span style=\"font-weight: bold\">}</span>\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   </span><span style=\"font-weight: bold\">]</span>\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   </span><span style=\"font-weight: bold\">)</span>\n",
+       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   </span><span style=\"font-weight: bold\">}</span>\n",
+       "<span style=\"font-weight: bold\">)</span>\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[1;35mScoringScoreResponse\u001b[0m\u001b[1m(\u001b[0m\n",
+       "\u001b[2;32m│   \u001b[0m\u001b[33mresults\u001b[0m=\u001b[1m{\u001b[0m\n",
+       "\u001b[2;32m│   │   \u001b[0m\u001b[32m'braintrust::factuality'\u001b[0m: \u001b[1;35mScoringResult\u001b[0m\u001b[1m(\u001b[0m\n",
+       "\u001b[2;32m│   │   │   \u001b[0m\u001b[33maggregated_results\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'average'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'average'\u001b[0m: \u001b[1;36m0.6\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m,\n",
+       "\u001b[2;32m│   │   │   \u001b[0m\u001b[33mscore_rows\u001b[0m=\u001b[1m[\u001b[0m\n",
+       "\u001b[2;32m│   │   │   │   \u001b[0m\u001b[1m{\u001b[0m\n",
+       "\u001b[2;32m│   │   │   │   │   \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.6\u001b[0m,\n",
+       "\u001b[2;32m│   │   │   │   │   \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n",
+       "\u001b[2;32m│   │   │   │   │   │   \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'B'\u001b[0m,\n",
+       "\u001b[2;32m│   │   │   │   │   │   \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m'1. Both the expert and the submitted answers mention that Torchtune supports two precision formats: `fp32` \u001b[0m\u001b[32m(\u001b[0m\u001b[32mfull-precision\u001b[0m\u001b[32m)\u001b[0m\u001b[32m and `bfloat16` \u001b[0m\u001b[32m(\u001b[0m\u001b[32mhalf-precision\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\\n2. The expert answer specifies that `fp32` uses 4 bytes per model and optimizer parameter, while `bfloat16` uses 2 bytes per model and optimizer parameter.\\n3. The submitted answer also mentions that `bfloat16` uses 2 bytes per model parameter, which is consistent with the expert answer.\\n4. The submitted answer adds that `bfloat16` improves training speed, which is additional information not present in the expert answer.\\n5. There is no conflict between the submitted answer and the expert answer; the submitted answer simply provides more information.\\n\\nBased on this analysis, the submitted answer is a superset of the expert answer and is fully consistent with it.'\u001b[0m\n",
+       "\u001b[2;32m│   │   │   │   │   \u001b[0m\u001b[1m}\u001b[0m\n",
+       "\u001b[2;32m│   │   │   │   \u001b[0m\u001b[1m}\u001b[0m,\n",
+       "\u001b[2;32m│   │   │   │   \u001b[0m\u001b[1m{\u001b[0m\n",
+       "\u001b[2;32m│   │   │   │   │   \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.6\u001b[0m,\n",
+       "\u001b[2;32m│   │   │   │   │   \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n",
+       "\u001b[2;32m│   │   │   │   │   │   \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'B'\u001b[0m,\n",
+       "\u001b[2;32m│   │   │   │   │   │   \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m'1. The expert answer provides the definition of DoRA as \"Weight-Decomposed Low-Rank Adaptation.\"\\n2. The submitted answer also states that DoRA stands for \"Weight-Decomposed Low-Rank Adaptation,\" which matches the expert answer.\\n3. The submitted answer includes additional information about DoRA, explaining that it is a variant of LoRA and describing how it decomposes pre-trained weights into magnitude and direction components.\\n4. The submitted answer further explains the role of the magnitude component and the direction component, and mentions the performance improvement and overhead associated with DoRA.\\n5. The additional details in the submitted answer do not contradict the expert answer; instead, they expand upon it.\\n6. Therefore, the submitted answer is a superset of the expert answer and is fully consistent with it.'\u001b[0m\n",
+       "\u001b[2;32m│   │   │   │   │   \u001b[0m\u001b[1m}\u001b[0m\n",
+       "\u001b[2;32m│   │   │   │   \u001b[0m\u001b[1m}\u001b[0m,\n",
+       "\u001b[2;32m│   │   │   │   \u001b[0m\u001b[1m{\u001b[0m\n",
+       "\u001b[2;32m│   │   │   │   │   \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.6\u001b[0m,\n",
+       "\u001b[2;32m│   │   │   │   │   \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n",
+       "\u001b[2;32m│   │   │   │   │   │   \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'B'\u001b[0m,\n",
+       "\u001b[2;32m│   │   │   │   │   │   \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m'1. The expert answer states that the CPUOffloadOptimizer reduces GPU memory usage by keeping optimizer states on CPU and performing optimizer steps on CPU. It also mentions the optional offloading of gradients to CPU with the parameter \u001b[0m\u001b[32moffload_gradients\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m.\\n\\n2. The submitted answer states that the CPUOffloadOptimizer reduces GPU memory usage by offloading optimizer states and gradients to the CPU, and performing optimizer steps on the CPU. It adds that this can significantly reduce GPU memory usage at the cost of CPU RAM and training speed.\\n\\n3. Comparing both answers:\\n   - Both answers agree on offloading optimizer states to the CPU and performing optimizer steps on the CPU.\\n   - Both mention the offloading of gradients to the CPU, but the expert answer specifies it as optional with a parameter, while the submission does not specify this detail.\\n   - The submission adds additional information about the trade-off involving CPU RAM and training speed, which is not mentioned in the expert answer.\\n\\n4. The submitted answer includes all the details from the expert answer and adds more information about the trade-offs, making it a superset of the expert answer.\\n\\nTherefore, the correct choice is \u001b[0m\u001b[32m(\u001b[0m\u001b[32mB\u001b[0m\u001b[32m)\u001b[0m\u001b[32m The submitted answer is a superset of the expert answer and is fully consistent with it.'\u001b[0m\n",
+       "\u001b[2;32m│   │   │   │   │   \u001b[0m\u001b[1m}\u001b[0m\n",
+       "\u001b[2;32m│   │   │   │   \u001b[0m\u001b[1m}\u001b[0m,\n",
+       "\u001b[2;32m│   │   │   │   \u001b[0m\u001b[1m{\u001b[0m\n",
+       "\u001b[2;32m│   │   │   │   │   \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.6\u001b[0m,\n",
+       "\u001b[2;32m│   │   │   │   │   \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n",
+       "\u001b[2;32m│   │   │   │   │   │   \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'B'\u001b[0m,\n",
+       "\u001b[2;32m│   │   │   │   │   │   \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m\"1. **Expert Answer Analysis**: The expert answer provides a method to ensure only LoRA parameters are trainable by using torchtune's utility functions. It mentions fetching LoRA parameters with `get_adapter_params\u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_model\u001b[0m\u001b[32m)\u001b[0m\u001b[32m` and setting them as trainable with `set_trainable_params\u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_model, lora_params\u001b[0m\u001b[32m)\u001b[0m\u001b[32m`. It also notes that the LoRA recipe handles this automatically.\\n\\n2. **Submitted Answer Analysis**: The submitted answer provides a similar method using `set_trainable_params` to set the `requires_grad` attribute of LoRA parameters to `True` and other parameters to `False`. It includes a code example demonstrating this process. Additionally, it mentions using the `lora_finetune` recipe in torchtune, which automatically sets the LoRA parameters to trainable.\\n\\n3. **Comparison**: The submitted answer includes all the details from the expert answer regarding the use of `get_adapter_params` and `set_trainable_params`. It also provides additional information about setting the `requires_grad` attribute and using the `lora_finetune` recipe, which is not mentioned in the expert answer.\\n\\n4. **Conclusion**: The submitted answer is a superset of the expert answer as it contains all the information from the expert answer and additional details. There is no conflict between the two answers, and the additional information in the submission is consistent with the expert's explanation.\\n\\nTherefore, the correct choice is \u001b[0m\u001b[32m(\u001b[0m\u001b[32mB\u001b[0m\u001b[32m)\u001b[0m\u001b[32m The submitted answer is a superset of the expert answer and is fully consistent with it.\"\u001b[0m\n",
+       "\u001b[2;32m│   │   │   │   │   \u001b[0m\u001b[1m}\u001b[0m\n",
+       "\u001b[2;32m│   │   │   │   \u001b[0m\u001b[1m}\u001b[0m\n",
+       "\u001b[2;32m│   │   │   \u001b[0m\u001b[1m]\u001b[0m\n",
+       "\u001b[2;32m│   │   \u001b[0m\u001b[1m)\u001b[0m\n",
+       "\u001b[2;32m│   \u001b[0m\u001b[1m}\u001b[0m\n",
+       "\u001b[1m)\u001b[0m\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "eval_rows = []\n",
+    "for i, session_id in enumerate(rag_attachment_agent.sessions):\n",
+    "    session_response = client.agents.session.retrieve(agent_id=rag_attachment_agent.agent_id, session_id=session_id)\n",
+    "    for turn in session_response.turns:\n",
+    "        eval_rows.append({\n",
+    "            \"input_query\": examples[i][\"input_query\"],\n",
+    "            \"expected_answer\": examples[i][\"expected_answer\"],\n",
+    "            \"generated_answer\": turn.output_message.content,\n",
+    "        })\n",
+    "\n",
+    "scoring_params = {\n",
+    "    \"braintrust::factuality\": None,\n",
+    "}\n",
+    "scoring_response = client.scoring.score(\n",
+    "    input_rows=eval_rows,\n",
+    "    scoring_functions=scoring_params,\n",
+    ")\n",
+    "pprint(scoring_response)"
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},
--- a/docs/openapi_generator/README.md
+++ b/docs/openapi_generator/README.md
@ -1,9 +1 @@
 The RFC Specification (OpenAPI format) is generated from the set of API endpoints located in `llama_stack/distribution/server/endpoints.py` using the `generate.py` utility.
-
-Please install the following packages before running the script:
-
-```
-pip install fire PyYAML
-```
-
-Then simply run `sh run_openapi_generator.sh`
--- a/docs/openapi_generator/generate.py
+++ b/docs/openapi_generator/generate.py
@ -12,7 +12,7 @@

 from datetime import datetime
 from pathlib import Path
-
+import sys
 import fire
 import ruamel.yaml as yaml

@ -21,7 +21,7 @@ from llama_stack.distribution.stack import LlamaStack  # noqa: E402

 from .pyopenapi.options import Options  # noqa: E402
 from .pyopenapi.specification import Info, Server  # noqa: E402
-from .pyopenapi.utility import Specification  # noqa: E402
+from .pyopenapi.utility import Specification, validate_api  # noqa: E402


 def str_presenter(dumper, data):
@ -39,11 +39,19 @@ def main(output_dir: str):
    if not output_dir.exists():
        raise ValueError(f"Directory {output_dir} does not exist")

+    # Validate API protocols before generating spec
+    return_type_errors = validate_api()
+    if return_type_errors:
+        print("\nAPI Method Return Type Validation Errors:\n")
+        for error in return_type_errors:
+            print(error, file=sys.stderr)
+        sys.exit(1)
    now = str(datetime.now())
    print(
        "Converting the spec to YAML (openapi.yaml) and HTML (openapi.html) at " + now
    )
    print("")
+
    spec = Specification(
        LlamaStack,
        Options(
--- a/docs/openapi_generator/pyopenapi/generator.py
+++ b/docs/openapi_generator/pyopenapi/generator.py
@ -6,6 +6,7 @@

 import hashlib
 import ipaddress
+import types
 import typing
 from dataclasses import make_dataclass
 from typing import Any, Dict, Set, Union
@ -179,7 +180,7 @@ class ContentBuilder:
        "Creates the content subtree for a request or response."

        def is_iterator_type(t):
-            return "StreamChunk" in str(t)
+            return "StreamChunk" in str(t) or "OpenAIResponseObjectStream" in str(t)

        def get_media_type(t):
            if is_generic_list(t):
@ -189,7 +190,7 @@ class ContentBuilder:
            else:
                return "application/json"

-        if typing.get_origin(payload_type) is typing.Union:
+        if typing.get_origin(payload_type) in (typing.Union, types.UnionType):
            media_types = []
            item_types = []
            for x in typing.get_args(payload_type):
@ -457,9 +458,9 @@ class Generator:
                        "status": 400,
                        "title": "Bad Request",
                        "detail": "The request was invalid or malformed",
-                    }
+                    },
                )
-            }
+            },
        )

        self.responses["TooManyRequests429"] = Response(
@ -471,9 +472,9 @@ class Generator:
                        "status": 429,
                        "title": "Too Many Requests",
                        "detail": "You have exceeded the rate limit. Please try again later.",
-                    }
+                    },
                )
-            }
+            },
        )

        self.responses["InternalServerError500"] = Response(
@ -485,9 +486,9 @@ class Generator:
                        "status": 500,
                        "title": "Internal Server Error",
                        "detail": "An unexpected error occurred. Our team has been notified.",
-                    }
+                    },
                )
-            }
+            },
        )

        # Add a default error response for any unhandled error cases
@ -500,9 +501,9 @@ class Generator:
                        "status": 0,
                        "title": "Error",
                        "detail": "An unexpected error occurred",
-                    }
+                    },
                )
-            }
+            },
        )

    def _build_type_tag(self, ref: str, schema: Schema) -> Tag:
@ -519,7 +520,7 @@ class Generator:
        )

    def _build_extra_tag_groups(
-        self, extra_types: Dict[str, List[type]]
+        self, extra_types: Dict[str, Dict[str, type]]
    ) -> Dict[str, List[Tag]]:
        """
        Creates a dictionary of tag group captions as keys, and tag lists as values.
@ -532,9 +533,8 @@ class Generator:
        for category_name, category_items in extra_types.items():
            tag_list: List[Tag] = []

-            for extra_type in category_items:
-                name = python_type_to_name(extra_type)
-                schema = self.schema_builder.classdef_to_named_schema(name, extra_type)
+            for name, extra_type in category_items.items():
+                schema = self.schema_builder.classdef_to_schema(extra_type)
                tag_list.append(self._build_type_tag(name, schema))

            if tag_list:
@ -547,11 +547,14 @@ class Generator:
            "SyntheticDataGeneration",
            "PostTraining",
            "BatchInference",
-            "Files",
        ]:
            op.defining_class.__name__ = f"{op.defining_class.__name__} (Coming Soon)"
            print(op.defining_class.__name__)

+        # TODO (xiyan): temporary fix for datasetio inner impl + datasets api
+        # if op.defining_class.__name__ in ["DatasetIO"]:
+        #     op.defining_class.__name__ = "Datasets"
+
        doc_string = parse_type(op.func_ref)
        doc_params = dict(
            (param.name, param.description) for param in doc_string.params.values()
@ -598,7 +601,9 @@ class Generator:

        # data passed in request body as raw bytes cannot have request parameters
        if raw_bytes_request_body and op.request_params:
-            raise ValueError("Cannot have both raw bytes request body and request parameters")
+            raise ValueError(
+                "Cannot have both raw bytes request body and request parameters"
+            )

        # data passed in request body as raw bytes
        if raw_bytes_request_body:
@ -754,7 +759,7 @@ class Generator:
        )

        return Operation(
-            tags=[op.defining_class.__name__],
+            tags=[getattr(op.defining_class, "API_NAMESPACE", op.defining_class.__name__)],
            summary=None,
            # summary=doc_string.short_description,
            description=description,
@ -800,6 +805,8 @@ class Generator:
        operation_tags: List[Tag] = []
        for cls in endpoint_classes:
            doc_string = parse_type(cls)
+            if hasattr(cls, "API_NAMESPACE") and cls.API_NAMESPACE != cls.__name__:
+                continue
            operation_tags.append(
                Tag(
                    name=cls.__name__,
@ -858,7 +865,7 @@ class Generator:
        for caption, extra_tag_group in extra_tag_groups.items():
            tag_groups.append(
                TagGroup(
-                    name=self.options.map(caption),
+                    name=caption,
                    tags=sorted(tag.name for tag in extra_tag_group),
                )
            )
--- a/docs/openapi_generator/pyopenapi/template.html
+++ b/docs/openapi_generator/pyopenapi/template.html
@ -6,8 +6,8 @@
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>OpenAPI specification</title>
    <link href="https://fonts.googleapis.com/css?family=Montserrat:300,400,700|Roboto:300,400,700" rel="stylesheet">
-    <script type="module" src="https://unpkg.com/@stoplight/elements/web-components.min.js"></script>
-    <link rel="stylesheet" href="https://unpkg.com/@stoplight/elements/styles.min.css">
+    <script type="module" src="https://cdn.jsdelivr.net/npm/@stoplight/elements/web-components.min.js"></script>
+    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@stoplight/elements/styles.min.css">
    <style>
        body {
            margin: 0;
--- a/docs/openapi_generator/pyopenapi/utility.py
+++ b/docs/openapi_generator/pyopenapi/utility.py
@ -6,16 +6,18 @@

 import json
 import typing
+import inspect
 from pathlib import Path
 from typing import TextIO
+from typing import Any, List, Optional, Union, get_type_hints, get_origin, get_args

 from llama_stack.strong_typing.schema import object_to_json, StrictJsonType
+from llama_stack.distribution.resolver import api_protocol_map

 from .generator import Generator
 from .options import Options
 from .specification import Document

-
 THIS_DIR = Path(__file__).parent


@ -114,3 +116,147 @@ class Specification:
        )

        f.write(html)
+
+def is_optional_type(type_: Any) -> bool:
+    """Check whether a type annotation is a union that includes ``None``.
+
+    Both spellings are recognised: ``Optional[X]`` / ``Union[X, None]`` and the
+    PEP 604 form ``X | None``.
+
+    :param type_: The (already resolved) type annotation to inspect.
+    :returns: True when the annotation is a union with ``None`` among its members.
+    """
+    # Local import: the module header does not import `types`.
+    import types
+
+    origin = get_origin(type_)
+    # `get_origin` never returns `Optional`; `Optional[X]` reports `Union`, and
+    # `X | None` reports `types.UnionType` on interpreters that keep them distinct.
+    is_union = origin is Union or origin is types.UnionType
+    return is_union and type(None) in get_args(type_)
+
+
+def _validate_api_method_return_type(method) -> str | None:
+    """Reject methods whose return annotation is missing or Optional.
+
+    :param method: The API method (function object) to inspect.
+    :returns: An error fragment describing the problem, or None when the check passes.
+    """
+    annotations = get_type_hints(method)
+
+    if 'return' in annotations:
+        if is_optional_type(annotations['return']):
+            return "returns Optional type where a return value is mandatory"
+        return None
+
+    return "has no return type annotation"
+
+
+def _validate_api_method_doesnt_return_list(method) -> str | None:
+    """Reject methods whose return annotation is missing or is a bare list type.
+
+    :param method: The API method (function object) to inspect.
+    :returns: An error fragment describing the problem, or None when the check passes.
+    """
+    annotations = get_type_hints(method)
+    is_annotated = 'return' in annotations
+
+    if is_annotated and get_origin(annotations['return']) is list:
+        return "returns a list where a PaginatedResponse or List*Response object is expected"
+
+    return None if is_annotated else "has no return type annotation"
+
+
+def _validate_api_delete_method_returns_none(method) -> str | None:
+    """Require that a method is annotated as returning ``None``.
+
+    :param method: The API method (function object) to inspect.
+    :returns: An error fragment when the annotation is missing or is not None,
+        otherwise None.
+    """
+    annotations = get_type_hints(method)
+
+    if 'return' not in annotations:
+        return "has no return type annotation"
+
+    declared = annotations['return']
+    # Accept both the literal `None` and the `NoneType` class as "returns None".
+    declared_is_none = declared is None or declared is type(None)
+    if not declared_is_none:
+        return "does not return None where None is mandatory"
+
+    return None
+
+
+def _validate_list_parameters_contain_data(method) -> str | None:
+    """Check that a ``List*`` return class exposes a ``data`` field.
+
+    Only return annotations that are classes whose name starts with ``List``
+    are inspected; every other annotation passes.
+
+    :param method: The API method (function object) to inspect.
+    :returns: An error fragment when the annotation is missing or the class has
+        no ``data`` entry in ``model_fields``, otherwise None.
+    """
+    hints = get_type_hints(method)
+
+    if 'return' not in hints:
+        return "has no return type annotation"
+
+    return_type = hints['return']
+    if not inspect.isclass(return_type):
+        return None
+
+    if not return_type.__name__.startswith('List'):
+        return None
+
+    # A class without `model_fields` (presumably a non-pydantic type - TODO confirm)
+    # is reported as a validation error instead of aborting with AttributeError.
+    if 'data' not in getattr(return_type, 'model_fields', {}):
+        return "does not have a mandatory data attribute containing the list of objects"
+
+    return None
+
+
+def _validate_has_ellipsis(method) -> str | None:
+    """Require that the method's source text contains ``...`` or ``NotImplementedError``.
+
+    :param method: The API method (function object) whose source is scanned.
+    :returns: An error fragment when neither marker appears, otherwise None.
+    """
+    text = inspect.getsource(method)
+    looks_like_stub = "..." in text or "NotImplementedError" in text
+    return None if looks_like_stub else "does not contain ellipsis (...) in its implementation"
+
+def _validate_has_return_in_docstring(method) -> str | None:
+    """Require a ``:returns:`` marker in the source of methods that return a value.
+
+    Methods with no return annotation, or annotated as returning None, pass.
+
+    :param method: The API method (function object) whose source is scanned.
+    :returns: An error fragment when the marker is missing, otherwise None.
+    """
+    text = inspect.getsource(method)
+    declared = method.__annotations__.get('return')
+    returns_value = declared is not None and declared != type(None)
+    if returns_value and ":returns:" not in text:
+        return "does not have a ':returns:' in its docstring"
+    return None
+
+def _validate_has_params_in_docstring(method) -> str | None:
+    """Require a ``:param`` marker in the source of methods taking several parameters.
+
+    :param method: The API method (function object) whose source is scanned.
+    :returns: An error fragment when the marker is missing, otherwise None.
+    """
+    text = inspect.getsource(method)
+    param_count = len(inspect.signature(method).parameters)
+    # Methods with at most one parameter are exempt from the check.
+    if param_count <= 1 or ":param" in text:
+        return None
+    return "does not have a ':param' in its docstring"
+
+def _validate_has_no_return_none_in_docstring(method) -> str | None:
+    """Flag a redundant ``:returns: None`` marker on methods whose return annotation is None.
+
+    A missing return annotation is treated the same as an explicit ``None``.
+
+    :param method: The API method (function object) whose source is scanned.
+    :returns: An error fragment when the redundant marker is present, otherwise None.
+    """
+    text = inspect.getsource(method)
+    if method.__annotations__.get('return') is not None:
+        return None
+    if ":returns: None" in text:
+        return "has a ':returns: None' in its docstring which is redundant for None-returning functions"
+    return None
+
+def _validate_docstring_lines_end_with_dot(method) -> str | None:
+    """Require every non-blank docstring line to end with an accepted character.
+
+    Accepted endings are ``. : { } [ ] ( ) " ,``. Methods without a docstring pass.
+
+    :param method: The API method (function object) whose docstring is checked.
+    :returns: An error fragment quoting the first offending line, otherwise None.
+    """
+    doc = inspect.getdoc(method)
+    if doc is None:
+        return None
+
+    accepted_endings = tuple('.:{}[]()",')
+    for raw_line in doc.split('\n'):
+        text = raw_line.strip()
+        if not text or text.endswith(accepted_endings):
+            continue
+        return f"docstring line '{text}' does not end with a valid character: . : {{ }} [ ] ( ) , \""
+
+    return None
+
+# Registry of validators to run per HTTP method. Each key is compared against
+# `webmethod.method` in `_get_methods_by_type`; each value is the ordered list of
+# validator callables that `validate_api` applies to every matching method.
+# A validator returns an error fragment (str) on failure and None on success.
+# NOTE(review): the DELETE list omits `_validate_docstring_lines_end_with_dot`,
+# unlike GET and POST - confirm this is intentional.
+_VALIDATORS = {
+    "GET": [
+        _validate_api_method_return_type,
+        _validate_list_parameters_contain_data,
+        _validate_api_method_doesnt_return_list,
+        _validate_has_ellipsis,
+        _validate_has_return_in_docstring,
+        _validate_has_params_in_docstring,
+        _validate_docstring_lines_end_with_dot,
+    ],
+    "DELETE": [
+        _validate_api_delete_method_returns_none,
+        _validate_has_ellipsis,
+        _validate_has_return_in_docstring,
+        _validate_has_params_in_docstring,
+        _validate_has_no_return_none_in_docstring
+    ],
+    "POST": [
+        _validate_has_ellipsis,
+        _validate_has_return_in_docstring,
+        _validate_has_params_in_docstring,
+        _validate_has_no_return_none_in_docstring,
+        _validate_docstring_lines_end_with_dot,
+    ],
+}
+
+
+def _get_methods_by_type(protocol, method_type: str):
+    """Collect the protocol's functions whose web-method marker matches an HTTP method.
+
+    :param protocol: The class whose function members are scanned.
+    :param method_type: The value compared against ``__webmethod__.method``.
+    :returns: A dict mapping member name to function for every match.
+    """
+    matches = {}
+    for name, func in inspect.getmembers(protocol, predicate=inspect.isfunction):
+        marker = getattr(func, '__webmethod__', None)
+        # Functions without a (truthy) `__webmethod__` attribute are skipped.
+        if marker and marker.method == method_type:
+            matches[name] = func
+    return matches
+
+
+def validate_api() -> List[str]:
+    """Validate the API protocols.
+
+    Runs every validator registered in ``_VALIDATORS`` against the matching web
+    methods of each protocol returned by ``api_protocol_map()``.
+
+    :returns: One message per failed check, formatted as
+        ``Method <protocol>.<method> <error>``; empty when every check passes.
+    """
+    errors = []
+    protocols = api_protocol_map()
+
+    for target, validators in _VALIDATORS.items():
+        for protocol_name, protocol in protocols.items():
+            # The matching methods do not depend on the validator, so look them
+            # up once per protocol rather than once per validator.
+            methods = _get_methods_by_type(protocol, target)
+            for validator in validators:
+                for method_name, method in methods.items():
+                    err = validator(method)
+                    if err:
+                        errors.append(f"Method {protocol_name}.{method_name} {err}")
+
+    return errors
--- a/docs/readme.md
+++ b/docs/readme.md
@ -2,6 +2,14 @@

 Here's a collection of comprehensive guides, examples, and resources for building AI applications with Llama Stack. For the complete documentation, visit our [ReadTheDocs page](https://llama-stack.readthedocs.io/en/latest/index.html).

+## Render locally
+
+From the llama-stack root directory, run the following command to render the docs locally:
+```bash
+uv run --group docs sphinx-autobuild docs/source docs/build/html --write-all
+```
+You can open up the docs in your browser at http://localhost:8000
+
 ## Content

 Try out Llama Stack's capabilities through our detailed Jupyter notebooks:
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@ -1,14 +0,0 @@
-sphinx==8.1.3
-myst-parser
-linkify
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
-sphinx-rtd-theme>=1.0.0
-sphinx-pdj-theme
-sphinx-copybutton
-sphinx-tabs
-sphinx-design
-sphinxcontrib-openapi
-sphinxcontrib-redoc
-sphinxcontrib-mermaid
-sphinxcontrib-video
-tomli
--- a/Show more
+++ b/Show more
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/bedrock/build.yaml`
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/bedrock/run.yaml`
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/cerebras/build.yaml`
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/cerebras/run.yaml`
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/fireworks/build.yaml`
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/fireworks/run.yaml`
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/meta-reference-gpu/build.yaml`
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/ollama/build.yaml`
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/ollama/run-with-safety.yaml`
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/nvidia/build.yaml`