Merge pull request #1 from meta-llama/main

Merging upstream changes
cdgamarose-nv 2025-02-13 11:16:22 -08:00 committed by GitHub
commit eb1c5e86fe
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
389 changed files with 10041 additions and 7739 deletions

.github/CODEOWNERS

@ -2,4 +2,4 @@
# These owners will be the default owners for everything in
# the repo. Unless a later match takes precedence,
* @ashwinb @yanxi0830 @hardikjshah @dltn @raghotham @dineshyv @vladimirivic @sixianyi0721
* @ashwinb @yanxi0830 @hardikjshah @dltn @raghotham @dineshyv @vladimirivic @sixianyi0721 @ehhuang @terrytangyuan


@ -1,6 +1,6 @@
name: 🐛 Bug Report
description: Create a report to help us reproduce and fix the bug
labels: ["bug"]
body:
- type: markdown
attributes:

.github/ISSUE_TEMPLATE/config.yml

@ -0,0 +1,12 @@
blank_issues_enabled: false
contact_links:
- name: Have you read the docs?
url: https://llama-stack.readthedocs.io/en/latest/index.html
about: Much help can be found in the docs
- name: Start a discussion
url: https://github.com/meta-llama/llama-stack/discussions/new
about: Start a discussion on a topic
- name: Chat on Discord
url: https://discord.gg/llama-stack
about: Maybe chatting with the community can help


@ -1,6 +1,6 @@
name: 🚀 Feature request
description: Request a new llama-stack feature
labels: ["enhancement"]
body:
- type: textarea
id: feature-pitch


@ -1,27 +1,10 @@
# What does this PR do?
[Provide a short summary of what this PR does and why. Link to relevant issues if applicable.]
In short, provide a summary of what this PR does and why. Usually, the relevant context should be present in a linked issue.
- [ ] Addresses issue (#issue)
[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])
## Test Plan
[Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.*]
Please describe:
- tests you ran to verify your changes with result summaries.
- provide instructions so it can be reproduced.
## Sources
Please link relevant resources if necessary.
## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Ran pre-commit to handle lint / formatting issues.
- [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md),
Pull Request section?
- [ ] Updated relevant documentation.
- [ ] Wrote necessary unit or integration tests.
[//]: # (## Documentation)


@ -11,10 +11,10 @@ jobs:
steps:
- name: Checkout code
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
uses: actions/setup-python@v5
with:
python-version: '3.11'
cache: pip
@ -22,4 +22,8 @@ jobs:
**/requirements*.txt
.pre-commit-config.yaml
- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd #v3.0.1
- uses: pre-commit/action@v3.0.1
- name: Verify if there are any diff files after pre-commit
run: |
git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1)
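If this check fails in CI, the intended remedy is the one the message points at: run the hooks locally, commit whatever they change, and push again. A minimal local sketch, assuming the `uv`-based setup described later in CONTRIBUTING.md:

```bash
# Run every pre-commit hook across the whole tree, then verify nothing was left modified.
uv run pre-commit run --all-files
git diff --exit-code || echo "pre-commit modified files; review and commit them before pushing"
```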


@ -1,148 +0,0 @@
name: Docker Build and Publish
on:
workflow_dispatch:
inputs:
version:
description: 'TestPyPI or PyPI version to build (e.g., 0.0.63.dev20250114)'
required: true
type: string
jobs:
build-and-push:
runs-on: ubuntu-latest
env:
TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
TAVILY_SEARCH_API_KEY: ${{ secrets.TAVILY_SEARCH_API_KEY }}
permissions:
contents: read
packages: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to the Container registry
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Set version
id: version
run: |
if [ "${{ github.event_name }}" = "push" ]; then
echo "VERSION=0.0.63.dev51206766" >> $GITHUB_OUTPUT
else
echo "VERSION=${{ inputs.version }}" >> $GITHUB_OUTPUT
fi
- name: Check package version availability
run: |
# Function to check if version exists in a repository
check_version() {
local repo=$1
local VERSION_TO_CHECK=${{ steps.version.outputs.version }}
echo "Checking version $VERSION_TO_CHECK in $repo"
result=$(curl -s "https://$repo.org/pypi/llama-stack/json" | jq --arg v "$VERSION_TO_CHECK" '.releases | has($v)')
echo "Result: $result"
return $([ "$result" = "true" ])
}
# Check TestPyPI first, then PyPI
if check_version "test.pypi"; then
echo "Version ${{ steps.version.outputs.version }} found in TestPyPI"
echo "PYPI_SOURCE=testpypi" >> $GITHUB_ENV
elif check_version "pypi"; then
echo "Version ${{ steps.version.outputs.version }} found in PyPI"
echo "PYPI_SOURCE=pypi" >> $GITHUB_ENV
else
echo "Error: Version ${{ steps.version.outputs.version }} not found in either TestPyPI or PyPI"
exit 1
fi
- name: Install llama-stack
run: |
echo "PYPI_SOURCE=${PYPI_SOURCE}"
if [ "${{ github.event_name }}" = "push" ]; then
pip install -e .
else
if [ "$PYPI_SOURCE" = "testpypi" ]; then
pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple llama-stack==${{ steps.version.outputs.version }}
else
pip install llama-stack==${{ steps.version.outputs.version }}
fi
fi
- name: Build docker image
run: |
echo "PYPI_SOURCE=${PYPI_SOURCE}"
echo "VERSION=${{ steps.version.outputs.version }}"
TEMPLATES=("ollama" "bedrock" "remote-vllm" "fireworks" "together" "tgi" "meta-reference-gpu")
for template in "${TEMPLATES[@]}"; do
if [ "$PYPI_SOURCE" = "testpypi" ]; then
TEST_PYPI_VERSION=${{ steps.version.outputs.version }} llama stack build --template $template --image-type container
else
PYPI_VERSION=${{ steps.version.outputs.version }} llama stack build --template $template --image-type container
fi
done
- name: List docker images
run: |
docker images
# TODO (xiyan): make the following 2 steps into a matrix and test all templates other than fireworks
- name: Start up built docker image
run: |
cd distributions/fireworks
if [ "$PYPI_SOURCE" = "testpypi" ]; then
sed -i 's|image: llamastack/distribution-fireworks|image: distribution-fireworks:test-${{ steps.version.outputs.version }}|' ./compose.yaml
else
sed -i 's|image: llamastack/distribution-fireworks|image: distribution-fireworks:${{ steps.version.outputs.version }}|' ./compose.yaml
fi
docker compose up -d
cd ..
# Wait for the container to start
timeout=300
while ! curl -s -f http://localhost:8321/v1/version > /dev/null && [ $timeout -gt 0 ]; do
echo "Waiting for endpoint to be available..."
sleep 5
timeout=$((timeout - 5))
done
if [ $timeout -le 0 ]; then
echo "Timeout waiting for endpoint to become available"
exit 1
fi
- name: Run simple models list test on docker server
run: |
curl http://localhost:8321/v1/models
# TODO (xiyan): figure out why client cannot find server but curl works
# - name: Run pytest on docker server
# run: |
# pip install pytest pytest-md-report
# export LLAMA_STACK_BASE_URL="http://localhost:8321"
# LLAMA_STACK_BASE_URL="http://localhost:8321" pytest -v tests/client-sdk/inference/test_inference.py --md-report --md-report-verbose=1
- name: Push to dockerhub
run: |
echo "PYPI_SOURCE=${PYPI_SOURCE}"
echo "VERSION=${{ steps.version.outputs.version }}"
TEMPLATES=("ollama" "bedrock" "remote-vllm" "fireworks" "together" "tgi" "meta-reference-gpu")
for template in "${TEMPLATES[@]}"; do
if [ "$PYPI_SOURCE" = "testpypi" ]; then
docker tag distribution-$template:test-${{ steps.version.outputs.version }} llamastack/distribution-$template:test-${{ steps.version.outputs.version }}
docker push llamastack/distribution-$template:test-${{ steps.version.outputs.version }}
else
docker tag distribution-$template:${{ steps.version.outputs.version }} llamastack/distribution-$template:${{ steps.version.outputs.version }}
docker tag distribution-$template:${{ steps.version.outputs.version }} llamastack/distribution-$template:latest
docker push llamastack/distribution-$template:${{ steps.version.outputs.version }}
docker push llamastack/distribution-$template:latest
fi
done


@ -1,244 +0,0 @@
name: Publish Python 🐍 distribution 📦 to TestPyPI
on:
workflow_dispatch: # Keep manual trigger
inputs:
version:
description: 'Version number (e.g. 0.0.63.dev20250111)'
required: true
type: string
schedule:
- cron: "0 0 * * *" # Run every day at midnight
jobs:
trigger-client-and-models-build:
name: Trigger llama-stack-client and llama-models build
runs-on: ubuntu-latest
outputs:
version: ${{ steps.version.outputs.version }}
client_run_id: ${{ steps.trigger-client.outputs.workflow_id }}
model_run_id: ${{ steps.trigger-models.outputs.workflow_id }}
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- name: Get date
id: date
run: echo "date=$(date +'%Y%m%d')" >> $GITHUB_OUTPUT
- name: Compute version based on dispatch event
id: version
run: |
# Read base version from pyproject.toml
version=$(sed -n 's/.*version="\([^"]*\)".*/\1/p' setup.py)
if [ "${{ github.event_name }}" = "schedule" ]; then
echo "version=${version}.dev${{ steps.date.outputs.date }}" >> $GITHUB_OUTPUT
elif [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
echo "version=${{ inputs.version }}" >> $GITHUB_OUTPUT
else
echo "version=${version}.dev$(shuf -i 10000000-99999999 -n 1)" >> $GITHUB_OUTPUT
fi
- name: Trigger llama-stack-client workflow
id: trigger-client
run: |
response=$(curl -X POST https://api.github.com/repos/meta-llama/llama-stack-client-python/dispatches \
-H 'Accept: application/vnd.github.everest-preview+json' \
-H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
--data "{\"event_type\": \"build-client-package\", \"client_payload\": {\"source\": \"llama-stack-nightly\", \"version\": \"${{ steps.version.outputs.version }}\"}}" \
-w "\n%{http_code}")
http_code=$(echo "$response" | tail -n1)
if [ "$http_code" != "204" ]; then
echo "Failed to trigger client workflow"
exit 1
fi
# Get the run ID of the triggered workflow
sleep 5 # Wait for workflow to be created
run_id=$(curl -s -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
"https://api.github.com/repos/meta-llama/llama-stack-client-python/actions/runs?event=repository_dispatch" \
| jq '.workflow_runs[0].id')
echo "workflow_id=$run_id" >> $GITHUB_OUTPUT
- name: Trigger llama-models workflow
id: trigger-models
run: |
response=$(curl -X POST https://api.github.com/repos/meta-llama/llama-models/dispatches \
-H 'Accept: application/vnd.github.everest-preview+json' \
-H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
--data "{\"event_type\": \"build-models-package\", \"client_payload\": {\"source\": \"llama-stack-nightly\", \"version\": \"${{ steps.version.outputs.version }}\"}}" \
-w "\n%{http_code}")
http_code=$(echo "$response" | tail -n1)
if [ "$http_code" != "204" ]; then
echo "Failed to trigger models workflow"
exit 1
fi
# Get the run ID of the triggered workflow
sleep 5 # Wait for workflow to be created
run_id=$(curl -s -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
"https://api.github.com/repos/meta-llama/llama-models/actions/runs?event=repository_dispatch" \
| jq '.workflow_runs[0].id')
echo "workflow_id=$run_id" >> $GITHUB_OUTPUT
wait-for-workflows:
name: Wait for triggered workflows
needs: trigger-client-and-models-build
runs-on: ubuntu-latest
steps:
- name: Wait for client workflow
run: |
while true; do
status=$(curl -s -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
"https://api.github.com/repos/meta-llama/llama-stack-client-python/actions/runs/${{ needs.trigger-client-and-models-build.outputs.client_run_id }}" \
| jq -r '.status')
conclusion=$(curl -s -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
"https://api.github.com/repos/meta-llama/llama-stack-client-python/actions/runs/${{ needs.trigger-client-and-models-build.outputs.client_run_id }}" \
| jq -r '.conclusion')
echo "llama-stack-client-python workflow status: $status, conclusion: $conclusion"
if [ "$status" = "completed" ]; then
if [ "$conclusion" != "success" ]; then
echo "llama-stack-client-python workflow failed"
exit 1
fi
break
fi
sleep 10
done
- name: Wait for models workflow
run: |
while true; do
status=$(curl -s -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
"https://api.github.com/repos/meta-llama/llama-models/actions/runs/${{ needs.trigger-client-and-models-build.outputs.model_run_id }}" \
| jq -r '.status')
conclusion=$(curl -s -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
"https://api.github.com/repos/meta-llama/llama-models/actions/runs/${{ needs.trigger-client-and-models-build.outputs.model_run_id }}" \
| jq -r '.conclusion')
echo "llama-models workflow status: $status, conclusion: $conclusion"
if [ "$status" = "completed" ]; then
if [ "$conclusion" != "success" ]; then
echo "llama-models workflow failed"
exit 1
fi
break
fi
sleep 10
done
build:
name: Build distribution 📦
needs:
- wait-for-workflows
- trigger-client-and-models-build
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- name: Get date
id: date
run: echo "date=$(date +'%Y%m%d')" >> $GITHUB_OUTPUT
- name: Update version for nightly
run: |
sed -i 's/version="\([^"]*\)"/version="${{ needs.trigger-client-and-models-build.outputs.version }}"/' setup.py
sed -i 's/llama-stack-client>=\([^"]*\)/llama-stack-client==${{ needs.trigger-client-and-models-build.outputs.version }}/' requirements.txt
sed -i 's/llama-models>=\([^"]*\)/llama-models==${{ needs.trigger-client-and-models-build.outputs.version }}/' requirements.txt
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Install pypa/build
run: >-
python3 -m
pip install
build
--user
- name: Build a binary wheel and a source tarball
run: python3 -m build
- name: Store the distribution packages
uses: actions/upload-artifact@v4
with:
name: python-package-distributions
path: dist/
publish-to-testpypi:
name: Publish Python 🐍 distribution 📦 to TestPyPI
needs:
- build
runs-on: ubuntu-latest
environment:
name: testrelease
url: https://test.pypi.org/p/llama-stack
permissions:
id-token: write # IMPORTANT: mandatory for trusted publishing
steps:
- name: Download all the dists
uses: actions/download-artifact@v4
with:
name: python-package-distributions
path: dist/
- name: Publish distribution 📦 to TestPyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:
repository-url: https://test.pypi.org/legacy/
test-published-package:
name: Test published package
needs:
- publish-to-testpypi
- trigger-client-and-models-build
runs-on: ubuntu-latest
env:
TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
TAVILY_SEARCH_API_KEY: ${{ secrets.TAVILY_SEARCH_API_KEY }}
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- name: Install the package
run: |
max_attempts=6
attempt=1
while [ $attempt -le $max_attempts ]; do
echo "Attempt $attempt of $max_attempts to install package..."
if pip install --no-cache --index-url https://pypi.org/simple/ --extra-index-url https://test.pypi.org/simple/ llama-stack==${{ needs.trigger-client-and-models-build.outputs.version }}; then
echo "Package installed successfully"
break
fi
if [ $attempt -ge $max_attempts ]; then
echo "Failed to install package after $max_attempts attempts"
exit 1
fi
attempt=$((attempt + 1))
sleep 10
done
- name: Test the package versions
run: |
pip list | grep llama_
- name: Test CLI commands
run: |
llama model list
llama stack build --list-templates
llama model prompt-format -m Llama3.2-11B-Vision-Instruct
llama stack list-apis
llama stack list-providers inference
llama stack list-providers telemetry
- name: Test Notebook
run: |
pip install pytest nbval
llama stack build --template together --image-type venv
pytest -v -s --nbval-lax ./docs/getting_started.ipynb
pytest -v -s --nbval-lax ./docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
# TODO: add trigger for integration test workflow & docker builds

.github/workflows/semantic-pr.yml

@ -0,0 +1,21 @@
name: Check semantic PR titles
on:
pull_request_target:
types:
- opened
- edited
- reopened
- synchronize
permissions:
contents: read
jobs:
title-check:
runs-on: ubuntu-latest
steps:
- name: Check PR Title's semantic conformance
uses: amannn/action-semantic-pull-request@v5
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/tests.yml

@ -0,0 +1,69 @@
name: auto-tests
on:
# pull_request:
workflow_dispatch:
inputs:
commit_sha:
description: 'Specific Commit SHA to trigger on'
required: false
default: $GITHUB_SHA # default to the last commit of $GITHUB_REF branch
jobs:
test-llama-stack-as-library:
runs-on: ubuntu-latest
env:
TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
TAVILY_SEARCH_API_KEY: ${{ secrets.TAVILY_SEARCH_API_KEY }}
strategy:
matrix:
provider: [fireworks, together]
steps:
- uses: actions/checkout@v4
with:
ref: ${{ github.event.inputs.commit_sha }}
- name: Echo commit SHA
run: |
echo "Triggered on commit SHA: ${{ github.event.inputs.commit_sha }}"
git rev-parse HEAD
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt pytest
pip install -e .
- name: Build providers
run: |
llama stack build --template ${{ matrix.provider }} --image-type venv
- name: Install the latest llama-stack-client & llama-models packages
run: |
pip install -e git+https://github.com/meta-llama/llama-stack-client-python.git#egg=llama-stack-client
pip install -e git+https://github.com/meta-llama/llama-models.git#egg=llama-models
- name: Run client-sdk test
working-directory: "${{ github.workspace }}"
env:
REPORT_OUTPUT: md_report.md
shell: bash
run: |
pip install --upgrade pytest-md-report
echo "REPORT_FILE=${REPORT_OUTPUT}" >> "$GITHUB_ENV"
export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
LLAMA_STACK_CONFIG=./llama_stack/templates/${{ matrix.provider }}/run.yaml pytest --md-report --md-report-verbose=1 ./tests/client-sdk/inference/ --md-report-output "$REPORT_OUTPUT"
- name: Output reports to the job summary
if: always()
shell: bash
run: |
if [ -f "$REPORT_FILE" ]; then
echo "<details><summary> Test Report for ${{ matrix.provider }} </summary>" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
cat "$REPORT_FILE" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "</details>" >> $GITHUB_STEP_SUMMARY
fi


@ -0,0 +1,40 @@
name: Update ReadTheDocs
on:
workflow_dispatch:
inputs:
branch:
description: 'RTD version to update'
required: false
default: 'latest'
push:
branches:
- main
paths:
- 'docs/source/**'
- 'docs/resources/**'
- '.github/workflows/update-readthedocs.yml'
jobs:
update-readthedocs:
runs-on: ubuntu-latest
env:
TOKEN: ${{ secrets.READTHEDOCS_TOKEN }}
steps:
- name: Trigger ReadTheDocs build
run: |
if [ -z "$TOKEN" ]; then
echo "READTHEDOCS_TOKEN is not set"
exit 1
fi
response=$(curl -X POST \
-H "Content-Type: application/json" \
-d "{\"token\": \"$TOKEN\"}" \
https://readthedocs.org/api/v2/webhook/llama-stack/289768/)
echo "Response: $response"
if [ $(echo $response | jq -r '.build_triggered') != 'true' ]; then
echo "Failed to trigger ReadTheDocs build"
exit 1
fi

.gitignore

@ -19,3 +19,4 @@ Package.resolved
_build
docs/src
pyrightconfig.json
venv/


@ -5,10 +5,8 @@ default_language_version:
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: 6306a48f7dae5861702d573c9c247e4e9498e867
rev: v5.0.0 # Latest stable version
hooks:
- id: trailing-whitespace
- id: check-ast
- id: check-merge-conflict
- id: check-added-large-files
args: ['--maxkb=1000']
@ -28,23 +26,41 @@ repos:
- --license-filepath
- docs/license_header.txt
- repo: https://github.com/pycqa/flake8
rev: 34cbf8ef3950f43d09b85e2e45c15ae5717dc37b
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.9.4
hooks:
- id: flake8
additional_dependencies:
- flake8-bugbear == 22.4.25
- pep8-naming == 0.12.1
- torchfix
args: ['--config=.flake8']
# Run the linter with import sorting.
- id: ruff
args: [
--fix,
--exit-non-zero-on-fix,
--select, I,
]
- id: ruff-format
- repo: https://github.com/omnilib/ufmt
rev: v2.7.0
- repo: https://github.com/adamchainz/blacken-docs
rev: 1.19.0
hooks:
- id: ufmt
- id: blacken-docs
additional_dependencies:
- black == 24.4.2
- usort == 1.0.8
- black==24.3.0
- repo: https://github.com/astral-sh/uv-pre-commit
rev: 0.5.26
hooks:
- id: uv-export
args: ["--frozen", "--no-hashes", "--no-emit-project"]
- id: uv-sync
# - repo: https://github.com/pre-commit/mirrors-mypy
# rev: v1.14.0
# hooks:
# - id: mypy
# additional_dependencies:
# - types-requests
# - types-setuptools
# - pydantic
# args: [--ignore-missing-imports]
# - repo: https://github.com/jsh9/pydoclint
# rev: d88180a8632bb1602a4d81344085cf320f288c5a
@ -71,3 +87,7 @@ repos:
# require_serial: true
# files: ^llama_stack/templates/.*$
# stages: [manual]
ci:
autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks
autoupdate_commit_msg: ⬆ [pre-commit.ci] pre-commit autoupdate


@ -1,7 +1,8 @@
[flake8]
# Suggested config from pytorch that we can adapt
select = B,C,E,F,N,P,T4,W,B9,TOR0,TOR1,TOR2
max-line-length = 120
lint.select = ["B", "C", "E" , "F" , "N", "W", "B9"]
line-length = 120
# C408 ignored because we like the dict keyword argument syntax
# E501 is not flexible enough, we're using B950 instead
# N812 ignored because import torch.nn.functional as F is PyTorch convention
@ -9,23 +10,28 @@ max-line-length = 120
# E731 allow usage of assigning lambda expressions
# E701 let black auto-format statements on one line
# E704 let black auto-format statements on one line
ignore =
E203,E305,E402,E501,E721,E741,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,N812,N817,E731,E701,E704
lint.ignore = [
"E203", "E305", "E402", "E501", "E721", "E741", "F405", "F821", "F841",
"C408", "E302", "W291", "E303", "N812", "N817", "E731", "E701",
# These are the additional ones we started ignoring after moving to ruff. We should look into each one of them later.
"C901", "C405", "C414", "N803", "N999", "C403", "C416", "B028", "C419", "C401", "B023",
# shebang has extra meaning in fbcode lints, so I think it's not worth trying
# to line this up with executable bit
EXE001,
"EXE001",
# random naming hints don't need
N802,
"N802",
# these ignores are from flake8-bugbear; please fix!
B007,B008,B950
optional-ascii-coding = True
exclude =
./.git,
./docs/*,
./build,
./scripts,
./venv,
*.pyi,
.pre-commit-config.yaml,
*.md,
.flake8
"B007", "B008"
]
exclude = [
"./.git",
"./docs/*",
"./build",
"./scripts",
"./venv",
"*.pyi",
".pre-commit-config.yaml",
"*.md",
".flake8"
]


@ -1,35 +0,0 @@
# Changelog
## 0.0.53
### Added
- Resource-oriented design for models, shields, memory banks, datasets and eval tasks
- Persistence for registered objects with distribution
- Ability to persist memory banks created for FAISS
- PostgreSQL KVStore implementation
- Environment variable placeholder support in run.yaml files
- Comprehensive Zero-to-Hero notebooks and quickstart guides
- Support for quantized models in Ollama
- Vision models support for Together, Fireworks, Meta-Reference, and Ollama, and vLLM
- Bedrock distribution with safety shields support
- Evals API with task registration and scoring functions
- MMLU and SimpleQA benchmark scoring functions
- Huggingface dataset provider integration for benchmarks
- Support for custom dataset registration from local paths
- Benchmark evaluation CLI tools with visualization tables
- RAG evaluation scoring functions and metrics
- Local persistence for datasets and eval tasks
### Changed
- Split safety into distinct providers (llama-guard, prompt-guard, code-scanner)
- Changed provider naming convention (`impls` → `inline`, `adapters` → `remote`)
- Updated API signatures for dataset and eval task registration
- Restructured folder organization for providers
- Enhanced Docker build configuration
- Added version prefixing for REST API routes
- Enhanced evaluation task registration workflow
- Improved benchmark evaluation output formatting
- Restructured evals folder organization for better modularity
### Removed
- `llama stack configure` command


@ -40,6 +40,7 @@ If you need help or guidance, comment on the issue. Issues that are extra friend
3. Ensure the test suite passes.
4. Make sure your code lints using `pre-commit`.
5. If you haven't already, complete the Contributor License Agreement ("CLA").
6. Ensure your pull request follows the [conventional commits format](https://www.conventionalcommits.org/en/v1.0.0/).
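For reference, titles accepted by the conventional commits format follow a `type: description` (optionally `type(scope): description`) pattern; the examples below are hypothetical and not taken from this repository:

```bash
# Hypothetical PR titles in the conventional commits format:
#   feat: add a new distribution template
#   fix(docs): repair a broken link in the getting started guide
#   chore: bump pre-commit hook versions
```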
## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need
@ -56,22 +57,50 @@ disclosure of security bugs. In those cases, please go through the process
outlined on that page and do not file a public issue.
## Set up your development environment
We use [uv](https://github.com/astral-sh/uv) to manage python dependencies and virtual environments.
You can install `uv` by following this [guide](https://docs.astral.sh/uv/getting-started/installation/).
You can install the dependencies by running:
```bash
$ cd llama-stack
$ uv sync --extra dev
$ uv pip install -e .
$ source .venv/bin/activate
```
## Pre-commit Hooks
We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running:
```bash
$ cd llama-stack
$ conda activate <your-environment>
$ pip install pre-commit
$ pre-commit install
$ uv run pre-commit install
```
After that, pre-commit hooks will run automatically before each commit.
Alternatively, if you don't want to install the pre-commit hooks, you can run the checks manually by running:
```bash
$ uv run pre-commit run --all-files
```
> [!CAUTION]
> Before pushing your changes, make sure that the pre-commit hooks have passed successfully.
## Adding a new dependency to the project
To add a new dependency to the project, you can use the `uv` command. For example, to add `foo` to the project, you can run:
```bash
$ uv add foo
$ uv sync
```
## Coding Style
* 2 spaces for indentation rather than tabs
* 4 spaces for indentation rather than tabs
* 80 character line length
* ...
@ -102,13 +131,12 @@ If you have made changes to a provider's configuration in any form (introducing
If you are making changes to the documentation at [https://llama-stack.readthedocs.io/en/latest/](https://llama-stack.readthedocs.io/en/latest/), you can use the following command to build the documentation and preview your changes. You will need [Sphinx](https://www.sphinx-doc.org/en/master/) and the readthedocs theme.
```bash
cd llama-stack/docs
pip install -r requirements.txt
pip install sphinx-autobuild
$ cd llama-stack/docs
$ uv sync --extra docs
# This will start a local server (usually at http://127.0.0.1:8000) that automatically rebuilds and refreshes when you make changes to the documentation.
make html
sphinx-autobuild source build/html
$ make html
$ uv run sphinx-autobuild source build/html
```


@ -1,4 +1,4 @@
include requirements.txt
include pyproject.toml
include distributions/dependencies.json
include llama_stack/distribution/*.sh
include llama_stack/cli/scripts/*.sh


@ -2,17 +2,18 @@
[![PyPI version](https://img.shields.io/pypi/v/llama_stack.svg)](https://pypi.org/project/llama_stack/)
[![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-stack)](https://pypi.org/project/llama-stack/)
[![License](https://img.shields.io/pypi/l/llama_stack.svg)](https://github.com/meta-llama/llama-stack/blob/main/LICENSE)
[![Discord](https://img.shields.io/discord/1257833999603335178)](https://discord.gg/llama-stack)
[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb)
Llama Stack defines and standardizes the core building blocks that simplify AI application development. It codified best practices across the Llama ecosystem. More specifically, it provides
Llama Stack standardizes the core building blocks that simplify AI application development. It codifies best practices across the Llama ecosystem. More specifically, it provides
- **Unified API layer** for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.
- **Plugin architecture** to support the rich ecosystem of implementations of the different APIs in different environments like local development, on-premises, cloud, and mobile.
- **Prepackaged verified distributions** which offer a one-stop solution for developers to get started quickly and reliably in any environment
- **Multiple developer interfaces** like CLI and SDKs for Python, Node, iOS, and Android
- **Standalone applications** as examples for how to build production-grade AI applications with Llama Stack
- **Plugin architecture** to support the rich ecosystem of different API implementations in various environments, including local development, on-premises, cloud, and mobile.
- **Prepackaged verified distributions** which offer a one-stop solution for developers to get started quickly and reliably in any environment.
- **Multiple developer interfaces** like CLI and SDKs for Python, Typescript, iOS, and Android.
- **Standalone applications** as examples for how to build production-grade AI applications with Llama Stack.
<div style="text-align: center;">
<img
@ -24,31 +25,31 @@ Llama Stack defines and standardizes the core building blocks that simplify AI a
</div>
### Llama Stack Benefits
- **Flexible Options**: Developers can choose their preferred infrastructure without changing APIs and enjoy flexible deployment choice.
- **Consistent Experience**: With its unified APIs Llama Stack makes it easier to build, test, and deploy AI applications with consistent application behavior.
- **Flexible Options**: Developers can choose their preferred infrastructure without changing APIs and enjoy flexible deployment choices.
- **Consistent Experience**: With its unified APIs, Llama Stack makes it easier to build, test, and deploy AI applications with consistent application behavior.
- **Robust Ecosystem**: Llama Stack is already integrated with distribution partners (cloud providers, hardware vendors, and AI-focused companies) that offer tailored infrastructure, software, and services for deploying Llama models.
By reducing friction and complexity, Llama Stack empowers developers to focus on what they do best: building transformative generative AI applications.
### API Providers
Here is a list of the various API providers and available distributions to developers started easily,
Here is a list of the various API providers and available distributions that can help developers get started easily with Llama Stack.
| **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** |
|:------------------------------------------------------------------------------------------:|:----------------------:|:------------------:|:------------------:|:------------------:|:------------------:|:------------------:|
| Meta Reference | Single Node | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
| SambaNova | Hosted | | :heavy_check_mark: | | | |
| Cerebras | Hosted | | :heavy_check_mark: | | | |
| Fireworks | Hosted | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | |
| AWS Bedrock | Hosted | | :heavy_check_mark: | | :heavy_check_mark: | |
| Together | Hosted | :heavy_check_mark: | :heavy_check_mark: | | :heavy_check_mark: | |
| Groq | Hosted | | :heavy_check_mark: | | | |
| Ollama | Single Node | | :heavy_check_mark: | | | |
| TGI | Hosted and Single Node | | :heavy_check_mark: | | | |
| NVIDIA NIM | Hosted and Single Node | | :heavy_check_mark: | | | |
| Chroma | Single Node | | | :heavy_check_mark: | | |
| PG Vector | Single Node | | | :heavy_check_mark: | | |
| PyTorch ExecuTorch | On-device iOS | :heavy_check_mark: | :heavy_check_mark: | | | |
| vLLM | Hosted and Single Node | | :heavy_check_mark: | | | |
| **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** |
|:------------------------:|:----------------------:|:----------:|:-------------:|:----------:|:----------:|:-------------:|
| Meta Reference | Single Node | ✅ | ✅ | ✅ | ✅ | ✅ |
| SambaNova | Hosted | | ✅ | | | |
| Cerebras | Hosted | | ✅ | | | |
| Fireworks | Hosted | ✅ | ✅ | ✅ | | |
| AWS Bedrock | Hosted | | ✅ | | ✅ | |
| Together | Hosted | ✅ | ✅ | | ✅ | |
| Groq | Hosted | | ✅ | | | |
| Ollama | Single Node | | ✅ | | | |
| TGI | Hosted and Single Node | | ✅ | | | |
| NVIDIA NIM | Hosted and Single Node | | ✅ | | | |
| Chroma | Single Node | | | ✅ | | |
| PG Vector | Single Node | | | ✅ | | |
| PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | |
| vLLM | Hosted and Single Node | | ✅ | | | |
### Distributions
@ -70,15 +71,15 @@ A Llama Stack Distribution (or "distro") is a pre-configured bundle of provider
You have two ways to install this repository:
1. **Install as a package**:
* **Install as a package**:
You can install the repository directly from [PyPI](https://pypi.org/project/llama-stack/) by running the following command:
```bash
pip install llama-stack
```
2. **Install from source**:
* **Install from source**:
If you prefer to install from the source code, make sure you have [conda installed](https://docs.conda.io/projects/conda/en/stable).
Then, follow these steps:
Then, run the following commands:
```bash
mkdir -p ~/local
cd ~/local
@ -95,10 +96,11 @@ You have two ways to install this repository:
Please checkout our [Documentation](https://llama-stack.readthedocs.io/en/latest/index.html) page for more details.
* [CLI reference](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/index.html)
* Guide using `llama` CLI to work with Llama models (download, study prompts), and building/starting a Llama Stack distribution.
* [Getting Started](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html)
* Quick guide to start a Llama Stack server.
* CLI references
* [llama (server-side) CLI Reference](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/index.html): Guide for using the `llama` CLI to work with Llama models (download, study prompts), and building/starting a Llama Stack distribution.
* [llama (client-side) CLI Reference](https://llama-stack.readthedocs.io/en/latest/references/llama_stack_client_cli_reference.html): Guide for using the `llama-stack-client` CLI, which allows you to query information about the distribution.
* Getting Started
* [Quick guide to start a Llama Stack server](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html).
* [Jupyter notebook](./docs/getting_started.ipynb) to walk-through how to use simple text and vision inference llama_stack_client APIs
* The complete Llama Stack lesson [Colab notebook](https://colab.research.google.com/drive/1dtVmxotBsI4cGZQNsJRYPrLiDeT0Wnwt) of the new [Llama 3.2 course on Deeplearning.ai](https://learn.deeplearning.ai/courses/introducing-multimodal-llama-3-2/lesson/8/llama-stack).
* A [Zero-to-Hero Guide](https://github.com/meta-llama/llama-stack/tree/main/docs/zero_to_hero_guide) that guide you through all the key components of llama stack with code samples.
@ -111,9 +113,9 @@ Please checkout our [Documentation](https://llama-stack.readthedocs.io/en/latest
| :----: | :----: | :----: |
| Python | [llama-stack-client-python](https://github.com/meta-llama/llama-stack-client-python) | [![PyPI version](https://img.shields.io/pypi/v/llama_stack_client.svg)](https://pypi.org/project/llama_stack_client/)
| Swift | [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift) | [![Swift Package Index](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2Fmeta-llama%2Fllama-stack-client-swift%2Fbadge%3Ftype%3Dswift-versions)](https://swiftpackageindex.com/meta-llama/llama-stack-client-swift)
| Node | [llama-stack-client-node](https://github.com/meta-llama/llama-stack-client-node) | [![NPM version](https://img.shields.io/npm/v/llama-stack-client.svg)](https://npmjs.org/package/llama-stack-client)
| Typescript | [llama-stack-client-typescript](https://github.com/meta-llama/llama-stack-client-typescript) | [![NPM version](https://img.shields.io/npm/v/llama-stack-client.svg)](https://npmjs.org/package/llama-stack-client)
| Kotlin | [llama-stack-client-kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) | [![Maven version](https://img.shields.io/maven-central/v/com.llama.llamastack/llama-stack-client-kotlin)](https://central.sonatype.com/artifact/com.llama.llamastack/llama-stack-client-kotlin)
Check out our client SDKs for connecting to Llama Stack server in your preferred language, you can choose from [python](https://github.com/meta-llama/llama-stack-client-python), [node](https://github.com/meta-llama/llama-stack-client-node), [swift](https://github.com/meta-llama/llama-stack-client-swift), and [kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) programming languages to quickly build your applications.
Check out our client SDKs for connecting to a Llama Stack server in your preferred language, you can choose from [python](https://github.com/meta-llama/llama-stack-client-python), [typescript](https://github.com/meta-llama/llama-stack-client-typescript), [swift](https://github.com/meta-llama/llama-stack-client-swift), and [kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) programming languages to quickly build your applications.
You can find more example scripts with client SDKs to talk with the Llama Stack server in our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repo.
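As a quick sanity check once a distribution is up, the same endpoint exercised elsewhere in this change can be queried from the shell; a sketch, assuming a local server on port 8321 and a client configured to point at it:

```bash
# List the models served by a locally running Llama Stack distribution.
curl http://localhost:8321/v1/models

# Equivalent query through the llama-stack-client CLI that ships with the Python SDK.
llama-stack-client models list
```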


@ -1,9 +1,46 @@
{
"sambanova": [
"bedrock": [
"aiosqlite",
"autoevals",
"blobfile",
"boto3",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"cerebras": [
"aiosqlite",
"autoevals",
"blobfile",
"cerebras_cloud_sdk",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
@ -27,7 +64,110 @@
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"dell": [
"aiohttp",
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"matplotlib",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"fireworks": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"fireworks-ai",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"hf-endpoint": [
"aiohttp",
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"hf-serverless": [
"aiohttp",
@ -62,211 +202,7 @@
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"together": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"together",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"vllm-gpu": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"vllm",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"remote-vllm": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"fireworks": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"fireworks-ai",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"tgi": [
"aiohttp",
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"bedrock": [
"aiosqlite",
"autoevals",
"blobfile",
"boto3",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"meta-reference-gpu": [
"accelerate",
@ -306,39 +242,7 @@
"uvicorn",
"zmq",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"nvidia": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"meta-reference-quantized-gpu": [
"accelerate",
@ -380,21 +284,20 @@
"uvicorn",
"zmq",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"cerebras": [
"nvidia": [
"aiosqlite",
"autoevals",
"blobfile",
"cerebras_cloud_sdk",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
@ -413,7 +316,7 @@
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"ollama": [
"aiohttp",
@ -447,9 +350,72 @@
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"hf-endpoint": [
"remote-vllm": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"sambanova": [
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"tgi": [
"aiohttp",
"aiosqlite",
"autoevals",
@ -482,6 +448,74 @@
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"together": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"together",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"vllm-gpu": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"vllm",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
]
}


@ -1,65 +0,0 @@
# Together Distribution
### Connect to a Llama Stack Together Endpoint
- You may connect to a hosted endpoint `https://llama-stack.together.ai`, serving a Llama Stack distribution
The `llamastack/distribution-together` distribution consists of the following provider configurations.
| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
|----------------- |--------------- |---------------- |-------------------------------------------------- |---------------- |---------------- |
| **Provider(s)** | remote::together | meta-reference | meta-reference, remote::weaviate | meta-reference | meta-reference |
### Docker: Start the Distribution (Single Node CPU)
> [!NOTE]
> This assumes you have an hosted endpoint at Together with API Key.
```
$ cd distributions/together
$ ls
compose.yaml run.yaml
$ docker compose up
```
Make sure in you `run.yaml` file, you inference provider is pointing to the correct Together URL server endpoint. E.g.
```
inference:
- provider_id: together
provider_type: remote::together
config:
url: https://api.together.xyz/v1
api_key: <optional api key>
```
### Conda llama stack run (Single Node CPU)
```bash
llama stack build --template together --image-type conda
# -- modify run.yaml to a valid Together server endpoint
llama stack run ./run.yaml
```
### (Optional) Update Model Serving Configuration
Use `llama-stack-client models list` to check the available models served by together.
```
$ llama-stack-client models list
+------------------------------+------------------------------+---------------+------------+
| identifier | llama_model | provider_id | metadata |
+==============================+==============================+===============+============+
| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.1-70B-Instruct | Llama3.1-70B-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.1-405B-Instruct | Llama3.1-405B-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-3B-Instruct | Llama3.2-3B-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
```


@ -12,3 +12,7 @@
.wy-side-nav-search {
background-color: transparent !important;
}
.hide-title h1 {
display: none;
}

File diff suppressed because it is too large

File diff suppressed because it is too large

docs/conftest.py

@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
def pytest_collection_modifyitems(items):
for item in items:
item.name = item.name.replace(' ', '_')


@ -7,7 +7,7 @@
"id": "c1e7571c"
},
"source": [
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1F2ksmkoGQPa4pzRjMOE6BXWeOxWFIW6n?usp=sharing)\n",
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)\n",
"\n",
"# Llama Stack - Building AI Applications\n",
"\n",
@ -15,7 +15,7 @@
"\n",
"[Llama Stack](https://github.com/meta-llama/llama-stack) defines and standardizes the set of core building blocks needed to bring generative AI applications to market. These building blocks are presented in the form of interoperable APIs with a broad set of Service Providers providing their implementations.\n",
"\n",
"Read more about the project: https://llama-stack.readthedocs.io/en/latest/index.html\n",
"Read more about the project here: https://llama-stack.readthedocs.io/en/latest/index.html\n",
"\n",
"In this guide, we will showcase how you can build LLM-powered agentic applications using Llama Stack.\n"
]
@ -71,7 +71,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "J2kGed0R5PSf",
"metadata": {
"colab": {
@ -81,119 +81,15 @@
"id": "J2kGed0R5PSf",
"outputId": "2478ea60-8d35-48a1-b011-f233831740c5"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"The following NEW packages will be installed:\n",
" bubblewrap\n",
"0 upgraded, 1 newly installed, 0 to remove and 49 not upgraded.\n",
"Need to get 46.3 kB of archives.\n",
"After this operation, 132 kB of additional disk space will be used.\n",
"Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 bubblewrap amd64 0.6.1-1ubuntu0.1 [46.3 kB]\n",
"Fetched 46.3 kB in 0s (122 kB/s)\n",
"Selecting previously unselected package bubblewrap.\n",
"(Reading database ... 124561 files and directories currently installed.)\n",
"Preparing to unpack .../bubblewrap_0.6.1-1ubuntu0.1_amd64.deb ...\n",
"Unpacking bubblewrap (0.6.1-1ubuntu0.1) ...\n",
"Setting up bubblewrap (0.6.1-1ubuntu0.1) ...\n",
"Processing triggers for man-db (2.10.2-1) ...\n",
"Looking in indexes: https://test.pypi.org/simple/, https://pypi.python.org/simple\n",
"Collecting llama-stack==0.1.0rc10\n",
" Downloading https://test-files.pythonhosted.org/packages/68/22/4a170fbe01095df81e76c7bf8f35c716c1a0a5ec4503da6e78695fab351c/llama_stack-0.1.0rc10-py3-none-any.whl.metadata (15 kB)\n",
"Collecting blobfile (from llama-stack==0.1.0rc10)\n",
" Downloading blobfile-3.0.0-py3-none-any.whl.metadata (15 kB)\n",
"Collecting fire (from llama-stack==0.1.0rc10)\n",
" Downloading fire-0.7.0.tar.gz (87 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m87.2/87.2 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"Requirement already satisfied: httpx in /usr/local/lib/python3.11/dist-packages (from llama-stack==0.1.0rc10) (0.28.1)\n",
"Requirement already satisfied: huggingface-hub in /usr/local/lib/python3.11/dist-packages (from llama-stack==0.1.0rc10) (0.27.1)\n",
"Collecting llama-models==0.1.0rc10 (from llama-stack==0.1.0rc10)\n",
" Downloading https://test-files.pythonhosted.org/packages/45/2b/6a6947d5915054b9980f82606942f1b79960a27168299254ca12e5b5795b/llama_models-0.1.0rc10-py3-none-any.whl.metadata (8.5 kB)\n",
"Collecting llama-stack-client==0.1.0rc10 (from llama-stack==0.1.0rc10)\n",
" Downloading https://test-files.pythonhosted.org/packages/d6/85/a4fd621c4ae4db7339ab098b37bf4b4ad3cc12440e75ef10ec524e28ef7d/llama_stack_client-0.1.0rc10-py3-none-any.whl.metadata (15 kB)\n",
"Requirement already satisfied: prompt-toolkit in /usr/local/lib/python3.11/dist-packages (from llama-stack==0.1.0rc10) (3.0.48)\n",
"Collecting python-dotenv (from llama-stack==0.1.0rc10)\n",
" Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)\n",
"Requirement already satisfied: pydantic>=2 in /usr/local/lib/python3.11/dist-packages (from llama-stack==0.1.0rc10) (2.10.5)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (from llama-stack==0.1.0rc10) (2.32.3)\n",
"Requirement already satisfied: rich in /usr/local/lib/python3.11/dist-packages (from llama-stack==0.1.0rc10) (13.9.4)\n",
"Requirement already satisfied: setuptools in /usr/local/lib/python3.11/dist-packages (from llama-stack==0.1.0rc10) (75.1.0)\n",
"Requirement already satisfied: termcolor in /usr/local/lib/python3.11/dist-packages (from llama-stack==0.1.0rc10) (2.5.0)\n",
"Requirement already satisfied: PyYAML in /usr/local/lib/python3.11/dist-packages (from llama-models==0.1.0rc10->llama-stack==0.1.0rc10) (6.0.2)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from llama-models==0.1.0rc10->llama-stack==0.1.0rc10) (3.1.5)\n",
"Collecting tiktoken (from llama-models==0.1.0rc10->llama-stack==0.1.0rc10)\n",
" Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n",
"Requirement already satisfied: Pillow in /usr/local/lib/python3.11/dist-packages (from llama-models==0.1.0rc10->llama-stack==0.1.0rc10) (11.1.0)\n",
"Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (3.7.1)\n",
"Requirement already satisfied: click in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (8.1.8)\n",
"Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (1.9.0)\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (2.2.2)\n",
"Collecting pyaml (from llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10)\n",
" Downloading pyaml-25.1.0-py3-none-any.whl.metadata (12 kB)\n",
"Requirement already satisfied: sniffio in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (1.3.1)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (4.67.1)\n",
"Requirement already satisfied: typing-extensions<5,>=4.7 in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (4.12.2)\n",
"Requirement already satisfied: certifi in /usr/local/lib/python3.11/dist-packages (from httpx->llama-stack==0.1.0rc10) (2024.12.14)\n",
"Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.11/dist-packages (from httpx->llama-stack==0.1.0rc10) (1.0.7)\n",
"Requirement already satisfied: idna in /usr/local/lib/python3.11/dist-packages (from httpx->llama-stack==0.1.0rc10) (3.10)\n",
"Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.11/dist-packages (from httpcore==1.*->httpx->llama-stack==0.1.0rc10) (0.14.0)\n",
"Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.11/dist-packages (from pydantic>=2->llama-stack==0.1.0rc10) (0.7.0)\n",
"Requirement already satisfied: pydantic-core==2.27.2 in /usr/local/lib/python3.11/dist-packages (from pydantic>=2->llama-stack==0.1.0rc10) (2.27.2)\n",
"Collecting pycryptodomex>=3.8 (from blobfile->llama-stack==0.1.0rc10)\n",
" Downloading pycryptodomex-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)\n",
"Requirement already satisfied: urllib3<3,>=1.25.3 in /usr/local/lib/python3.11/dist-packages (from blobfile->llama-stack==0.1.0rc10) (2.3.0)\n",
"Requirement already satisfied: lxml>=4.9 in /usr/local/lib/python3.11/dist-packages (from blobfile->llama-stack==0.1.0rc10) (5.3.0)\n",
"Requirement already satisfied: filelock>=3.0 in /usr/local/lib/python3.11/dist-packages (from blobfile->llama-stack==0.1.0rc10) (3.16.1)\n",
"Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub->llama-stack==0.1.0rc10) (2024.10.0)\n",
"Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub->llama-stack==0.1.0rc10) (24.2)\n",
"Requirement already satisfied: wcwidth in /usr/local/lib/python3.11/dist-packages (from prompt-toolkit->llama-stack==0.1.0rc10) (0.2.13)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests->llama-stack==0.1.0rc10) (3.4.1)\n",
"Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.11/dist-packages (from rich->llama-stack==0.1.0rc10) (3.0.0)\n",
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.11/dist-packages (from rich->llama-stack==0.1.0rc10) (2.18.0)\n",
"Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.11/dist-packages (from markdown-it-py>=2.2.0->rich->llama-stack==0.1.0rc10) (0.1.2)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->llama-models==0.1.0rc10->llama-stack==0.1.0rc10) (3.0.2)\n",
"Requirement already satisfied: numpy>=1.23.2 in /usr/local/lib/python3.11/dist-packages (from pandas->llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (1.26.4)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas->llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas->llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (2024.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas->llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (2024.2)\n",
"Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.11/dist-packages (from tiktoken->llama-models==0.1.0rc10->llama-stack==0.1.0rc10) (2024.11.6)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.8.2->pandas->llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (1.17.0)\n",
"Downloading https://test-files.pythonhosted.org/packages/68/22/4a170fbe01095df81e76c7bf8f35c716c1a0a5ec4503da6e78695fab351c/llama_stack-0.1.0rc10-py3-none-any.whl (532 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m532.7/532.7 kB\u001b[0m \u001b[31m14.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading https://test-files.pythonhosted.org/packages/45/2b/6a6947d5915054b9980f82606942f1b79960a27168299254ca12e5b5795b/llama_models-0.1.0rc10-py3-none-any.whl (1.6 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m20.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading https://test-files.pythonhosted.org/packages/d6/85/a4fd621c4ae4db7339ab098b37bf4b4ad3cc12440e75ef10ec524e28ef7d/llama_stack_client-0.1.0rc10-py3-none-any.whl (328 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m328.5/328.5 kB\u001b[0m \u001b[31m29.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading blobfile-3.0.0-py3-none-any.whl (75 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.4/75.4 kB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)\n",
"Downloading pycryptodomex-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.3 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.3/2.3 MB\u001b[0m \u001b[31m57.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pyaml-25.1.0-py3-none-any.whl (26 kB)\n",
"Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m64.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hBuilding wheels for collected packages: fire\n",
" Building wheel for fire (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for fire: filename=fire-0.7.0-py3-none-any.whl size=114249 sha256=3a37285ecae37a5fb69bbad717aabdb8c13f0da7906668b7c123475eefa41c3b\n",
" Stored in directory: /root/.cache/pip/wheels/46/54/24/1624fd5b8674eb1188623f7e8e17cdf7c0f6c24b609dfb8a89\n",
"Successfully built fire\n",
"Installing collected packages: python-dotenv, pycryptodomex, pyaml, fire, tiktoken, blobfile, llama-stack-client, llama-models, llama-stack\n",
"Successfully installed blobfile-3.0.0 fire-0.7.0 llama-models-0.1.0rc10 llama-stack-0.1.0rc10 llama-stack-client-0.1.0rc10 pyaml-25.1.0 pycryptodomex-3.21.0 python-dotenv-1.0.1 tiktoken-0.8.0\n"
]
}
],
"outputs": [],
"source": [
"# NBVAL_SKIP\n",
"\n",
"!apt-get install -y bubblewrap\n",
"# install a branch of llama stack\n",
"!pip install llama-stack"
"import os\n",
"os.environ[\"UV_SYSTEM_PYTHON\"] = \"1\"\n",
"!pip install uv\n",
"!uv pip install llama-stack"
]
},
{
@ -218,7 +114,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"id": "HaepEZXCDgif",
"metadata": {
"colab": {
@ -228,331 +124,9 @@
"id": "HaepEZXCDgif",
"outputId": "9314f698-593d-4c1a-ea15-15c735dc1023"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: llama-stack in /usr/local/lib/python3.11/dist-packages (0.1.0rc10)\r\n",
"Requirement already satisfied: blobfile in /usr/local/lib/python3.11/dist-packages (from llama-stack) (3.0.0)\r\n",
"Requirement already satisfied: fire in /usr/local/lib/python3.11/dist-packages (from llama-stack) (0.7.0)\r\n",
"Requirement already satisfied: httpx in /usr/local/lib/python3.11/dist-packages (from llama-stack) (0.28.1)\r\n",
"Requirement already satisfied: huggingface-hub in /usr/local/lib/python3.11/dist-packages (from llama-stack) (0.27.1)\r\n",
"Requirement already satisfied: llama-models==0.1.0rc10 in /usr/local/lib/python3.11/dist-packages (from llama-stack) (0.1.0rc10)\r\n",
"Requirement already satisfied: llama-stack-client==0.1.0rc10 in /usr/local/lib/python3.11/dist-packages (from llama-stack) (0.1.0rc10)\r\n",
"Requirement already satisfied: prompt-toolkit in /usr/local/lib/python3.11/dist-packages (from llama-stack) (3.0.48)\r\n",
"Requirement already satisfied: python-dotenv in /usr/local/lib/python3.11/dist-packages (from llama-stack) (1.0.1)\r\n",
"Requirement already satisfied: pydantic>=2 in /usr/local/lib/python3.11/dist-packages (from llama-stack) (2.10.5)\r\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (from llama-stack) (2.32.3)\r\n",
"Requirement already satisfied: rich in /usr/local/lib/python3.11/dist-packages (from llama-stack) (13.9.4)\r\n",
"Requirement already satisfied: setuptools in /usr/local/lib/python3.11/dist-packages (from llama-stack) (75.1.0)\r\n",
"Requirement already satisfied: termcolor in /usr/local/lib/python3.11/dist-packages (from llama-stack) (2.5.0)\r\n",
"Requirement already satisfied: PyYAML in /usr/local/lib/python3.11/dist-packages (from llama-models==0.1.0rc10->llama-stack) (6.0.2)\r\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from llama-models==0.1.0rc10->llama-stack) (3.1.5)\r\n",
"Requirement already satisfied: tiktoken in /usr/local/lib/python3.11/dist-packages (from llama-models==0.1.0rc10->llama-stack) (0.8.0)\r\n",
"Requirement already satisfied: Pillow in /usr/local/lib/python3.11/dist-packages (from llama-models==0.1.0rc10->llama-stack) (11.1.0)\r\n",
"Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack) (3.7.1)\r\n",
"Requirement already satisfied: click in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack) (8.1.8)\r\n",
"Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack) (1.9.0)\r\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack) (2.2.2)\r\n",
"Requirement already satisfied: pyaml in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack) (25.1.0)\r\n",
"Requirement already satisfied: sniffio in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack) (1.3.1)\r\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack) (4.67.1)\r\n",
"Requirement already satisfied: typing-extensions<5,>=4.7 in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack) (4.12.2)\r\n",
"Requirement already satisfied: certifi in /usr/local/lib/python3.11/dist-packages (from httpx->llama-stack) (2024.12.14)\r\n",
"Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.11/dist-packages (from httpx->llama-stack) (1.0.7)\r\n",
"Requirement already satisfied: idna in /usr/local/lib/python3.11/dist-packages (from httpx->llama-stack) (3.10)\r\n",
"Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.11/dist-packages (from httpcore==1.*->httpx->llama-stack) (0.14.0)\r\n",
"Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.11/dist-packages (from pydantic>=2->llama-stack) (0.7.0)\r\n",
"Requirement already satisfied: pydantic-core==2.27.2 in /usr/local/lib/python3.11/dist-packages (from pydantic>=2->llama-stack) (2.27.2)\r\n",
"Requirement already satisfied: pycryptodomex>=3.8 in /usr/local/lib/python3.11/dist-packages (from blobfile->llama-stack) (3.21.0)\r\n",
"Requirement already satisfied: urllib3<3,>=1.25.3 in /usr/local/lib/python3.11/dist-packages (from blobfile->llama-stack) (2.3.0)\r\n",
"Requirement already satisfied: lxml>=4.9 in /usr/local/lib/python3.11/dist-packages (from blobfile->llama-stack) (5.3.0)\r\n",
"Requirement already satisfied: filelock>=3.0 in /usr/local/lib/python3.11/dist-packages (from blobfile->llama-stack) (3.16.1)\r\n",
"Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub->llama-stack) (2024.10.0)\r\n",
"Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub->llama-stack) (24.2)\r\n",
"Requirement already satisfied: wcwidth in /usr/local/lib/python3.11/dist-packages (from prompt-toolkit->llama-stack) (0.2.13)\r\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests->llama-stack) (3.4.1)\r\n",
"Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.11/dist-packages (from rich->llama-stack) (3.0.0)\r\n",
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.11/dist-packages (from rich->llama-stack) (2.18.0)\n",
"Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.11/dist-packages (from markdown-it-py>=2.2.0->rich->llama-stack) (0.1.2)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->llama-models==0.1.0rc10->llama-stack) (3.0.2)\n",
"Requirement already satisfied: numpy>=1.23.2 in /usr/local/lib/python3.11/dist-packages (from pandas->llama-stack-client==0.1.0rc10->llama-stack) (1.26.4)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas->llama-stack-client==0.1.0rc10->llama-stack) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas->llama-stack-client==0.1.0rc10->llama-stack) (2024.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas->llama-stack-client==0.1.0rc10->llama-stack) (2024.2)\n",
"Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.11/dist-packages (from tiktoken->llama-models==0.1.0rc10->llama-stack) (2024.11.6)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.8.2->pandas->llama-stack-client==0.1.0rc10->llama-stack) (1.17.0)\n",
"Installing pip dependencies\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.11/dist-packages (2.2.2)\n",
"Collecting together\n",
" Downloading together-1.3.11-py3-none-any.whl.metadata (11 kB)\n",
"Collecting datasets\n",
" Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)\n",
"Requirement already satisfied: transformers in /usr/local/lib/python3.11/dist-packages (4.47.1)\n",
"Requirement already satisfied: blobfile in /usr/local/lib/python3.11/dist-packages (3.0.0)\n",
"Requirement already satisfied: opentelemetry-sdk in /usr/local/lib/python3.11/dist-packages (1.29.0)\n",
"Collecting redis\n",
" Downloading redis-5.2.1-py3-none-any.whl.metadata (9.1 kB)\n",
"Requirement already satisfied: matplotlib in /usr/local/lib/python3.11/dist-packages (3.10.0)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (2.32.3)\n",
"Requirement already satisfied: chardet in /usr/local/lib/python3.11/dist-packages (5.2.0)\n",
"Collecting chromadb-client\n",
" Downloading chromadb_client-0.6.3-py3-none-any.whl.metadata (2.4 kB)\n",
"Collecting psycopg2-binary\n",
" Downloading psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)\n",
"Collecting mcp\n",
" Downloading mcp-1.2.0-py3-none-any.whl.metadata (15 kB)\n",
"Requirement already satisfied: pillow in /usr/local/lib/python3.11/dist-packages (11.1.0)\n",
"Requirement already satisfied: scipy in /usr/local/lib/python3.11/dist-packages (1.13.1)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (4.67.1)\n",
"Requirement already satisfied: nltk in /usr/local/lib/python3.11/dist-packages (3.9.1)\n",
"Requirement already satisfied: sentencepiece in /usr/local/lib/python3.11/dist-packages (0.2.0)\n",
"Collecting faiss-cpu\n",
" Downloading faiss_cpu-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)\n",
"Collecting opentelemetry-exporter-otlp-proto-http\n",
" Downloading opentelemetry_exporter_otlp_proto_http-1.29.0-py3-none-any.whl.metadata (2.2 kB)\n",
"Collecting autoevals\n",
" Downloading autoevals-0.0.117-py3-none-any.whl.metadata (12 kB)\n",
"Collecting pypdf\n",
" Downloading pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)\n",
"Collecting aiosqlite\n",
" Downloading aiosqlite-0.20.0-py3-none-any.whl.metadata (4.3 kB)\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.11/dist-packages (1.26.4)\n",
"Requirement already satisfied: scikit-learn in /usr/local/lib/python3.11/dist-packages (1.6.0)\n",
"Requirement already satisfied: openai in /usr/local/lib/python3.11/dist-packages (1.59.6)\n",
"Collecting fastapi\n",
" Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)\n",
"Requirement already satisfied: fire in /usr/local/lib/python3.11/dist-packages (0.7.0)\n",
"Requirement already satisfied: httpx in /usr/local/lib/python3.11/dist-packages (0.28.1)\n",
"Collecting uvicorn\n",
" Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas) (2024.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas) (2024.2)\n",
"Requirement already satisfied: aiohttp<4.0.0,>=3.9.3 in /usr/local/lib/python3.11/dist-packages (from together) (3.11.11)\n",
"Requirement already satisfied: click<9.0.0,>=8.1.7 in /usr/local/lib/python3.11/dist-packages (from together) (8.1.8)\n",
"Requirement already satisfied: eval-type-backport<0.3.0,>=0.1.3 in /usr/local/lib/python3.11/dist-packages (from together) (0.2.2)\n",
"Requirement already satisfied: filelock<4.0.0,>=3.13.1 in /usr/local/lib/python3.11/dist-packages (from together) (3.16.1)\n",
"Collecting pillow\n",
" Downloading pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (9.2 kB)\n",
"Requirement already satisfied: pyarrow>=10.0.1 in /usr/local/lib/python3.11/dist-packages (from together) (17.0.0)\n",
"Requirement already satisfied: pydantic<3.0.0,>=2.6.3 in /usr/local/lib/python3.11/dist-packages (from together) (2.10.5)\n",
"Requirement already satisfied: rich<14.0.0,>=13.8.1 in /usr/local/lib/python3.11/dist-packages (from together) (13.9.4)\n",
"Requirement already satisfied: tabulate<0.10.0,>=0.9.0 in /usr/local/lib/python3.11/dist-packages (from together) (0.9.0)\n",
"Requirement already satisfied: typer<0.16,>=0.9 in /usr/local/lib/python3.11/dist-packages (from together) (0.15.1)\n",
"Collecting dill<0.3.9,>=0.3.0 (from datasets)\n",
" Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)\n",
"Collecting xxhash (from datasets)\n",
" Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
"Collecting multiprocess<0.70.17 (from datasets)\n",
" Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)\n",
"Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)\n",
" Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)\n",
"Requirement already satisfied: huggingface-hub>=0.23.0 in /usr/local/lib/python3.11/dist-packages (from datasets) (0.27.1)\n",
"Requirement already satisfied: packaging in /usr/local/lib/python3.11/dist-packages (from datasets) (24.2)\n",
"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.11/dist-packages (from datasets) (6.0.2)\n",
"Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.11/dist-packages (from transformers) (2024.11.6)\n",
"Requirement already satisfied: tokenizers<0.22,>=0.21 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.21.0)\n",
"Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.5.2)\n",
"Requirement already satisfied: pycryptodomex>=3.8 in /usr/local/lib/python3.11/dist-packages (from blobfile) (3.21.0)\n",
"Requirement already satisfied: urllib3<3,>=1.25.3 in /usr/local/lib/python3.11/dist-packages (from blobfile) (2.3.0)\n",
"Requirement already satisfied: lxml>=4.9 in /usr/local/lib/python3.11/dist-packages (from blobfile) (5.3.0)\n",
"Requirement already satisfied: opentelemetry-api==1.29.0 in /usr/local/lib/python3.11/dist-packages (from opentelemetry-sdk) (1.29.0)\n",
"Requirement already satisfied: opentelemetry-semantic-conventions==0.50b0 in /usr/local/lib/python3.11/dist-packages (from opentelemetry-sdk) (0.50b0)\n",
"Requirement already satisfied: typing-extensions>=3.7.4 in /usr/local/lib/python3.11/dist-packages (from opentelemetry-sdk) (4.12.2)\n",
"Requirement already satisfied: deprecated>=1.2.6 in /usr/local/lib/python3.11/dist-packages (from opentelemetry-api==1.29.0->opentelemetry-sdk) (1.2.15)\n",
"Requirement already satisfied: importlib-metadata<=8.5.0,>=6.0 in /usr/local/lib/python3.11/dist-packages (from opentelemetry-api==1.29.0->opentelemetry-sdk) (8.5.0)\n",
"Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (1.3.1)\n",
"Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (0.12.1)\n",
"Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (4.55.3)\n",
"Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (1.4.8)\n",
"Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (3.2.1)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests) (3.4.1)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests) (3.10)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests) (2024.12.14)\n",
"Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb-client)\n",
" Downloading opentelemetry_exporter_otlp_proto_grpc-1.29.0-py3-none-any.whl.metadata (2.2 kB)\n",
"Collecting overrides>=7.3.1 (from chromadb-client)\n",
" Downloading overrides-7.7.0-py3-none-any.whl.metadata (5.8 kB)\n",
"Collecting posthog>=2.4.0 (from chromadb-client)\n",
" Downloading posthog-3.8.4-py2.py3-none-any.whl.metadata (2.8 kB)\n",
"Requirement already satisfied: tenacity>=8.2.3 in /usr/local/lib/python3.11/dist-packages (from chromadb-client) (9.0.0)\n",
"Requirement already satisfied: orjson>=3.9.12 in /usr/local/lib/python3.11/dist-packages (from chromadb-client) (3.10.14)\n",
"Collecting anyio>=4.5 (from mcp)\n",
" Downloading anyio-4.8.0-py3-none-any.whl.metadata (4.6 kB)\n",
"Collecting httpx-sse>=0.4 (from mcp)\n",
" Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)\n",
"Collecting pydantic-settings>=2.6.1 (from mcp)\n",
" Downloading pydantic_settings-2.7.1-py3-none-any.whl.metadata (3.5 kB)\n",
"Collecting sse-starlette>=1.6.1 (from mcp)\n",
" Downloading sse_starlette-2.2.1-py3-none-any.whl.metadata (7.8 kB)\n",
"Collecting starlette>=0.27 (from mcp)\n",
" Downloading starlette-0.45.2-py3-none-any.whl.metadata (6.3 kB)\n",
"Requirement already satisfied: joblib in /usr/local/lib/python3.11/dist-packages (from nltk) (1.4.2)\n",
"Requirement already satisfied: googleapis-common-protos~=1.52 in /usr/local/lib/python3.11/dist-packages (from opentelemetry-exporter-otlp-proto-http) (1.66.0)\n",
"Collecting opentelemetry-exporter-otlp-proto-common==1.29.0 (from opentelemetry-exporter-otlp-proto-http)\n",
" Downloading opentelemetry_exporter_otlp_proto_common-1.29.0-py3-none-any.whl.metadata (1.8 kB)\n",
"Collecting opentelemetry-proto==1.29.0 (from opentelemetry-exporter-otlp-proto-http)\n",
" Downloading opentelemetry_proto-1.29.0-py3-none-any.whl.metadata (2.3 kB)\n",
"Collecting protobuf<6.0,>=5.0 (from opentelemetry-proto==1.29.0->opentelemetry-exporter-otlp-proto-http)\n",
" Downloading protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)\n",
"Collecting chevron (from autoevals)\n",
" Downloading chevron-0.14.0-py3-none-any.whl.metadata (4.9 kB)\n",
"Collecting levenshtein (from autoevals)\n",
" Downloading levenshtein-0.26.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)\n",
"Collecting braintrust_core==0.0.58 (from autoevals)\n",
" Downloading braintrust_core-0.0.58-py3-none-any.whl.metadata (669 bytes)\n",
"Requirement already satisfied: jsonschema in /usr/local/lib/python3.11/dist-packages (from autoevals) (4.23.0)\n",
"Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn) (3.5.0)\n",
"Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.11/dist-packages (from openai) (1.9.0)\n",
"Requirement already satisfied: jiter<1,>=0.4.0 in /usr/local/lib/python3.11/dist-packages (from openai) (0.8.2)\n",
"Requirement already satisfied: sniffio in /usr/local/lib/python3.11/dist-packages (from openai) (1.3.1)\n",
"Collecting starlette>=0.27 (from mcp)\n",
" Downloading starlette-0.41.3-py3-none-any.whl.metadata (6.0 kB)\n",
"Requirement already satisfied: termcolor in /usr/local/lib/python3.11/dist-packages (from fire) (2.5.0)\n",
"Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.11/dist-packages (from httpx) (1.0.7)\n",
"Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.11/dist-packages (from httpcore==1.*->httpx) (0.14.0)\n",
"Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.9.3->together) (2.4.4)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.9.3->together) (1.3.2)\n",
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.9.3->together) (24.3.0)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.9.3->together) (1.5.0)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.9.3->together) (6.1.0)\n",
"Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.9.3->together) (0.2.1)\n",
"Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.9.3->together) (1.18.3)\n",
"Requirement already satisfied: wrapt<2,>=1.10 in /usr/local/lib/python3.11/dist-packages (from deprecated>=1.2.6->opentelemetry-api==1.29.0->opentelemetry-sdk) (1.17.0)\n",
"Requirement already satisfied: grpcio<2.0.0,>=1.63.2 in /usr/local/lib/python3.11/dist-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb-client) (1.69.0)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from posthog>=2.4.0->chromadb-client) (1.17.0)\n",
"Collecting monotonic>=1.5 (from posthog>=2.4.0->chromadb-client)\n",
" Downloading monotonic-1.6-py2.py3-none-any.whl.metadata (1.5 kB)\n",
"Collecting backoff>=1.10.0 (from posthog>=2.4.0->chromadb-client)\n",
" Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)\n",
"Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.11/dist-packages (from pydantic<3.0.0,>=2.6.3->together) (0.7.0)\n",
"Requirement already satisfied: pydantic-core==2.27.2 in /usr/local/lib/python3.11/dist-packages (from pydantic<3.0.0,>=2.6.3->together) (2.27.2)\n",
"Requirement already satisfied: python-dotenv>=0.21.0 in /usr/local/lib/python3.11/dist-packages (from pydantic-settings>=2.6.1->mcp) (1.0.1)\n",
"Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.11/dist-packages (from rich<14.0.0,>=13.8.1->together) (3.0.0)\n",
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.11/dist-packages (from rich<14.0.0,>=13.8.1->together) (2.18.0)\n",
"Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.11/dist-packages (from typer<0.16,>=0.9->together) (1.5.4)\n",
"Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.11/dist-packages (from jsonschema->autoevals) (2024.10.1)\n",
"Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.11/dist-packages (from jsonschema->autoevals) (0.35.1)\n",
"Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.11/dist-packages (from jsonschema->autoevals) (0.22.3)\n",
"Collecting rapidfuzz<4.0.0,>=3.9.0 (from levenshtein->autoevals)\n",
" Downloading rapidfuzz-3.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)\n",
"Requirement already satisfied: zipp>=3.20 in /usr/local/lib/python3.11/dist-packages (from importlib-metadata<=8.5.0,>=6.0->opentelemetry-api==1.29.0->opentelemetry-sdk) (3.21.0)\n",
"Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.11/dist-packages (from markdown-it-py>=2.2.0->rich<14.0.0,>=13.8.1->together) (0.1.2)\n",
"Downloading together-1.3.11-py3-none-any.whl (70 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m70.6/70.6 kB\u001b[0m \u001b[31m7.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading datasets-3.2.0-py3-none-any.whl (480 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m480.6/480.6 kB\u001b[0m \u001b[31m20.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading redis-5.2.1-py3-none-any.whl (261 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m261.5/261.5 kB\u001b[0m \u001b[31m25.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading chromadb_client-0.6.3-py3-none-any.whl (609 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m609.2/609.2 kB\u001b[0m \u001b[31m38.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.0/3.0 MB\u001b[0m \u001b[31m100.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading mcp-1.2.0-py3-none-any.whl (66 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m66.5/66.5 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl (4.5 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/4.5 MB\u001b[0m \u001b[31m106.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading faiss_cpu-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m27.5/27.5 MB\u001b[0m \u001b[31m78.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading opentelemetry_exporter_otlp_proto_http-1.29.0-py3-none-any.whl (17 kB)\n",
"Downloading opentelemetry_exporter_otlp_proto_common-1.29.0-py3-none-any.whl (18 kB)\n",
"Downloading opentelemetry_proto-1.29.0-py3-none-any.whl (55 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.8/55.8 kB\u001b[0m \u001b[31m4.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading autoevals-0.0.117-py3-none-any.whl (41 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.4/41.4 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading braintrust_core-0.0.58-py3-none-any.whl (4.4 kB)\n",
"Downloading pypdf-5.1.0-py3-none-any.whl (297 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m298.0/298.0 kB\u001b[0m \u001b[31m24.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading aiosqlite-0.20.0-py3-none-any.whl (15 kB)\n",
"Downloading fastapi-0.115.6-py3-none-any.whl (94 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m94.8/94.8 kB\u001b[0m \u001b[31m9.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading uvicorn-0.34.0-py3-none-any.whl (62 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.3/62.3 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading anyio-4.8.0-py3-none-any.whl (96 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m96.0/96.0 kB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m12.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (179 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.3/179.3 kB\u001b[0m \u001b[31m17.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)\n",
"Downloading multiprocess-0.70.16-py311-none-any.whl (143 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m143.5/143.5 kB\u001b[0m \u001b[31m14.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading opentelemetry_exporter_otlp_proto_grpc-1.29.0-py3-none-any.whl (18 kB)\n",
"Downloading overrides-7.7.0-py3-none-any.whl (17 kB)\n",
"Downloading posthog-3.8.4-py2.py3-none-any.whl (69 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m69.8/69.8 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pydantic_settings-2.7.1-py3-none-any.whl (29 kB)\n",
"Downloading sse_starlette-2.2.1-py3-none-any.whl (10 kB)\n",
"Downloading starlette-0.41.3-py3-none-any.whl (73 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m73.2/73.2 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading chevron-0.14.0-py3-none-any.whl (11 kB)\n",
"Downloading levenshtein-0.26.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (162 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m162.7/162.7 kB\u001b[0m \u001b[31m17.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.8/194.8 kB\u001b[0m \u001b[31m21.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading backoff-2.2.1-py3-none-any.whl (15 kB)\n",
"Downloading monotonic-1.6-py2.py3-none-any.whl (8.2 kB)\n",
"Downloading protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl (319 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m319.7/319.7 kB\u001b[0m \u001b[31m28.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading rapidfuzz-3.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m84.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hInstalling collected packages: monotonic, chevron, xxhash, uvicorn, redis, rapidfuzz, pypdf, psycopg2-binary, protobuf, pillow, overrides, httpx-sse, fsspec, faiss-cpu, dill, braintrust_core, backoff, anyio, aiosqlite, starlette, posthog, opentelemetry-proto, multiprocess, levenshtein, sse-starlette, pydantic-settings, opentelemetry-exporter-otlp-proto-common, fastapi, together, mcp, datasets, autoevals, opentelemetry-exporter-otlp-proto-http, opentelemetry-exporter-otlp-proto-grpc, chromadb-client\n",
" Attempting uninstall: protobuf\n",
" Found existing installation: protobuf 4.25.5\n",
" Uninstalling protobuf-4.25.5:\n",
" Successfully uninstalled protobuf-4.25.5\n",
" Attempting uninstall: pillow\n",
" Found existing installation: pillow 11.1.0\n",
" Uninstalling pillow-11.1.0:\n",
" Successfully uninstalled pillow-11.1.0\n",
" Attempting uninstall: fsspec\n",
" Found existing installation: fsspec 2024.10.0\n",
" Uninstalling fsspec-2024.10.0:\n",
" Successfully uninstalled fsspec-2024.10.0\n",
" Attempting uninstall: anyio\n",
" Found existing installation: anyio 3.7.1\n",
" Uninstalling anyio-3.7.1:\n",
" Successfully uninstalled anyio-3.7.1\n",
"\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
"jupyter-server 1.24.0 requires anyio<4,>=3.1.0, but you have anyio 4.8.0 which is incompatible.\n",
"gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.\n",
"tensorflow 2.17.1 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 5.29.3 which is incompatible.\u001b[0m\u001b[31m\n",
"\u001b[0mSuccessfully installed aiosqlite-0.20.0 anyio-4.8.0 autoevals-0.0.117 backoff-2.2.1 braintrust_core-0.0.58 chevron-0.14.0 chromadb-client-0.6.3 datasets-3.2.0 dill-0.3.8 faiss-cpu-1.9.0.post1 fastapi-0.115.6 fsspec-2024.9.0 httpx-sse-0.4.0 levenshtein-0.26.1 mcp-1.2.0 monotonic-1.6 multiprocess-0.70.16 opentelemetry-exporter-otlp-proto-common-1.29.0 opentelemetry-exporter-otlp-proto-grpc-1.29.0 opentelemetry-exporter-otlp-proto-http-1.29.0 opentelemetry-proto-1.29.0 overrides-7.7.0 pillow-10.4.0 posthog-3.8.4 protobuf-5.29.3 psycopg2-binary-2.9.10 pydantic-settings-2.7.1 pypdf-5.1.0 rapidfuzz-3.11.0 redis-5.2.1 sse-starlette-2.2.1 starlette-0.41.3 together-1.3.11 uvicorn-0.34.0 xxhash-3.5.0\n",
"torch --index-url https://download.pytorch.org/whl/cpu\n",
"Looking in indexes: https://download.pytorch.org/whl/cpu\n",
"Requirement already satisfied: torch in /usr/local/lib/python3.11/dist-packages (2.5.1+cu121)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from torch) (3.16.1)\n",
"Requirement already satisfied: typing-extensions>=4.8.0 in /usr/local/lib/python3.11/dist-packages (from torch) (4.12.2)\n",
"Requirement already satisfied: networkx in /usr/local/lib/python3.11/dist-packages (from torch) (3.4.2)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from torch) (3.1.5)\n",
"Requirement already satisfied: fsspec in /usr/local/lib/python3.11/dist-packages (from torch) (2024.9.0)\n",
"Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /usr/local/lib/python3.11/dist-packages (from torch) (12.1.105)\n",
"Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /usr/local/lib/python3.11/dist-packages (from torch) (12.1.105)\n",
"Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /usr/local/lib/python3.11/dist-packages (from torch) (12.1.105)\n",
"Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /usr/local/lib/python3.11/dist-packages (from torch) (9.1.0.70)\n",
"Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /usr/local/lib/python3.11/dist-packages (from torch) (12.1.3.1)\n",
"Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /usr/local/lib/python3.11/dist-packages (from torch) (11.0.2.54)\n",
"Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /usr/local/lib/python3.11/dist-packages (from torch) (10.3.2.106)\n",
"Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /usr/local/lib/python3.11/dist-packages (from torch) (11.4.5.107)\n",
"Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /usr/local/lib/python3.11/dist-packages (from torch) (12.1.0.106)\n",
"Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in /usr/local/lib/python3.11/dist-packages (from torch) (2.21.5)\n",
"Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /usr/local/lib/python3.11/dist-packages (from torch) (12.1.105)\n",
"Requirement already satisfied: triton==3.1.0 in /usr/local/lib/python3.11/dist-packages (from torch) (3.1.0)\n",
"Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.11/dist-packages (from torch) (1.13.1)\n",
"Requirement already satisfied: nvidia-nvjitlink-cu12 in /usr/local/lib/python3.11/dist-packages (from nvidia-cusolver-cu12==11.4.5.107->torch) (12.6.85)\n",
"Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from sympy==1.13.1->torch) (1.3.0)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->torch) (3.0.2)\n",
"sentence-transformers --no-deps\n",
"Requirement already satisfied: sentence-transformers in /usr/local/lib/python3.11/dist-packages (3.3.1)\n",
"\u001b[32mBuild Successful!\u001b[0m\n"
]
}
],
"outputs": [],
"source": [
"# NBVAL_SKIP\n",
"\n",
"# This will build all the dependencies you will need\n",
"!llama stack build --template together --image-type venv"
]
@ -571,7 +145,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 4,
"id": "E1UFuJC570Tk",
"metadata": {
"colab": {
@ -1125,11 +699,8 @@
" if not api_key:\n",
" raise ValueError(f\"{key} environment variable is empty\")\n",
" except KeyError:\n",
" raise KeyError(\n",
" f\"{key} environment variable is not set. \"\n",
" \"Please set your API key using in userdata (if using google colab notebook)\"\n",
" f\"or using `export {key}='your-api-key-here'`\"\n",
" ) from None\n",
" api_key = input(f\"{key} environment variable is not set. Please enter your API key: \")\n",
" os.environ[key] = api_key\n",
"\n",
"from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
"client = LlamaStackAsLibraryClient(\"together\", provider_data = {\"tavily_search_api_key\": os.environ['TAVILY_SEARCH_API_KEY']})\n",
@ -1150,7 +721,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 5,
"id": "ruO9jQna_t_S",
"metadata": {
"colab": {
@ -1211,7 +782,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 6,
"id": "LINBvv8lwTJh",
"metadata": {
"colab": {
@ -1228,7 +799,7 @@
"'meta-llama/Llama-3.1-70B-Instruct'"
]
},
"execution_count": 4,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@ -1253,7 +824,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 7,
"id": "77c29dba",
"metadata": {
"colab": {
@ -1267,7 +838,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Here's a two-sentence poem about a llama:\n",
"Here is a two-sentence poem about a llama:\n",
"\n",
"With gentle eyes and a soft, fuzzy face,\n",
"The llama roams, a peaceful, gentle pace.\n"
@ -2084,13 +1655,14 @@
}
],
"source": [
"import uuid\n",
"from llama_stack_client.lib.agents.agent import Agent\n",
"from llama_stack_client.lib.agents.event_logger import EventLogger\n",
"from llama_stack_client.types.agent_create_params import AgentConfig\n",
"from termcolor import cprint\n",
"from llama_stack_client.types import Document\n",
"\n",
"urls = [\"chat.rst\", \"llama3.rst\", \"datasets.rst\", \"lora_finetune.rst\"]\n",
"urls = [\"chat.rst\", \"llama3.rst\", \"memory_optimizations.rst\", \"lora_finetune.rst\"]\n",
"documents = [\n",
" Document(\n",
" document_id=f\"num-{i}\",\n",
@ -2101,7 +1673,7 @@
" for i, url in enumerate(urls)\n",
"]\n",
"\n",
"vector_db_id = \"test-vector-db\"\n",
"vector_db_id = f\"test-vector-db-{uuid.uuid4().hex}\"\n",
"client.vector_dbs.register(\n",
" vector_db_id=vector_db_id,\n",
" embedding_model=\"all-MiniLM-L6-v2\",\n",
@ -2398,6 +1970,7 @@
}
],
"source": [
"# NBVAL_SKIP\n",
"!pip install colab-xterm #https://pypi.org/project/colab-xterm/\n",
"%load_ext colabxterm"
]
@ -2774,7 +2347,7 @@
}
],
"source": [
"\n",
"# NBVAL_SKIP\n",
"%xterm\n",
"# touch /content/foo\n",
"# touch /content/bar\n",
@ -2800,6 +2373,7 @@
},
"outputs": [],
"source": [
"# NBVAL_SKIP\n",
"from llama_stack_client.types.shared_params.url import URL\n",
"client.toolgroups.register(\n",
" toolgroup_id=\"mcp::filesystem\",\n",
@ -3170,6 +2744,7 @@
}
],
"source": [
"# NBVAL_SKIP\n",
"from llama_stack_client.lib.agents.agent import Agent\n",
"from llama_stack_client.lib.agents.event_logger import EventLogger\n",
"from llama_stack_client.types.agent_create_params import AgentConfig\n",
@ -3523,7 +3098,7 @@
}
],
"source": [
"# NBVAL_SKIP \n",
"# NBVAL_SKIP\n",
"print(f\"Getting traces for session_id={session_id}\")\n",
"import json\n",
"\n",
@ -3821,6 +3396,231 @@
"response = client.scoring.score(input_rows=rows, scoring_functions=scoring_params)\n",
"pprint(response)\n"
]
},
{
"cell_type": "markdown",
"id": "ad077440",
"metadata": {},
"source": [
"## 4. Image Understanding with Llama 3.2\n",
"\n",
"Below is a complete example of using Together's Llama Stack 0.1 server at https://llama-stack.together.ai to ask Llama 3.2 questions about an image."
]
},
{
"cell_type": "markdown",
"id": "82e381ec",
"metadata": {},
"source": [
"### 4.1 Setup and helpers\n",
"\n",
"Below we install the Llama Stack client 0.1, download the example image, define two image helpers, and set Llama Stack Together server URL and Llama 3.2 model name.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "865fc5a8",
"metadata": {},
"outputs": [],
"source": [
"!pip install llama-stack-client==0.1.0"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "44e05e16",
"metadata": {},
"outputs": [],
"source": [
"!wget https://raw.githubusercontent.com/meta-llama/llama-models/refs/heads/main/Llama_Repo.jpeg"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "469750f7",
"metadata": {},
"outputs": [],
"source": [
"from PIL import Image\n",
"import matplotlib.pyplot as plt\n",
"\n",
"def display_image(path):\n",
" img = Image.open(path)\n",
" plt.imshow(img)\n",
" plt.axis('off')\n",
" plt.show()\n",
"\n",
"display_image(\"Llama_Repo.jpeg\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a2c1e1c2",
"metadata": {},
"outputs": [],
"source": [
"import base64\n",
"\n",
"def encode_image(image_path):\n",
" with open(image_path, \"rb\") as image_file:\n",
" base64_string = base64.b64encode(image_file.read()).decode(\"utf-8\")\n",
" base64_url = f\"data:image/png;base64,{base64_string}\"\n",
" return base64_url"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c565f99e",
"metadata": {},
"outputs": [],
"source": [
"from llama_stack_client import LlamaStackClient\n",
"\n",
"LLAMA_STACK_API_TOGETHER_URL=\"https://llama-stack.together.ai\"\n",
"LLAMA32_11B_INSTRUCT = \"meta-llama/Llama-3.2-11B-Vision-Instruct\""
]
},
{
"cell_type": "markdown",
"id": "7737cd41",
"metadata": {},
"source": [
"### 4.2 Using Llama Stack Chat API\n",
"\n",
"The code below uses the Llama Stack 0.1's chat API to interact with Llama 3.2:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d7914894",
"metadata": {},
"outputs": [],
"source": [
"from llama_stack_client.lib.inference.event_logger import EventLogger\n",
"\n",
"async def run_main(image_path: str, prompt):\n",
" client = LlamaStackClient(\n",
" base_url=LLAMA_STACK_API_TOGETHER_URL,\n",
" )\n",
"\n",
" message = {\n",
" \"role\": \"user\",\n",
" \"content\": [\n",
" {\n",
" \"type\": \"image\",\n",
" \"image\": {\n",
" \"url\": {\n",
" \"uri\": encode_image(image_path)\n",
" }\n",
" }\n",
" },\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": prompt,\n",
" }\n",
" ]\n",
" }\n",
"\n",
" response = client.inference.chat_completion(\n",
" messages=[message],\n",
" model_id=LLAMA32_11B_INSTRUCT,\n",
" stream=False,\n",
" )\n",
"\n",
" print(response.completion_message.content.lower().strip())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4ee09b97",
"metadata": {},
"outputs": [],
"source": [
"await run_main(\"Llama_Repo.jpeg\",\n",
" \"How many different colors are those llamas?\\\n",
" What are those colors?\")"
]
},
{
"cell_type": "markdown",
"id": "e741d7b9",
"metadata": {},
"source": [
"### 4.3 Using Llama Stack Agent API\n",
"\n",
"The code below uses the Llama Stack 0.1's Agent API to interact with Llama 3.2:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f9a83275",
"metadata": {},
"outputs": [],
"source": [
"from llama_stack_client.lib.agents.agent import Agent\n",
"from llama_stack_client.lib.agents.event_logger import EventLogger\n",
"from llama_stack_client.types.agent_create_params import AgentConfig\n",
"\n",
"async def run_main(image_path, prompt):\n",
" base64_image = encode_image(image_path)\n",
"\n",
" client = LlamaStackClient(\n",
" base_url=LLAMA_STACK_API_TOGETHER_URL,\n",
" )\n",
"\n",
" agent_config = AgentConfig(\n",
" model=LLAMA32_11B_INSTRUCT,\n",
" instructions=\"You are a helpful assistant\",\n",
" enable_session_persistence=False,\n",
" )\n",
"\n",
" agent = Agent(client, agent_config)\n",
" session_id = agent.create_session(\"test-session\")\n",
"\n",
" response = agent.create_turn(\n",
" messages=[{\n",
" \"role\": \"user\",\n",
" \"content\": [\n",
" {\n",
" \"type\": \"image\",\n",
" \"image\": {\n",
" \"url\": {\n",
" \"uri\": encode_image(image_path)\n",
" }\n",
" }\n",
" },\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": prompt,\n",
" }\n",
" ]\n",
" }],\n",
" session_id=session_id,\n",
" )\n",
"\n",
" for log in EventLogger().log(response):\n",
" log.print()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "15d0098b",
"metadata": {},
"outputs": [],
"source": [
"await run_main(\"Llama_Repo.jpeg\",\n",
" \"How many different colors are those llamas?\\\n",
" What are those colors?\")"
]
}
],
"metadata": {
@ -3830,7 +3630,8 @@
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"display_name": "toolchain",
"language": "python",
"name": "python3"
},
"language_info": {

View file

@ -6,7 +6,7 @@
"id": "hTIfyoGtjoWD"
},
"source": [
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1UvR9m2KTinvlDXeOWfS2HBU4X72LAjTz?usp=sharing)\n",
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb)\n",
"\n",
"# Llama Stack Benchmark Evals\n",
"\n",
@ -1383,7 +1383,8 @@
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"display_name": "master",
"language": "python",
"name": "python3"
},
"language_info": {

View file

@ -36,6 +36,16 @@ from .pyopenapi.specification import Info, Server # noqa: E402
from .pyopenapi.utility import Specification # noqa: E402
def str_presenter(dumper, data):
if data.startswith(f"/{LLAMA_STACK_API_VERSION}") or data.startswith(
"#/components/schemas/"
):
style = None
else:
style = ">" if "\n" in data or len(data) > 40 else None
return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=style)
def main(output_dir: str):
output_dir = Path(output_dir)
if not output_dir.exists():
@ -69,7 +79,8 @@ def main(output_dir: str):
y.sequence_dash_offset = 2
y.width = 80
y.allow_unicode = True
y.explicit_start = True
y.representer.add_representer(str, str_presenter)
y.dump(
spec.get_json(),
fp,
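A minimal standalone sketch of what the presenter above does to the dumped YAML: route strings and schema refs stay plain scalars, while other long or multi-line strings get the folded ">" style. It assumes ruamel.yaml and uses "v1" as a stand-in for LLAMA_STACK_API_VERSION.

import io

from ruamel.yaml import YAML

API_VERSION = "v1"  # stand-in for LLAMA_STACK_API_VERSION

def str_presenter(dumper, data):
    # keep routes and schema refs unfolded; fold other long or multi-line strings
    if data.startswith(f"/{API_VERSION}") or data.startswith("#/components/schemas/"):
        style = None
    else:
        style = ">" if "\n" in data or len(data) > 40 else None
    return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=style)

y = YAML()
y.width = 80
y.representer.add_representer(str, str_presenter)

buf = io.StringIO()
y.dump(
    {
        "$ref": "#/components/schemas/ChatCompletionRequest",
        "description": "A long description that exceeds forty characters and therefore folds.",
    },
    buf,
)
print(buf.getvalue())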

View file

@ -4,10 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import collections
import hashlib
import ipaddress
import typing
from dataclasses import make_dataclass
from typing import Any, Dict, Set, Union
from ..strong_typing.core import JsonType
@ -177,20 +177,37 @@ class ContentBuilder:
) -> Dict[str, MediaType]:
"Creates the content subtree for a request or response."
def has_iterator_type(t):
if typing.get_origin(t) is typing.Union:
return any(has_iterator_type(a) for a in typing.get_args(t))
def is_iterator_type(t):
return "StreamChunk" in str(t)
def get_media_type(t):
if is_generic_list(t):
return "application/jsonl"
elif is_iterator_type(t):
return "text/event-stream"
else:
# TODO: needs a proper fix where we let all types correctly flow upwards
# and then test against AsyncIterator
return "StreamChunk" in str(t)
return "application/json"
if typing.get_origin(payload_type) is typing.Union:
media_types = []
item_types = []
for x in typing.get_args(payload_type):
media_types.append(get_media_type(x))
item_types.append(x)
if len(set(media_types)) == 1:
# all types have the same media type
return {media_types[0]: self.build_media_type(payload_type, examples)}
else:
# different types have different media types
return {
media_type: self.build_media_type(item_type, examples)
for media_type, item_type in zip(media_types, item_types)
}
if is_generic_list(payload_type):
media_type = "application/jsonl"
item_type = unwrap_generic_list(payload_type)
elif has_iterator_type(payload_type):
item_type = payload_type
media_type = "text/event-stream"
else:
media_type = "application/json"
item_type = payload_type
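A rough, self-contained illustration of the media-type selection above; the string checks below are simplified stand-ins for is_generic_list / is_iterator_type, and the chunk class name is invented.

from typing import AsyncIterator, List

class ChatCompletionResponseStreamChunk:
    # invented stand-in for a streaming chunk type
    pass

def get_media_type(t) -> str:
    name = str(t)
    if name.startswith("typing.List["):
        return "application/jsonl"    # list payloads are emitted as JSON Lines
    if "StreamChunk" in name:
        return "text/event-stream"    # streaming chunk types are served over SSE
    return "application/json"         # everything else is a plain JSON body

print(get_media_type(List[int]))                                         # application/jsonl
print(get_media_type(AsyncIterator[ChatCompletionResponseStreamChunk]))  # text/event-stream
print(get_media_type(dict))                                              # application/json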
@ -233,7 +250,9 @@ class ContentBuilder:
value = sample_transformer(object_to_json(example))
hash_string = (
hashlib.md5(json_dump_string(value).encode("utf-8")).digest().hex()
hashlib.sha256(json_dump_string(value).encode("utf-8"))
.digest()
.hex()[:16]
)
name = f"ex-{hash_string}"
@ -276,6 +295,20 @@ class StatusResponse:
examples: List[Any] = dataclasses.field(default_factory=list)
def create_docstring_for_request(
request_name: str, fields: List[Tuple[str, type, Any]], doc_params: Dict[str, str]
) -> str:
"""Creates a ReST-style docstring for a dynamically generated request dataclass."""
lines = ["\n"] # Short description
# Add parameter documentation in ReST format
for name, type_ in fields:
desc = doc_params.get(name, "")
lines.append(f":param {name}: {desc}")
return "\n".join(lines)
class ResponseBuilder:
content_builder: ContentBuilder
@ -493,11 +526,24 @@ class Generator:
first = next(iter(op.request_params))
request_name, request_type = first
from dataclasses import make_dataclass
op_name = "".join(word.capitalize() for word in op.name.split("_"))
request_name = f"{op_name}Request"
request_type = make_dataclass(request_name, op.request_params)
fields = [
(
name,
type_,
)
for name, type_ in op.request_params
]
request_type = make_dataclass(
request_name,
fields,
namespace={
"__doc__": create_docstring_for_request(
request_name, fields, doc_params
)
},
)
requestBody = RequestBody(
content={
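The dynamic request-type construction above can be reduced to a small self-contained sketch; the field names and descriptions here are invented for illustration.

from dataclasses import make_dataclass

def create_docstring_for_request(request_name, fields, doc_params):
    # same idea as the helper above: one ":param" line per request field
    lines = ["\n"]
    for name, _type in fields:
        lines.append(f":param {name}: {doc_params.get(name, '')}")
    return "\n".join(lines)

fields = [("model_id", str), ("stream", bool)]
doc_params = {
    "model_id": "identifier of the model to query",
    "stream": "whether to stream partial results",
}

ChatCompletionRequest = make_dataclass(
    "ChatCompletionRequest",
    fields,
    namespace={
        "__doc__": create_docstring_for_request("ChatCompletionRequest", fields, doc_params)
    },
)

print(ChatCompletionRequest.__doc__)
# :param model_id: identifier of the model to query
# :param stream: whether to stream partial results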
@ -598,10 +644,14 @@ class Generator:
else:
callbacks = None
description = "\n".join(
filter(None, [doc_string.short_description, doc_string.long_description])
)
return Operation(
tags=[op.defining_class.__name__],
summary=doc_string.short_description,
description=doc_string.long_description,
summary=None,
# summary=doc_string.short_description,
description=description,
parameters=parameters,
requestBody=requestBody,
responses=responses,
@ -633,6 +683,7 @@ class Generator:
raise NotImplementedError(f"unknown HTTP method: {op.http_method}")
route = op.get_route()
route = route.replace(":path", "")
print(f"route: {route}")
if route in paths:
paths[route].update(pathItem)
@ -650,12 +701,6 @@ class Generator:
)
)
# types that are produced/consumed by operations
type_tags = [
self._build_type_tag(ref, schema)
for ref, schema in self.schema_builder.schemas.items()
]
# types that are emitted by events
event_tags: List[Tag] = []
events = get_endpoint_events(self.endpoint)
@ -682,7 +727,6 @@ class Generator:
# list all operations and types
tags: List[Tag] = []
tags.extend(operation_tags)
tags.extend(type_tags)
tags.extend(event_tags)
for extra_tag_group in extra_tag_groups.values():
tags.extend(extra_tag_group)
@ -697,13 +741,6 @@ class Generator:
tags=sorted(tag.name for tag in operation_tags),
)
)
if type_tags:
tag_groups.append(
TagGroup(
name=self.options.map("Types"),
tags=sorted(tag.name for tag in type_tags),
)
)
if event_tags:
tag_groups.append(
TagGroup(

View file

@ -130,6 +130,8 @@ class _FormatParameterExtractor:
def _get_route_parameters(route: str) -> List[str]:
extractor = _FormatParameterExtractor()
# Replace all occurrences of ":path" with empty string
route = route.replace(":path", "")
route.format_map(extractor)
return extractor.keys

View file

@ -6,36 +6,36 @@
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>OpenAPI specification</title>
<link href="https://fonts.googleapis.com/css?family=Montserrat:300,400,700|Roboto:300,400,700" rel="stylesheet">
<script type="module" src="https://unpkg.com/@stoplight/elements/web-components.min.js"></script>
<link rel="stylesheet" href="https://unpkg.com/@stoplight/elements/styles.min.css">
<style>
body {
margin: 0;
padding: 0;
height: 100vh;
}
elements-api {
height: 100%;
}
</style>
<script defer="defer" src="https://cdn.redoc.ly/redoc/latest/bundles/redoc.standalone.js"></script>
<script defer="defer">
</head>
<body>
<elements-api id="openapi-container" router="hash" layout="sidebar" hideExport="true"
hideInternal="true"></elements-api>
<script>
document.addEventListener("DOMContentLoaded", function () {
spec = { /* OPENAPI_SPECIFICATION */ };
options = {
downloadFileName: "openapi.json",
expandResponses: "200",
expandSingleSchemaField: true,
jsonSampleExpandLevel: "all",
schemaExpansionLevel: "all",
};
element = document.getElementById("openapi-container");
Redoc.init(spec, options, element);
const spec = { /* OPENAPI_SPECIFICATION */ };
const element = document.getElementById("openapi-container");
element.apiDescriptionDocument = spec;
if (spec.info && spec.info.title) {
document.title = spec.info.title;
}
});
</script>
</head>
<body>
<div id="openapi-container"></div>
</body>
</html>

View file

@ -29,4 +29,5 @@ fi
stack_dir=$(dirname $(dirname $THIS_DIR))
models_dir=$(dirname $stack_dir)/llama-models
PYTHONPATH=$PYTHONPATH:$stack_dir:$models_dir python -m docs.openapi_generator.generate $(dirname $THIS_DIR)/resources
PYTHONPATH=$PYTHONPATH:$stack_dir:$models_dir \
python -m docs.openapi_generator.generate $(dirname $THIS_DIR)/_static

View file

@ -109,10 +109,10 @@ def get_class_property_docstrings(
def docstring_to_schema(data_type: type) -> Schema:
short_description, long_description = get_class_docstrings(data_type)
schema: Schema = {}
if short_description:
schema["title"] = short_description
if long_description:
schema["description"] = long_description
description = "\n".join(filter(None, [short_description, long_description]))
if description:
schema["description"] = description
return schema
@ -248,7 +248,9 @@ class JsonSchemaGenerator:
type_schema.update(self._metadata_to_schema(m))
return type_schema
def _simple_type_to_schema(self, typ: TypeLike) -> Optional[Schema]:
def _simple_type_to_schema(
self, typ: TypeLike, json_schema_extra: Optional[dict] = None
) -> Optional[Schema]:
"""
Returns the JSON schema associated with a simple, unrestricted type.
@ -264,6 +266,11 @@ class JsonSchemaGenerator:
elif typ is float:
return {"type": "number"}
elif typ is str:
if json_schema_extra and "contentEncoding" in json_schema_extra:
return {
"type": "string",
"contentEncoding": json_schema_extra["contentEncoding"],
}
return {"type": "string"}
elif typ is bytes:
return {"type": "string", "contentEncoding": "base64"}
@ -303,7 +310,12 @@ class JsonSchemaGenerator:
# not a simple type
return None
def type_to_schema(self, data_type: TypeLike, force_expand: bool = False) -> Schema:
def type_to_schema(
self,
data_type: TypeLike,
force_expand: bool = False,
json_schema_extra: Optional[dict] = None,
) -> Schema:
"""
Returns the JSON schema associated with a type.
@ -313,7 +325,7 @@ class JsonSchemaGenerator:
"""
# short-circuit for common simple types
schema = self._simple_type_to_schema(data_type)
schema = self._simple_type_to_schema(data_type, json_schema_extra)
if schema is not None:
return schema
@ -486,15 +498,9 @@ class JsonSchemaGenerator:
property_docstrings = get_class_property_docstrings(
typ, self.options.property_description_fun
)
properties: Dict[str, Schema] = {}
required: List[str] = []
for property_name, property_type in get_class_properties(typ):
defaults = {}
if "model_fields" in members:
f = members["model_fields"]
defaults = {k: finfo.default for k, finfo in f.items()}
# rename property if an alias name is specified
alias = get_annotation(property_type, Alias)
if alias:
@ -502,11 +508,22 @@ class JsonSchemaGenerator:
else:
output_name = property_name
defaults = {}
json_schema_extra = None
if "model_fields" in members:
f = members["model_fields"]
defaults = {k: finfo.default for k, finfo in f.items()}
json_schema_extra = f.get(output_name, None).json_schema_extra
if is_type_optional(property_type):
optional_type: type = unwrap_optional_type(property_type)
property_def = self.type_to_schema(optional_type)
property_def = self.type_to_schema(
optional_type, json_schema_extra=json_schema_extra
)
else:
property_def = self.type_to_schema(property_type)
property_def = self.type_to_schema(
property_type, json_schema_extra=json_schema_extra
)
required.append(output_name)
# check if attribute has a default value initializer
@ -531,6 +548,7 @@ class JsonSchemaGenerator:
# add property docstring if available
property_doc = property_docstrings.get(property_name)
if property_doc:
# print(output_name, property_doc)
property_def.pop("title", None)
property_def["description"] = property_doc

View file

@ -6,6 +6,6 @@ Here's a collection of comprehensive guides, examples, and resources for buildin
Try out Llama Stack's capabilities through our detailed Jupyter notebooks:
* [Building AI Applications Notebook](./notebooks/Llama_Stack_Building_AI_Applications.ipynb) - A comprehensive guide to building production-ready AI applications using Llama Stack
* [Building AI Applications Notebook](./getting_started.ipynb) - A comprehensive guide to building production-ready AI applications using Llama Stack
* [Benchmark Evaluations Notebook](./notebooks/Llama_Stack_Benchmark_Evals.ipynb) - Detailed performance evaluations and benchmarking results
* [Zero-to-Hero Guide](./notebooks/Llama_Stack_Zero_to_Hero_Guide.ipynb) - Step-by-step guide for getting started with Llama Stack
* [Zero-to-Hero Guide](./zero_to_hero_guide) - Step-by-step guide for getting started with Llama Stack

View file

@ -77,7 +77,7 @@ agent_config = AgentConfig(
instructions="You are a helpful assistant",
# Enable both RAG and tool usage
toolgroups=[
{"name": "builtin::rag", "args": {"vector_db_ids": ["my_docs"]}}.
{"name": "builtin::rag", "args": {"vector_db_ids": ["my_docs"]}},
"builtin::code_interpreter",
],
# Configure safety
@ -86,13 +86,9 @@ agent_config = AgentConfig(
# Control the inference loop
max_infer_iters=5,
sampling_params={
"strategy": {
"type": "top_p",
"temperature": 0.7,
"top_p": 0.95
},
"max_tokens": 2048
}
"strategy": {"type": "top_p", "temperature": 0.7, "top_p": 0.95},
"max_tokens": 2048,
},
)
agent = Agent(client, agent_config)
@ -101,11 +97,13 @@ session_id = agent.create_session("monitored_session")
# Stream the agent's execution steps
response = agent.create_turn(
messages=[{"role": "user", "content": "Analyze this code and run it"}],
attachments=[{
"content": "https://raw.githubusercontent.com/example/code.py",
"mime_type": "text/plain"
}],
session_id=session_id
attachments=[
{
"content": "https://raw.githubusercontent.com/example/code.py",
"mime_type": "text/plain",
}
],
session_id=session_id,
)
# Monitor each step of execution
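# One way to consume the stream (a sketch): the EventLogger helper from the
# llama-stack client pretty-prints each step event as it arrives.
from llama_stack_client.lib.agents.event_logger import EventLogger

for log in EventLogger().log(response):
    log.print()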

View file

@ -15,6 +15,7 @@ This first example walks you through how to evaluate a model candidate served by
```python
import datasets
ds = datasets.load_dataset(path="llamastack/mmmu", name="Agriculture", split="dev")
ds = ds.select_columns(["chat_completion_input", "input_query", "expected_answer"])
eval_rows = ds.to_pandas().to_dict(orient="records")
@ -43,7 +44,7 @@ system_message = {
client.eval_tasks.register(
eval_task_id="meta-reference::mmmu",
dataset_id=f"mmmu-{subset}-{split}",
scoring_functions=["basic::regex_parser_multiple_choice_answer"]
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
)
response = client.eval.evaluate_rows(
@ -62,9 +63,9 @@ response = client.eval.evaluate_rows(
"max_tokens": 4096,
"repeat_penalty": 1.0,
},
"system_message": system_message
}
}
"system_message": system_message,
},
},
)
```
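Once `evaluate_rows` returns, you can inspect the generations and the per-scoring-function results. A minimal sketch, assuming the response exposes `generations` and a `scores` mapping (field names may differ slightly between client versions):

```python
# Sketch: inspect the evaluation output.
print(f"Generated {len(response.generations)} answers")
for scoring_fn, scoring_result in response.scores.items():
    # `aggregated_results` is assumed to hold the summary metrics.
    print(scoring_fn, scoring_result.aggregated_results)
```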
@ -88,7 +89,7 @@ _ = client.datasets.register(
"input_query": {"type": "string"},
"expected_answer": {"type": "string"},
"chat_completion_input": {"type": "chat_completion_input"},
}
},
)
eval_rows = client.datasetio.get_rows_paginated(
@ -101,7 +102,7 @@ eval_rows = client.datasetio.get_rows_paginated(
client.eval_tasks.register(
eval_task_id="meta-reference::simpleqa",
dataset_id=simpleqa_dataset_id,
scoring_functions=["llm-as-judge::405b-simpleqa"]
scoring_functions=["llm-as-judge::405b-simpleqa"],
)
response = client.eval.evaluate_rows(
@ -120,8 +121,8 @@ response = client.eval.evaluate_rows(
"max_tokens": 4096,
"repeat_penalty": 1.0,
},
}
}
},
},
)
```
@ -144,14 +145,14 @@ agent_config = {
{
"type": "brave_search",
"engine": "tavily",
"api_key": userdata.get("TAVILY_SEARCH_API_KEY")
"api_key": userdata.get("TAVILY_SEARCH_API_KEY"),
}
],
"tool_choice": "auto",
"tool_prompt_format": "json",
"input_shields": [],
"output_shields": [],
"enable_session_persistence": False
"enable_session_persistence": False,
}
response = client.eval.evaluate_rows(
@ -163,7 +164,7 @@ response = client.eval.evaluate_rows(
"eval_candidate": {
"type": "agent",
"config": agent_config,
}
}
},
},
)
```

View file

@ -13,7 +13,7 @@ Here's how to set up basic evaluation:
response = client.eval_tasks.register(
eval_task_id="my_eval",
dataset_id="my_dataset",
scoring_functions=["accuracy", "relevance"]
scoring_functions=["accuracy", "relevance"],
)
# Run evaluation
@ -21,16 +21,10 @@ job = client.eval.run_eval(
task_id="my_eval",
task_config={
"type": "app",
"eval_candidate": {
"type": "agent",
"config": agent_config
}
}
"eval_candidate": {"type": "agent", "config": agent_config},
},
)
# Get results
result = client.eval.job_result(
task_id="my_eval",
job_id=job.job_id
)
result = client.eval.job_result(task_id="my_eval", job_id=job.job_id)
```
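Evaluation jobs can take a while to complete. A minimal polling sketch, assuming the eval client also exposes a `job_status` method alongside `job_result` (check your installed client for the exact name and return values):

```python
import time

# Sketch: wait for the job to finish before fetching results as shown above.
while client.eval.job_status(task_id="my_eval", job_id=job.job_id) == "in_progress":
    time.sleep(5)
```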

View file

@ -4,7 +4,7 @@ Llama Stack provides all the building blocks needed to create sophisticated AI a
The best way to get started is to look at this notebook which walks through the various APIs (from basic inference, to RAG agents) and how to use them.
**Notebook**: [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb)
**Notebook**: [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)
Here are some key topics that will help you build effective agents:

View file

@ -34,15 +34,15 @@ chunks = [
{
"document_id": "doc1",
"content": "Your document text here",
"mime_type": "text/plain"
"mime_type": "text/plain",
},
...
]
client.vector_io.insert(vector_db_id, chunks)
client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks)
# You can then query for these chunks
chunks_response = client.vector_io.query(vector_db_id, query="What do you know about...")
chunks_response = client.vector_io.query(
vector_db_id=vector_db_id, query="What do you know about..."
)
```
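The query call returns the matching chunks together with similarity scores. A short sketch of inspecting them, assuming the response exposes `chunks` and `scores` fields:

```python
# Sketch: print each retrieved chunk with its similarity score.
for chunk, score in zip(chunks_response.chunks, chunks_response.scores):
    print(f"[score={score:.3f}] {chunk.content[:80]}")
```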
### Using the RAG Tool
@ -71,8 +71,8 @@ client.tool_runtime.rag_tool.insert(
# Query documents
results = client.tool_runtime.rag_tool.query(
vector_db_id=vector_db_id,
query="What do you know about...",
vector_db_ids=[vector_db_id],
content="What do you know about...",
)
```
@ -81,19 +81,22 @@ results = client.tool_runtime.rag_tool.query(
One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example:
```python
from llama_stack_client.types.agent_create_params import AgentConfig
from llama_stack_client.lib.agents.agent import Agent
# Configure agent with memory
agent_config = AgentConfig(
model="Llama3.2-3B-Instruct",
model="meta-llama/Llama-3.2-3B-Instruct",
instructions="You are a helpful assistant",
enable_session_persistence=False,
toolgroups=[
{
"name": "builtin::rag",
"args": {
"vector_db_ids": [vector_db_id],
}
},
}
]
],
)
agent = Agent(client, agent_config)
@ -101,25 +104,21 @@ session_id = agent.create_session("rag_session")
# Initial document ingestion
response = agent.create_turn(
messages=[{
"role": "user",
"content": "I am providing some documents for reference."
}],
documents=[
dict(
content="https://raw.githubusercontent.com/example/doc.rst",
mime_type="text/plain"
)
messages=[
{"role": "user", "content": "I am providing some documents for reference."}
],
session_id=session_id
documents=[
{
"content": "https://raw.githubusercontent.com/example/doc.rst",
"mime_type": "text/plain",
}
],
session_id=session_id,
)
# Query with RAG
response = agent.create_turn(
messages=[{
"role": "user",
"content": "What are the key topics in the documents?"
}],
session_id=session_id
messages=[{"role": "user", "content": "What are the key topics in the documents?"}],
session_id=session_id,
)
```

View file

@ -5,15 +5,11 @@ Safety is a critical component of any AI application. Llama Stack provides a Shi
```python
# Register a safety shield
shield_id = "content_safety"
client.shields.register(
shield_id=shield_id,
provider_shield_id="llama-guard-basic"
)
client.shields.register(shield_id=shield_id, provider_shield_id="llama-guard-basic")
# Run content through shield
response = client.safety.run_shield(
shield_id=shield_id,
messages=[{"role": "user", "content": "User message here"}]
shield_id=shield_id, messages=[{"role": "user", "content": "User message here"}]
)
if response.violation:
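    # Sketch of handling a violation; `violation_level` and `user_message` are
    # assumed field names on the violation object (check your client version).
    print(f"Blocked ({response.violation.violation_level}): {response.violation.user_message}")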

View file

@ -8,24 +8,16 @@ The telemetry system supports three main types of events:
- **Unstructured Log Events**: Free-form log messages with severity levels
```python
unstructured_log_event = UnstructuredLogEvent(
message="This is a log message",
severity=LogSeverity.INFO
message="This is a log message", severity=LogSeverity.INFO
)
```
- **Metric Events**: Numerical measurements with units
```python
metric_event = MetricEvent(
metric="my_metric",
value=10,
unit="count"
)
metric_event = MetricEvent(metric="my_metric", value=10, unit="count")
```
- **Structured Log Events**: System events like span start/end. Extensible to add more structured log types.
```python
structured_log_event = SpanStartPayload(
name="my_span",
parent_span_id="parent_span_id"
)
structured_log_event = SpanStartPayload(name="my_span", parent_span_id="parent_span_id")
```
### Spans and Traces

View file

@ -35,7 +35,7 @@ Example client SDK call to register a "websearch" toolgroup that is provided by
client.toolgroups.register(
toolgroup_id="builtin::websearch",
provider_id="brave-search",
args={"max_results": 5}
args={"max_results": 5},
)
```
@ -50,8 +50,7 @@ The Code Interpreter allows execution of Python code within a controlled environ
```python
# Register Code Interpreter tool group
client.toolgroups.register(
toolgroup_id="builtin::code_interpreter",
provider_id="code_interpreter"
toolgroup_id="builtin::code_interpreter", provider_id="code_interpreter"
)
```
@ -68,16 +67,14 @@ The WolframAlpha tool provides access to computational knowledge through the Wol
```python
# Register WolframAlpha tool group
client.toolgroups.register(
toolgroup_id="builtin::wolfram_alpha",
provider_id="wolfram-alpha"
toolgroup_id="builtin::wolfram_alpha", provider_id="wolfram-alpha"
)
```
Example usage:
```python
result = client.tool_runtime.invoke_tool(
tool_name="wolfram_alpha",
args={"query": "solve x^2 + 2x + 1 = 0"}
tool_name="wolfram_alpha", args={"query": "solve x^2 + 2x + 1 = 0"}
)
```
@ -90,10 +87,7 @@ The Memory tool enables retrieval of context from various types of memory banks
client.toolgroups.register(
toolgroup_id="builtin::memory",
provider_id="memory",
args={
"max_chunks": 5,
"max_tokens_in_context": 4096
}
args={"max_chunks": 5, "max_tokens_in_context": 4096},
)
```
@ -136,9 +130,7 @@ config = AgentConfig(
toolgroups=[
"builtin::websearch",
],
client_tools=[
ToolDef(name="client_tool", description="Client provided tool")
]
client_tools=[ToolDef(name="client_tool", description="Client provided tool")],
)
```
@ -167,9 +159,9 @@ Example tool definition:
"name": "query",
"parameter_type": "string",
"description": "The query to search for",
"required": True
"required": True,
}
]
],
}
```
@ -179,8 +171,7 @@ Tools can be invoked using the `invoke_tool` method:
```python
result = client.tool_runtime.invoke_tool(
tool_name="web_search",
kwargs={"query": "What is the capital of France?"}
tool_name="web_search", kwargs={"query": "What is the capital of France?"}
)
```
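The invocation returns a structured result. A short sketch of inspecting it, assuming the result exposes `content` and `error_message` fields (names may vary between client versions):

```python
# Sketch: check the tool invocation outcome.
if result.error_message:
    print(f"Tool call failed: {result.error_message}")
else:
    print(result.content)
```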

View file

@ -62,10 +62,3 @@ While there is a lot of flexibility to mix-and-match providers, often users will
**On-device Distro**: Finally, you may want to run Llama Stack directly on an edge device (mobile phone or a tablet.) We provide Distros for iOS and Android (coming soon.)
```{toctree}
:maxdepth: 1
:hidden:
distributions/index
```

View file

@ -68,6 +68,7 @@ myst_substitutions = {
"docker_hub": "https://hub.docker.com/repository/docker/llamastack",
}
suppress_warnings = ['myst.header']
# Copy button settings
copybutton_prompt_text = "$ " # for bash prompts
@ -94,22 +95,6 @@ html_static_path = ["../_static"]
# html_logo = "../_static/llama-stack-logo.png"
html_style = "../_static/css/my_theme.css"
redoc = [
{
"name": "Llama Stack API",
"page": "references/api_reference/index",
"spec": "../resources/llama-stack-spec.yaml",
"opts": {
"suppress-warnings": True,
# "expand-responses": ["200", "201"],
},
"embed": True,
},
]
redoc_uri = "https://cdn.redoc.ly/redoc/latest/bundles/redoc.standalone.js"
def setup(app):
def dockerhub_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
url = f"https://hub.docker.com/r/llamastack/{text}"

View file

@ -3,7 +3,7 @@
This guide will walk you through the process of adding a new API provider to Llama Stack.
- Begin by reviewing the [core concepts](../concepts/) of Llama Stack and choose the API your provider belongs to (Inference, Safety, VectorIO, etc.)
- Begin by reviewing the [core concepts](../concepts/index.md) of Llama Stack and choose the API your provider belongs to (Inference, Safety, VectorIO, etc.)
- Determine the provider type ({repopath}`Remote::llama_stack/providers/remote` or {repopath}`Inline::llama_stack/providers/inline`). Remote providers make requests to external services, while inline providers execute implementation locally.
- Add your provider to the appropriate {repopath}`Registry::llama_stack/providers/registry/`. Specify pip dependencies necessary.
- Update any distribution {repopath}`Templates::llama_stack/templates/` build.yaml and run.yaml files if they should include your provider by default. Run {repopath}`llama_stack/scripts/distro_codegen.py` if necessary.

View file

@ -180,12 +180,45 @@ After this step is successful, you should be able to find the built container im
### Running your Stack server
Now, let's start the Llama Stack Distribution Server. You will need the YAML configuration file that was written out at the end of the `llama stack build` step.
```
llama stack run -h
usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--disable-ipv6] [--env KEY=VALUE] [--tls-keyfile TLS_KEYFILE]
[--tls-certfile TLS_CERTFILE] [--image-type {conda,container,venv}]
config
start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution.
positional arguments:
config Path to config file to use for the run
options:
-h, --help show this help message and exit
--port PORT Port to run the server on. Defaults to 8321
--image-name IMAGE_NAME
Name of the image to run. Defaults to the current conda environment
--disable-ipv6 Disable IPv6 support
--env KEY=VALUE Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times.
--tls-keyfile TLS_KEYFILE
Path to TLS key file for HTTPS
--tls-certfile TLS_CERTFILE
Path to TLS certificate file for HTTPS
--image-type {conda,container,venv}
Image Type used during the build. This can be either conda or container or venv.
```
```
# Start using template name
llama stack run tgi
# Start using config file
llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
# Start using a venv
llama stack run --image-type venv ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
# Start using a conda environment
llama stack run --image-type conda ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
```
```

View file

@ -1,9 +1,9 @@
# Using Llama Stack as a Library
If you are planning to use an external service for Inference (even Ollama or TGI counts as external), it is often easier to use Llama Stack as a library. This avoids the overhead of setting up a server.
```python
```bash
# setup
pip install llama-stack
uv pip install llama-stack
llama stack build --template together --image-type venv
```
@ -13,7 +13,7 @@ from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
client = LlamaStackAsLibraryClient(
"ollama",
# provider_data is optional, but if you need to pass in any provider specific data, you can do so here.
provider_data = {"tavily_search_api_key": os.environ['TAVILY_SEARCH_API_KEY']}
provider_data={"tavily_search_api_key": os.environ["TAVILY_SEARCH_API_KEY"]},
)
await client.initialize()
```

View file

@ -7,14 +7,19 @@ You can run a Llama Stack server in one of the following ways:
This is the simplest way to get started. Using Llama Stack as a library means you do not need to start a server. This is especially useful when you are not running inference locally and relying on an external inference service (eg. fireworks, together, groq, etc.) See [Using Llama Stack as a Library](importing_as_library)
**Docker**:
**Container**:
Another simple way to start interacting with Llama Stack is to just spin up docker which is pre-built with all the providers you need. We provide a number of pre-built Docker containers so you can start a Llama Stack server instantly. You can also build your own custom Docker container. Which distribution to choose depends on the hardware you have. See [Selection of a Distribution](distributions/selection) for more details.
Another simple way to start interacting with Llama Stack is to just spin up a container (via Docker or Podman) which is pre-built with all the providers you need. We provide a number of pre-built images so you can start a Llama Stack server instantly. You can also build your own custom container. Which distribution to choose depends on the hardware you have. See [Selection of a Distribution](selection) for more details.
**Conda**:
Lastly, if you have a custom or an advanced setup or you are developing on Llama Stack you can also build a custom Llama Stack server. Using `llama stack build` and `llama stack run` you can build/run a custom Llama Stack server containing the exact combination of providers you wish. We have also provided various templates to make getting started easier. See [Building a Custom Distribution](building_distro) for more details.
If you have a custom or an advanced setup or you are developing on Llama Stack you can also build a custom Llama Stack server. Using `llama stack build` and `llama stack run` you can build/run a custom Llama Stack server containing the exact combination of providers you wish. We have also provided various templates to make getting started easier. See [Building a Custom Distribution](building_distro) for more details.
**Kubernetes**:
If you have built a container image and want to deploy it in a Kubernetes cluster instead of starting the Llama Stack server locally, see the [Kubernetes Deployment Guide](kubernetes_deployment) for more details.
```{toctree}
@ -24,4 +29,6 @@ Lastly, if you have a custom or an advanced setup or you are developing on Llama
importing_as_library
building_distro
configuration
selection
kubernetes_deployment
```

View file

@ -0,0 +1,207 @@
# Kubernetes Deployment Guide
Instead of starting the Llama Stack and vLLM servers locally, we can deploy them in a Kubernetes cluster. In this guide, we'll use a local [Kind](https://kind.sigs.k8s.io/) cluster and a vLLM inference service in the same cluster for demonstration purposes.
First, create a local Kubernetes cluster via Kind:
```bash
kind create cluster --image kindest/node:v1.32.0 --name llama-stack-test
```
Start vLLM server as a Kubernetes Pod and Service:
```bash
cat <<EOF |kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: vllm-models
spec:
accessModes:
- ReadWriteOnce
volumeMode: Filesystem
resources:
requests:
storage: 50Gi
---
apiVersion: v1
kind: Secret
metadata:
name: hf-token-secret
type: Opaque
data:
token: $(HF_TOKEN)
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: vllm-server
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: vllm
template:
metadata:
labels:
app.kubernetes.io/name: vllm
spec:
containers:
- name: llama-stack
image: $(VLLM_IMAGE)
command:
- bash
- -c
- |
MODEL="meta-llama/Llama-3.2-1B-Instruct"
MODEL_PATH=/app/model/$(basename $MODEL)
huggingface-cli login --token $HUGGING_FACE_HUB_TOKEN
huggingface-cli download $MODEL --local-dir $MODEL_PATH --cache-dir $MODEL_PATH
python3 -m vllm.entrypoints.openai.api_server --model $MODEL_PATH --served-model-name $MODEL --port 8000
ports:
- containerPort: 8000
volumeMounts:
- name: llama-storage
mountPath: /app/model
env:
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
volumes:
- name: llama-storage
persistentVolumeClaim:
claimName: vllm-models
---
apiVersion: v1
kind: Service
metadata:
name: vllm-server
spec:
selector:
app.kubernetes.io/name: vllm
ports:
- protocol: TCP
port: 8000
targetPort: 8000
type: ClusterIP
EOF
```
We can verify that the vLLM server has started successfully via the logs (this might take a couple of minutes to download the model):
```bash
$ kubectl logs -l app.kubernetes.io/name=vllm
...
INFO: Started server process [1]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
```
Then we can modify the Llama Stack run configuration YAML with the following inference provider:
```yaml
providers:
inference:
- provider_id: vllm
provider_type: remote::vllm
config:
url: http://vllm-server.default.svc.cluster.local:8000/v1
max_tokens: 4096
api_token: fake
```
Once we have defined the run configuration for Llama Stack, we can build an image with that configuration and the server source code:
```bash
cat >/tmp/test-vllm-llama-stack/Containerfile.llama-stack-run-k8s <<EOF
FROM distribution-myenv:dev
RUN apt-get update && apt-get install -y git
RUN git clone https://github.com/meta-llama/llama-stack.git /app/llama-stack-source
ADD ./vllm-llama-stack-run-k8s.yaml /app/config.yaml
EOF
podman build -f /tmp/test-vllm-llama-stack/Containerfile.llama-stack-run-k8s -t llama-stack-run-k8s /tmp/test-vllm-llama-stack
```
We can then start the Llama Stack server by deploying a Kubernetes Pod and Service:
```bash
cat <<EOF |kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: llama-pvc
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 1Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: llama-stack-server
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: llama-stack
template:
metadata:
labels:
app.kubernetes.io/name: llama-stack
spec:
containers:
- name: llama-stack
image: localhost/llama-stack-run-k8s:latest
imagePullPolicy: IfNotPresent
command: ["python", "-m", "llama_stack.distribution.server.server", "--yaml-config", "/app/config.yaml"]
ports:
- containerPort: 5000
volumeMounts:
- name: llama-storage
mountPath: /root/.llama
volumes:
- name: llama-storage
persistentVolumeClaim:
claimName: llama-pvc
---
apiVersion: v1
kind: Service
metadata:
name: llama-stack-service
spec:
selector:
app.kubernetes.io/name: llama-stack
ports:
- protocol: TCP
port: 5000
targetPort: 5000
type: ClusterIP
EOF
```
We can check that the LlamaStack server has started:
```bash
$ kubectl logs -l app.kubernetes.io/name=llama-stack
...
INFO: Started server process [1]
INFO: Waiting for application startup.
INFO: ASGI 'lifespan' protocol appears unsupported.
INFO: Application startup complete.
INFO: Uvicorn running on http://['::', '0.0.0.0']:5000 (Press CTRL+C to quit)
```
Finally, we forward the Kubernetes service to a local port and test some inference requests against it via the Llama Stack Client:
```bash
kubectl port-forward service/llama-stack-service 5000:5000
llama-stack-client --endpoint http://localhost:5000 inference chat-completion --message "hello, what model are you?"
```
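With the port-forward in place, you can also exercise the server from the Python SDK. A minimal sketch, assuming `llama-stack-client` is installed locally and the model deployed above is registered with the stack:

```python
from llama_stack_client import LlamaStackClient

# Sketch: talk to the forwarded Kubernetes service from the host machine.
client = LlamaStackClient(base_url="http://localhost:5000")

response = client.inference.chat_completion(
    model_id="meta-llama/Llama-3.2-1B-Instruct",  # the model served by vLLM above
    messages=[{"role": "user", "content": "hello, what model are you?"}],
)
print(response.completion_message.content)
```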

View file

@ -1,3 +1,4 @@
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# NVIDIA Distribution
The `llamastack/distribution-nvidia` distribution consists of the following provider configurations.

View file

@ -23,7 +23,7 @@ Which templates / distributions to choose depends on the hardware you have for r
- {dockerhub}`distribution-together` ([Guide](self_hosted_distro/together))
- {dockerhub}`distribution-fireworks` ([Guide](self_hosted_distro/fireworks))
- **Do you want to run Llama Stack inference on your iOS / Android device** Lastly, we also provide templates for running Llama Stack inference on your iOS / Android device:
- **Do you want to run Llama Stack inference on your iOS / Android device?** Lastly, we also provide templates for running Llama Stack inference on your iOS / Android device:
- [iOS SDK](ondevice_distro/ios_sdk)
- [Android](ondevice_distro/android_sdk)
@ -43,7 +43,6 @@ self_hosted_distro/nvidia
self_hosted_distro/ollama
self_hosted_distro/together
self_hosted_distro/fireworks
ondevice_distro/index
```
### On-Device Distributions

View file

@ -1,3 +1,4 @@
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Bedrock Distribution
```{toctree}

View file

@ -1,3 +1,4 @@
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Cerebras Distribution
The `llamastack/distribution-cerebras` distribution consists of the following provider configurations.

View file

@ -0,0 +1,186 @@
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Dell Distribution of Llama Stack
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-dell` distribution consists of the following provider configurations.
| API | Provider(s) |
|-----|-------------|
| agents | `inline::meta-reference` |
| datasetio | `remote::huggingface`, `inline::localfs` |
| eval | `inline::meta-reference` |
| inference | `remote::tgi` |
| safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime` |
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
You can use this distribution if you have GPUs and want to run an independent TGI or Dell Enterprise Hub container for running inference.
### Environment Variables
The following environment variables can be configured:
- `DEH_URL`: URL for the Dell inference server (default: `http://0.0.0.0:8181`)
- `DEH_SAFETY_URL`: URL for the Dell safety inference server (default: `http://0.0.0.0:8282`)
- `CHROMA_URL`: URL for the Chroma server (default: `http://localhost:6601`)
- `INFERENCE_MODEL`: Inference model loaded into the TGI server (default: `meta-llama/Llama-3.2-3B-Instruct`)
- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`)
## Setting up Inference server using Dell Enterprise Hub's custom TGI container.
NOTE: This is a placeholder to run inference with TGI. This will be updated to use [Dell Enterprise Hub's containers](https://dell.huggingface.co/authenticated/models) once verified.
```bash
export INFERENCE_PORT=8181
export DEH_URL=http://0.0.0.0:$INFERENCE_PORT
export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
export CHROMADB_HOST=localhost
export CHROMADB_PORT=6601
export CHROMA_URL=http://$CHROMADB_HOST:$CHROMADB_PORT
export CUDA_VISIBLE_DEVICES=0
export LLAMA_STACK_PORT=8321
docker run --rm -it \
--network host \
-v $HOME/.cache/huggingface:/data \
-e HF_TOKEN=$HF_TOKEN \
-p $INFERENCE_PORT:$INFERENCE_PORT \
--gpus $CUDA_VISIBLE_DEVICES \
ghcr.io/huggingface/text-generation-inference \
--dtype bfloat16 \
--usage-stats off \
--sharded false \
--cuda-memory-fraction 0.7 \
--model-id $INFERENCE_MODEL \
--port $INFERENCE_PORT --hostname 0.0.0.0
```
If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a TGI with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
```bash
export SAFETY_INFERENCE_PORT=8282
export DEH_SAFETY_URL=http://0.0.0.0:$SAFETY_INFERENCE_PORT
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
export CUDA_VISIBLE_DEVICES=1
docker run --rm -it \
--network host \
-v $HOME/.cache/huggingface:/data \
-e HF_TOKEN=$HF_TOKEN \
-p $SAFETY_INFERENCE_PORT:$SAFETY_INFERENCE_PORT \
--gpus $CUDA_VISIBLE_DEVICES \
ghcr.io/huggingface/text-generation-inference \
--dtype bfloat16 \
--usage-stats off \
--sharded false \
--cuda-memory-fraction 0.7 \
--model-id $SAFETY_MODEL \
--hostname 0.0.0.0 \
--port $SAFETY_INFERENCE_PORT
```
## Dell distribution relies on ChromaDB for vector database usage
You can start a ChromaDB container easily using Docker or Podman.
```bash
# This is where the indices are persisted
mkdir -p $HOME/chromadb
podman run --rm -it \
--network host \
--name chromadb \
-v $HOME/chromadb:/chroma/chroma \
-e IS_PERSISTENT=TRUE \
chromadb/chroma:latest \
--port $CHROMADB_PORT \
--host $CHROMADB_HOST
```
## Running Llama Stack
Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (building the code yourself) or Docker (which uses a pre-built image).
### Via Docker
This method allows you to get started quickly without having to build the distribution code.
```bash
docker run -it \
--network host \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v $HOME/.llama:/root/.llama \
# NOTE: mount the llama-stack / llama-model directories if testing local changes else not needed
-v /home/hjshah/git/llama-stack:/app/llama-stack-source -v /home/hjshah/git/llama-models:/app/llama-models-source \
# localhost/distribution-dell:dev if building / testing locally
llamastack/distribution-dell\
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env DEH_URL=$DEH_URL \
--env CHROMA_URL=$CHROMA_URL
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
# You need a local checkout of llama-stack to run this, get it using
# git clone https://github.com/meta-llama/llama-stack.git
cd /path/to/llama-stack
export SAFETY_INFERENCE_PORT=8282
export DEH_SAFETY_URL=http://0.0.0.0:$SAFETY_INFERENCE_PORT
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v $HOME/.llama:/root/.llama \
-v ./llama_stack/templates/tgi/run-with-safety.yaml:/root/my-run.yaml \
llamastack/distribution-dell \
--yaml-config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env DEH_URL=$DEH_URL \
--env SAFETY_MODEL=$SAFETY_MODEL \
--env DEH_SAFETY_URL=$DEH_SAFETY_URL \
--env CHROMA_URL=$CHROMA_URL
```
### Via Conda
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
```bash
llama stack build --template dell --image-type conda
llama stack run dell
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env DEH_URL=$DEH_URL \
--env CHROMA_URL=$CHROMA_URL
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
llama stack run ./run-with-safety.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env DEH_URL=$DEH_URL \
--env SAFETY_MODEL=$SAFETY_MODEL \
--env DEH_SAFETY_URL=$DEH_SAFETY_URL \
--env CHROMA_URL=$CHROMA_URL
```

View file

@ -1,6 +1,7 @@
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Fireworks Distribution
```{toctree}

View file

@ -1,6 +1,7 @@
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Meta Reference Distribution
```{toctree}
@ -82,7 +83,7 @@ docker run \
### Via Conda
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
```bash
llama stack build --template meta-reference-gpu --image-type conda

View file

@ -1,6 +1,7 @@
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Meta Reference Quantized Distribution
```{toctree}
@ -82,7 +83,7 @@ docker run \
### Via Conda
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
```bash
llama stack build --template meta-reference-quantized-gpu --image-type conda

View file

@ -1,6 +1,7 @@
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Ollama Distribution
```{toctree}
@ -25,7 +26,9 @@ The `llamastack/distribution-ollama` distribution consists of the following prov
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.### Environment Variables
You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.
### Environment Variables
The following environment variables can be configured:
@ -101,7 +104,7 @@ docker run \
### Via Conda
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
```bash
export LLAMA_STACK_PORT=5001

View file

@ -1,6 +1,7 @@
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Remote vLLM Distribution
```{toctree}
:maxdepth: 2
@ -131,7 +132,7 @@ docker run \
### Via Conda
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
```bash
export INFERENCE_PORT=8000

View file

@ -1,6 +1,7 @@
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# SambaNova Distribution
```{toctree}
@ -38,13 +39,15 @@ The following models are available by default:
- `meta-llama/Llama-3.1-405B-Instruct-FP8 (Meta-Llama-3.1-405B-Instruct)`
- `meta-llama/Llama-3.2-1B-Instruct (Meta-Llama-3.2-1B-Instruct)`
- `meta-llama/Llama-3.2-3B-Instruct (Meta-Llama-3.2-3B-Instruct)`
- `meta-llama/Llama-3.3-70B-Instruct (Meta-Llama-3.3-70B-Instruct)`
- `meta-llama/Llama-3.2-11B-Vision-Instruct (Llama-3.2-11B-Vision-Instruct)`
- `meta-llama/Llama-3.2-90B-Vision-Instruct (Llama-3.2-90B-Vision-Instruct)`
- `meta-llama/Llama-Guard-3-8B (Meta-Llama-Guard-3-8B)`
### Prerequisite: API Keys
Make sure you have access to a SambaNova API Key. You can get one by visiting [SambaBova.ai](https://sambanova.ai/).
Make sure you have access to a SambaNova API Key. You can get one by visiting [SambaNova.ai](https://sambanova.ai/).
## Running Llama Stack with SambaNova

View file

@ -1,6 +1,7 @@
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# TGI Distribution
@ -122,7 +123,7 @@ docker run \
### Via Conda
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
```bash
llama stack build --template tgi --image-type conda

View file

@ -1,6 +1,7 @@
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Together Distribution
```{toctree}

View file

@ -1,6 +1,6 @@
# Quick Start
In this guide, we'll walk through how you can use the Llama Stack (server and client SDK ) to test a simple RAG agent.
In this guide, we'll walk through how you can use the Llama Stack (server and client SDK) to test a simple RAG agent.
A Llama Stack agent is a simple integrated system that can perform tasks by combining a Llama model for reasoning with tools (e.g., RAG, web search, code execution, etc.) for taking actions.
@ -15,8 +15,11 @@ ollama run llama3.2:3b-instruct-fp16 --keepalive 60m
By default, Ollama keeps the model loaded in memory for 5 minutes, which can be too short. We set the `--keepalive` flag to 60 minutes to ensure the model remains loaded for some time.
NOTE: If you do not have ollama, you can install it from [here](https://ollama.ai/docs/installation).
```{admonition} Note
:class: tip
If you do not have ollama, you can install it from [here](https://ollama.com/download).
```
### 2. Pick a client environment
@ -35,15 +38,20 @@ The API is **exactly identical** for both clients.
:::{dropdown} Starting up the Llama Stack server
The Llama Stack server can be configured flexibly so you can mix-and-match various providers for its individual API components -- beyond Inference, these include Vector IO, Agents, Telemetry, Evals, Post Training, etc.
To get started quickly, we provide various Docker images for the server component that work with different inference providers out of the box. For this guide, we will use `llamastack/distribution-ollama` as the Docker image.
To get started quickly, we provide various container images for the server component that work with different inference providers out of the box. For this guide, we will use `llamastack/distribution-ollama` as the container image.
Let's set up some environment variables that we will use in the rest of the guide.
```bash
INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct"
LLAMA_STACK_PORT=8321
export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct"
export LLAMA_STACK_PORT=8321
```
You can start the server using the following command:
Next you can create a local directory to mount into the container's file system.
```bash
mkdir -p ~/.llama
```
Then you can start the server using the container tool of your choice. For example, if you are running Docker, you can use the following command:
```bash
docker run -it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
@ -53,8 +61,28 @@ docker run -it \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env OLLAMA_URL=http://host.docker.internal:11434
```
As another example, to start the container with Podman, you can do the same but replace `docker` at the start of the command with `podman`. If you are using `podman` older than `4.7.0`, please also replace `host.docker.internal` in the `OLLAMA_URL` with `host.containers.internal`.
Configuration for this is available at `distributions/ollama/run.yaml`.
```{admonition} Note
:class: note
Docker containers run in their own isolated network namespaces on Linux. To allow the container to communicate with services running on the host via `localhost`, you need `--network=host`. This makes the container use the host's network directly so it can connect to Ollama running on `localhost:11434`.
Linux users having issues running the above command should instead try the following:
```bash
docker run -it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
--network=host \
llamastack/distribution-ollama \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env OLLAMA_URL=http://localhost:11434
```
:::
@ -71,8 +99,10 @@ pip install llama-stack-client
Let's use the `llama-stack-client` CLI to check the connectivity to the server.
```bash
llama-stack-client configure --endpoint http://localhost:$LLAMA_STACK_PORT
llama-stack-client models list
$ llama-stack-client configure --endpoint http://localhost:$LLAMA_STACK_PORT
> Enter the API key (leave empty if no key is needed):
Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:8321
$ llama-stack-client models list
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓
┃ identifier ┃ provider_id ┃ provider_resource_id ┃ metadata ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩
@ -95,19 +125,30 @@ llama-stack-client \
Here is a simple example to perform chat completions using the SDK.
```python
import os
import sys
def create_http_client():
from llama_stack_client import LlamaStackClient
return LlamaStackClient(base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}")
return LlamaStackClient(
base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}"
)
def create_library_client(template="ollama"):
from llama_stack import LlamaStackAsLibraryClient
client = LlamaStackAsLibraryClient(template)
client.initialize()
if not client.initialize():
print("llama stack not built properly")
sys.exit(1)
return client
client = create_library_client() # or create_http_client() depending on the environment you picked
client = (
create_library_client()
) # or create_http_client() depending on the environment you picked
# List available models
models = client.models.list()
@ -120,8 +161,8 @@ response = client.inference.chat_completion(
model_id=os.environ["INFERENCE_MODEL"],
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Write a haiku about coding"}
]
{"role": "user", "content": "Write a haiku about coding"},
],
)
print(response.completion_message.content)
```
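For longer generations you may prefer streaming. Here is a sketch of the same call with `stream=True`; the exact shape of each streamed chunk can vary between client versions, so treat the attribute access below as an assumption:

```python
# Sketch: stream tokens as they are generated.
stream = client.inference.chat_completion(
    model_id=os.environ["INFERENCE_MODEL"],
    messages=[{"role": "user", "content": "Write a haiku about coding"}],
    stream=True,
)
for chunk in stream:
    # Each chunk is assumed to carry an event with an incremental text delta.
    print(getattr(chunk.event.delta, "text", ""), end="", flush=True)
print()
```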
@ -132,6 +173,7 @@ Here is an example of a simple RAG (Retrieval Augmented Generation) chatbot agen
```python
import os
import uuid
from termcolor import cprint
from llama_stack_client.lib.agents.agent import Agent
@ -139,10 +181,29 @@ from llama_stack_client.lib.agents.event_logger import EventLogger
from llama_stack_client.types.agent_create_params import AgentConfig
from llama_stack_client.types import Document
client = create_library_client() # or create_http_client() depending on the environment you picked
def create_http_client():
from llama_stack_client import LlamaStackClient
return LlamaStackClient(
base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}"
)
def create_library_client(template="ollama"):
from llama_stack import LlamaStackAsLibraryClient
client = LlamaStackAsLibraryClient(template)
client.initialize()
return client
client = (
create_library_client()
) # or create_http_client() depending on the environment you picked
# Documents to be used for RAG
urls = ["chat.rst", "llama3.rst", "datasets.rst", "lora_finetune.rst"]
urls = ["chat.rst", "llama3.rst", "memory_optimizations.rst", "lora_finetune.rst"]
documents = [
Document(
document_id=f"num-{i}",
@ -154,7 +215,7 @@ documents = [
]
# Register a vector database
vector_db_id = "test-vector-db"
vector_db_id = f"test-vector-db-{uuid.uuid4().hex}"
client.vector_dbs.register(
vector_db_id=vector_db_id,
embedding_model="all-MiniLM-L6-v2",
@ -174,12 +235,12 @@ agent_config = AgentConfig(
instructions="You are a helpful assistant",
enable_session_persistence=False,
# Define tools available to the agent
toolgroups = [
toolgroups=[
{
"name": "builtin::rag",
"args" : {
"vector_db_ids": [vector_db_id],
}
"name": "builtin::rag",
"args": {
"vector_db_ids": [vector_db_id],
},
}
],
)
@ -193,7 +254,7 @@ user_prompts = [
# Run the agent loop by calling the `create_turn` method
for prompt in user_prompts:
cprint(f'User> {prompt}', 'green')
cprint(f"User> {prompt}", "green")
response = rag_agent.create_turn(
messages=[{"role": "user", "content": prompt}],
session_id=session_id,

View file

@ -1,7 +1,8 @@
```{admonition} News
:class: tip
Llama Stack 0.1.0 is now available! See the [release notes](https://github.com/meta-llama/llama-stack/releases/tag/v0.1.0) for more details.
Llama Stack 0.1.2 is now available! See the [release notes](https://github.com/meta-llama/llama-stack/releases/tag/v0.1.2) for more details.
```
# Llama Stack

View file

@ -1,7 +1,6 @@
{.hide-title}
# API Reference
```{eval-rst}
.. sphinxcontrib-redoc:: ../resources/llama-stack-spec.yaml
:page-title: API Reference
:expand-responses: all
```{raw} html
:file: ../../../_static/llama-stack-spec.html
```

View file

@ -12,7 +12,7 @@ This guide goes over the sets of APIs and developer experience flow of using Lla
## Evaluation Concepts
The Evaluation APIs are associated with a set of Resources as shown in the following diagram. Please visit the Resources section in our [Core Concepts](../concepts/index.md) guide for better high-level understanding.
The Evaluation APIs are associated with a set of Resources as shown in the following diagram. Please visit the Resources section in our [Core Concepts](../../concepts/index.md) guide for better high-level understanding.
![Eval Concepts](./resources/eval-concept.png)
@ -51,6 +51,7 @@ This first example walks you through how to evaluate a model candidate served by
```python
import datasets
ds = datasets.load_dataset(path="llamastack/mmmu", name="Agriculture", split="dev")
ds = ds.select_columns(["chat_completion_input", "input_query", "expected_answer"])
eval_rows = ds.to_pandas().to_dict(orient="records")
@ -79,7 +80,7 @@ system_message = {
client.eval_tasks.register(
eval_task_id="meta-reference::mmmu",
dataset_id=f"mmmu-{subset}-{split}",
scoring_functions=["basic::regex_parser_multiple_choice_answer"]
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
)
response = client.eval.evaluate_rows(
@ -98,9 +99,9 @@ response = client.eval.evaluate_rows(
"max_tokens": 4096,
"repeat_penalty": 1.0,
},
"system_message": system_message
}
}
"system_message": system_message,
},
},
)
```
@ -124,7 +125,7 @@ _ = client.datasets.register(
"input_query": {"type": "string"},
"expected_answer": {"type": "string"},
"chat_completion_input": {"type": "chat_completion_input"},
}
},
)
eval_rows = client.datasetio.get_rows_paginated(
@ -137,7 +138,7 @@ eval_rows = client.datasetio.get_rows_paginated(
client.eval_tasks.register(
eval_task_id="meta-reference::simpleqa",
dataset_id=simpleqa_dataset_id,
scoring_functions=["llm-as-judge::405b-simpleqa"]
scoring_functions=["llm-as-judge::405b-simpleqa"],
)
response = client.eval.evaluate_rows(
@ -156,8 +157,8 @@ response = client.eval.evaluate_rows(
"max_tokens": 4096,
"repeat_penalty": 1.0,
},
}
}
},
},
)
```
@ -180,14 +181,14 @@ agent_config = {
{
"type": "brave_search",
"engine": "tavily",
"api_key": userdata.get("TAVILY_SEARCH_API_KEY")
"api_key": userdata.get("TAVILY_SEARCH_API_KEY"),
}
],
"tool_choice": "auto",
"tool_prompt_format": "json",
"input_shields": [],
"output_shields": [],
"enable_session_persistence": False
"enable_session_persistence": False,
}
response = client.eval.evaluate_rows(
@ -199,8 +200,8 @@ response = client.eval.evaluate_rows(
"eval_candidate": {
"type": "agent",
"config": agent_config,
}
}
},
},
)
```
@ -237,7 +238,9 @@ GENERATED_RESPONSE: {generated_answer}
EXPECTED_RESPONSE: {expected_answer}
"""
input_query = "What are the top 5 topics that were explained? Only list succinct bullet points."
input_query = (
"What are the top 5 topics that were explained? Only list succinct bullet points."
)
generated_answer = """
Here are the top 5 topics that were explained in the documentation for Torchtune:
@ -268,7 +271,9 @@ scoring_params = {
"braintrust::factuality": None,
}
response = client.scoring.score(input_rows=dataset_rows, scoring_functions=scoring_params)
response = client.scoring.score(
input_rows=dataset_rows, scoring_functions=scoring_params
)
```
## Running Evaluations via CLI

View file

@ -33,7 +33,11 @@ from llama_stack_client.types import (
Types:
```python
from llama_stack_client.types import ListToolGroupsResponse, ToolGroup, ToolgroupListResponse
from llama_stack_client.types import (
ListToolGroupsResponse,
ToolGroup,
ToolgroupListResponse,
)
```
Methods:
@ -444,7 +448,11 @@ Methods:
Types:
```python
from llama_stack_client.types import EvalTask, ListEvalTasksResponse, EvalTaskListResponse
from llama_stack_client.types import (
EvalTask,
ListEvalTasksResponse,
EvalTaskListResponse,
)
```
Methods:

View file

@ -45,7 +45,7 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
---
## Install Dependencies and Set Up Environmen
## Install Dependencies and Set Up Environment
1. **Create a Conda Environment**:
Create a new Conda environment with Python 3.10:
@ -73,7 +73,7 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
Open a new terminal and install `llama-stack`:
```bash
conda activate ollama
pip install llama-stack==0.0.61
pip install llama-stack==0.1.0
```
---
@ -110,7 +110,7 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
--env SAFETY_MODEL=$SAFETY_MODEL
--env OLLAMA_URL=$OLLAMA_URL
```
Note: Everytime you run a new model with `ollama run`, you will need to restart the llama stack. Otherwise it won't see the new model.
Note: Every time you run a new model with `ollama run`, you will need to restart the llama stack. Otherwise it won't see the new model.
The server will start and listen on `http://localhost:5001`.
@ -191,7 +191,7 @@ You can check the available models with the command `llama-stack-client models l
You can also interact with the Llama Stack server using a simple Python script. Below is an example:
### 1. Activate Conda Environmen
### 1. Activate Conda Environment
```bash
conda activate ollama
@ -208,7 +208,7 @@ In `test_llama_stack.py`, write the following code:
```python
import os
from llama_stack_client import LlamaStackClien
from llama_stack_client import LlamaStackClient
# Get the model ID from the environment variable
INFERENCE_MODEL = os.environ.get("INFERENCE_MODEL")
@ -224,7 +224,7 @@ client = LlamaStackClient(base_url="http://localhost:5001")
response = client.inference.chat_completion(
messages=[
{"role": "system", "content": "You are a friendly assistant."},
{"role": "user", "content": "Write a two-sentence poem about llama."}
{"role": "user", "content": "Write a two-sentence poem about llama."},
],
model_id=INFERENCE_MODEL,
)

View file

@ -15,20 +15,21 @@ from typing import (
Literal,
Optional,
Protocol,
runtime_checkable,
Union,
runtime_checkable,
)
from llama_models.schema_utils import json_schema_type, register_schema, webmethod
from pydantic import BaseModel, ConfigDict, Field
from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent, URL
from llama_stack.apis.common.content_types import URL, ContentDelta, InterleavedContent
from llama_stack.apis.inference import (
CompletionMessage,
ResponseFormat,
SamplingParams,
ToolCall,
ToolChoice,
ToolConfig,
ToolPromptFormat,
ToolResponse,
ToolResponseMessage,
@ -86,9 +87,7 @@ class ShieldCallStep(StepCommon):
@json_schema_type
class MemoryRetrievalStep(StepCommon):
step_type: Literal[StepType.memory_retrieval.value] = (
StepType.memory_retrieval.value
)
step_type: Literal[StepType.memory_retrieval.value] = StepType.memory_retrieval.value
vector_db_ids: str
inserted_context: InterleavedContent
@ -118,7 +117,7 @@ class Turn(BaseModel):
]
steps: List[Step]
output_message: CompletionMessage
output_attachments: List[Attachment] = Field(default_factory=list)
output_attachments: Optional[List[Attachment]] = Field(default_factory=list)
started_at: datetime
completed_at: Optional[datetime] = None
@ -155,10 +154,25 @@ class AgentConfigCommon(BaseModel):
output_shields: Optional[List[str]] = Field(default_factory=list)
toolgroups: Optional[List[AgentToolGroup]] = Field(default_factory=list)
client_tools: Optional[List[ToolDef]] = Field(default_factory=list)
tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
tool_choice: Optional[ToolChoice] = Field(default=None, deprecated="use tool_config instead")
tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None, deprecated="use tool_config instead")
tool_config: Optional[ToolConfig] = Field(default=None)
max_infer_iters: int = 10
max_infer_iters: Optional[int] = 10
def model_post_init(self, __context):
if self.tool_config:
if self.tool_choice and self.tool_config.tool_choice != self.tool_choice:
raise ValueError("tool_choice is deprecated. Use tool_choice in tool_config instead.")
if self.tool_prompt_format and self.tool_config.tool_prompt_format != self.tool_prompt_format:
raise ValueError("tool_prompt_format is deprecated. Use tool_prompt_format in tool_config instead.")
else:
params = {}
if self.tool_choice:
params["tool_choice"] = self.tool_choice
if self.tool_prompt_format:
params["tool_prompt_format"] = self.tool_prompt_format
self.tool_config = ToolConfig(**params)
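The `model_post_init` hook above folds the deprecated `tool_choice` / `tool_prompt_format` fields into `tool_config`. A minimal migration sketch, assuming the import path used elsewhere in this diff:

```python
from llama_stack.apis.inference import ToolChoice, ToolConfig, ToolPromptFormat

# Prefer tool_config over the deprecated top-level fields.
tool_config = ToolConfig(
    tool_choice=ToolChoice.auto,
    tool_prompt_format=ToolPromptFormat.json,
)
# Supplying tool_choice/tool_prompt_format alongside a conflicting tool_config
# raises ValueError in model_post_init above.
```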
@json_schema_type
@ -184,9 +198,7 @@ class AgentTurnResponseEventType(Enum):
@json_schema_type
class AgentTurnResponseStepStartPayload(BaseModel):
event_type: Literal[AgentTurnResponseEventType.step_start.value] = (
AgentTurnResponseEventType.step_start.value
)
event_type: Literal[AgentTurnResponseEventType.step_start.value] = AgentTurnResponseEventType.step_start.value
step_type: StepType
step_id: str
metadata: Optional[Dict[str, Any]] = Field(default_factory=dict)
@ -194,9 +206,7 @@ class AgentTurnResponseStepStartPayload(BaseModel):
@json_schema_type
class AgentTurnResponseStepCompletePayload(BaseModel):
event_type: Literal[AgentTurnResponseEventType.step_complete.value] = (
AgentTurnResponseEventType.step_complete.value
)
event_type: Literal[AgentTurnResponseEventType.step_complete.value] = AgentTurnResponseEventType.step_complete.value
step_type: StepType
step_id: str
step_details: Step
@ -206,9 +216,7 @@ class AgentTurnResponseStepCompletePayload(BaseModel):
class AgentTurnResponseStepProgressPayload(BaseModel):
model_config = ConfigDict(protected_namespaces=())
event_type: Literal[AgentTurnResponseEventType.step_progress.value] = (
AgentTurnResponseEventType.step_progress.value
)
event_type: Literal[AgentTurnResponseEventType.step_progress.value] = AgentTurnResponseEventType.step_progress.value
step_type: StepType
step_id: str
@ -217,17 +225,13 @@ class AgentTurnResponseStepProgressPayload(BaseModel):
@json_schema_type
class AgentTurnResponseTurnStartPayload(BaseModel):
event_type: Literal[AgentTurnResponseEventType.turn_start.value] = (
AgentTurnResponseEventType.turn_start.value
)
event_type: Literal[AgentTurnResponseEventType.turn_start.value] = AgentTurnResponseEventType.turn_start.value
turn_id: str
@json_schema_type
class AgentTurnResponseTurnCompletePayload(BaseModel):
event_type: Literal[AgentTurnResponseEventType.turn_complete.value] = (
AgentTurnResponseEventType.turn_complete.value
)
event_type: Literal[AgentTurnResponseEventType.turn_complete.value] = AgentTurnResponseEventType.turn_complete.value
turn: Turn
@ -280,6 +284,7 @@ class AgentTurnCreateRequest(AgentConfigOverridablePerTurn):
toolgroups: Optional[List[AgentToolGroup]] = None
stream: Optional[bool] = False
tool_config: Optional[ToolConfig] = None
@json_schema_type
@ -297,6 +302,16 @@ class AgentStepResponse(BaseModel):
@runtime_checkable
@trace_protocol
class Agents(Protocol):
"""Agents API for creating and interacting with agentic systems.
Main functionalities provided by this API:
- Create agents with specific instructions and ability to use tools.
- Interactions with agents are grouped into sessions ("threads"), and each interaction is called a "turn".
- Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).
- Agents can be provided with various shields (see the Safety API for more details).
- Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details.
"""
@webmethod(route="/agents", method="POST")
async def create_agent(
self,
@ -317,10 +332,12 @@ class Agents(Protocol):
stream: Optional[bool] = False,
documents: Optional[List[Document]] = None,
toolgroups: Optional[List[AgentToolGroup]] = None,
tool_config: Optional[ToolConfig] = None,
) -> Union[Turn, AsyncIterator[AgentTurnResponseStreamChunk]]: ...
@webmethod(
route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}", method="GET"
route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}",
method="GET",
)
async def get_agents_turn(
self,

View file

@ -13,7 +13,6 @@ from termcolor import cprint
from llama_stack.apis.agents import AgentTurnResponseEventType, StepType
from llama_stack.apis.common.content_types import ToolCallParseStatus
from llama_stack.apis.inference import ToolResponseMessage
from llama_stack.providers.utils.inference.prompt_adapter import (
interleaved_content_as_str,
)
@ -63,9 +62,7 @@ class EventLogger:
if isinstance(chunk, ToolResponseMessage):
yield (
chunk,
LogEvent(
role="CustomTool", content=chunk.content, color="grey"
),
LogEvent(role="CustomTool", content=chunk.content, color="grey"),
)
continue
@ -81,17 +78,12 @@ class EventLogger:
step_type = event.payload.step_type
# handle safety
if (
step_type == StepType.shield_call
and event_type == EventType.step_complete.value
):
if step_type == StepType.shield_call and event_type == EventType.step_complete.value:
violation = event.payload.step_details.violation
if not violation:
yield (
event,
LogEvent(
role=step_type, content="No Violation", color="magenta"
),
LogEvent(role=step_type, content="No Violation", color="magenta"),
)
else:
yield (
@ -110,9 +102,7 @@ class EventLogger:
# TODO: Currently this event is never received
yield (
event,
LogEvent(
role=step_type, content="", end="", color="yellow"
),
LogEvent(role=step_type, content="", end="", color="yellow"),
)
elif event_type == EventType.step_progress.value:
# HACK: if previous was not step/event was not inference's step_progress
@ -125,9 +115,7 @@ class EventLogger:
):
yield (
event,
LogEvent(
role=step_type, content="", end="", color="yellow"
),
LogEvent(role=step_type, content="", end="", color="yellow"),
)
delta = event.payload.delta
@ -161,9 +149,7 @@ class EventLogger:
if event_type == EventType.step_complete.value:
response = event.payload.step_details.model_response
if response.tool_calls:
content = ToolUtils.encode_tool_call(
response.tool_calls[0], tool_prompt_format
)
content = ToolUtils.encode_tool_call(response.tool_calls[0], tool_prompt_format)
else:
content = response.content
yield (
@ -202,10 +188,7 @@ class EventLogger:
),
)
if (
step_type == StepType.memory_retrieval
and event_type == EventType.step_complete.value
):
if step_type == StepType.memory_retrieval and event_type == EventType.step_complete.value:
details = event.payload.step_details
inserted_context = interleaved_content_as_str(details.inserted_context)
content = f"fetched {len(inserted_context)} bytes from {details.vector_db_ids}"

View file

@ -7,13 +7,15 @@
from typing import List, Optional, Protocol, runtime_checkable
from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel, Field
from pydantic import BaseModel
from llama_stack.apis.inference import (
CompletionMessage,
ChatCompletionResponse,
CompletionResponse,
InterleavedContent,
LogProbConfig,
Message,
ResponseFormat,
SamplingParams,
ToolChoice,
ToolDefinition,
@ -21,35 +23,14 @@ from llama_stack.apis.inference import (
)
@json_schema_type
class BatchCompletionRequest(BaseModel):
model: str
content_batch: List[InterleavedContent]
sampling_params: Optional[SamplingParams] = SamplingParams()
logprobs: Optional[LogProbConfig] = None
@json_schema_type
class BatchCompletionResponse(BaseModel):
completion_message_batch: List[CompletionMessage]
@json_schema_type
class BatchChatCompletionRequest(BaseModel):
model: str
messages_batch: List[List[Message]]
sampling_params: Optional[SamplingParams] = SamplingParams()
# zero-shot tool definitions as input to the model
tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
logprobs: Optional[LogProbConfig] = None
batch: List[CompletionResponse]
@json_schema_type
class BatchChatCompletionResponse(BaseModel):
completion_message_batch: List[CompletionMessage]
batch: List[ChatCompletionResponse]
@runtime_checkable
@ -60,6 +41,7 @@ class BatchInference(Protocol):
model: str,
content_batch: List[InterleavedContent],
sampling_params: Optional[SamplingParams] = SamplingParams(),
response_format: Optional[ResponseFormat] = None,
logprobs: Optional[LogProbConfig] = None,
) -> BatchCompletionResponse: ...
@ -73,5 +55,6 @@ class BatchInference(Protocol):
tools: Optional[List[ToolDefinition]] = list,
tool_choice: Optional[ToolChoice] = ToolChoice.auto,
tool_prompt_format: Optional[ToolPromptFormat] = None,
response_format: Optional[ResponseFormat] = None,
logprobs: Optional[LogProbConfig] = None,
) -> BatchChatCompletionResponse: ...

View file

@ -4,14 +4,12 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import base64
from enum import Enum
from typing import Annotated, List, Literal, Optional, Union
from llama_models.llama3.api.datatypes import ToolCall
from llama_models.schema_utils import json_schema_type, register_schema
from pydantic import BaseModel, Field, field_serializer, model_validator
from pydantic import BaseModel, Field, model_validator
@json_schema_type
@ -20,8 +18,16 @@ class URL(BaseModel):
class _URLOrData(BaseModel):
"""
A URL or a base64 encoded string
:param url: A URL of the image or data URL in the format of data:image/{type};base64,{data}. Note that URL could have length limits.
:param data: base64 encoded image data as string
"""
url: Optional[URL] = None
data: Optional[bytes] = None
# data is a base64 encoded string, hint with contentEncoding=base64
data: Optional[str] = Field(contentEncoding="base64", default=None)
@model_validator(mode="before")
@classmethod
@ -30,21 +36,27 @@ class _URLOrData(BaseModel):
return values
return {"url": values}
@field_serializer("data")
def serialize_data(self, data: Optional[bytes], _info):
if data is None:
return None
return base64.b64encode(data).decode("utf-8")
@json_schema_type
class ImageContentItem(BaseModel):
"""A image content item
:param type: Discriminator type of the content item. Always "image"
:param image: Image as a base64 encoded string or an URL
"""
type: Literal["image"] = "image"
image: _URLOrData
@json_schema_type
class TextContentItem(BaseModel):
"""A text content item
:param type: Discriminator type of the content item. Always "text"
:param text: Text content
"""
type: Literal["text"] = "text"
text: str
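Since `_URLOrData.data` is now a base64-encoded string rather than raw bytes, callers encode image files themselves. A minimal sketch, assuming `ImageContentItem` and `TextContentItem` are importable from the content_types module shown above and using a placeholder file name:

```python
import base64

from llama_stack.apis.common.content_types import ImageContentItem, TextContentItem

# "llama.png" is a placeholder path used only for illustration.
with open("llama.png", "rb") as f:
    encoded = base64.b64encode(f.read()).decode("utf-8")

image_item = ImageContentItem(image={"data": encoded})
text_item = TextContentItem(text="Describe this image.")
```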
@ -77,7 +89,6 @@ class ImageDelta(BaseModel):
image: bytes
@json_schema_type
class ToolCallParseStatus(Enum):
started = "started"
in_progress = "in_progress"

View file

@ -8,7 +8,6 @@ from enum import Enum
from typing import Any, Dict, Optional
from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel
from llama_stack.apis.common.content_types import URL

View file

@ -39,6 +39,4 @@ class DatasetIO(Protocol):
) -> PaginatedRowsResult: ...
@webmethod(route="/datasetio/rows", method="POST")
async def append_rows(
self, dataset_id: str, rows: List[Dict[str, Any]]
) -> None: ...
async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None: ...

View file

@ -58,7 +58,7 @@ class Datasets(Protocol):
metadata: Optional[Dict[str, Any]] = None,
) -> None: ...
@webmethod(route="/datasets/{dataset_id}", method="GET")
@webmethod(route="/datasets/{dataset_id:path}", method="GET")
async def get_dataset(
self,
dataset_id: str,
@ -67,7 +67,7 @@ class Datasets(Protocol):
@webmethod(route="/datasets", method="GET")
async def list_datasets(self) -> ListDatasetsResponse: ...
@webmethod(route="/datasets/{dataset_id}", method="DELETE")
@webmethod(route="/datasets/{dataset_id:path}", method="DELETE")
async def unregister_dataset(
self,
dataset_id: str,

View file

@ -63,9 +63,7 @@ class AppEvalTaskConfig(BaseModel):
EvalTaskConfig = register_schema(
Annotated[
Union[BenchmarkEvalTaskConfig, AppEvalTaskConfig], Field(discriminator="type")
],
Annotated[Union[BenchmarkEvalTaskConfig, AppEvalTaskConfig], Field(discriminator="type")],
name="EvalTaskConfig",
)

View file

@ -13,8 +13,8 @@ from typing import (
Literal,
Optional,
Protocol,
runtime_checkable,
Union,
runtime_checkable,
)
from llama_models.llama3.api.datatypes import (
@ -31,15 +31,27 @@ from typing_extensions import Annotated
from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent
from llama_stack.apis.models import Model
from llama_stack.apis.telemetry.telemetry import MetricResponseMixin
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
class LogProbConfig(BaseModel):
"""
:param top_k: How many tokens (for each position) to return log probabilities for.
"""
top_k: Optional[int] = 0
@json_schema_type
class QuantizationType(Enum):
"""Type of model quantization to run inference with.
:cvar bf16: BFloat16 typically this means _no_ quantization
:cvar fp8: 8-bit floating point quantization
:cvar int4: 4-bit integer quantization
"""
bf16 = "bf16"
fp8 = "fp8"
int4 = "int4"
@ -57,6 +69,12 @@ class Bf16QuantizationConfig(BaseModel):
@json_schema_type
class Int4QuantizationConfig(BaseModel):
"""Configuration for 4-bit integer quantization.
:param type: Must be "int4" to identify this quantization type
:param scheme: Quantization scheme to use. Defaults to "int4_weight_int8_dynamic_activation"
"""
type: Literal["int4"] = "int4"
scheme: Optional[str] = "int4_weight_int8_dynamic_activation"
@ -69,6 +87,13 @@ QuantizationConfig = Annotated[
@json_schema_type
class UserMessage(BaseModel):
"""A message from the user in a chat conversation.
:param role: Must be "user" to identify this as a user message
:param content: The content of the message, which can include text and other media
:param context: (Optional) This field is used internally by Llama Stack to pass RAG context. This field may be removed in the API in the future.
"""
role: Literal["user"] = "user"
content: InterleavedContent
context: Optional[InterleavedContent] = None
@ -76,15 +101,27 @@ class UserMessage(BaseModel):
@json_schema_type
class SystemMessage(BaseModel):
"""A system message providing instructions or context to the model.
:param role: Must be "system" to identify this as a system message
:param content: The content of the "system prompt". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions).
"""
role: Literal["system"] = "system"
content: InterleavedContent
@json_schema_type
class ToolResponseMessage(BaseModel):
"""A message representing the result of a tool invocation.
:param role: Must be "tool" to identify this as a tool response
:param call_id: Unique identifier for the tool call this response is for
:param tool_name: Name of the tool that was called
:param content: The response content from the tool
"""
role: Literal["tool"] = "tool"
# it was nice to re-use the ToolResponse type, but having all messages
# have a `content` type makes things nicer too
call_id: str
tool_name: Union[BuiltinTool, str]
content: InterleavedContent
@ -92,10 +129,21 @@ class ToolResponseMessage(BaseModel):
@json_schema_type
class CompletionMessage(BaseModel):
"""A message containing the model's (assistant) response in a chat conversation.
:param role: Must be "assistant" to identify this as the model's response
:param content: The content of the model's response
:param stop_reason: Reason why the model stopped generating. Options are:
- `StopReason.end_of_turn`: The model finished generating the entire response.
- `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response.
- `StopReason.out_of_tokens`: The model ran out of token budget.
:param tool_calls: List of tool calls. Each tool call is a ToolCall object.
"""
role: Literal["assistant"] = "assistant"
content: InterleavedContent
stop_reason: StopReason
tool_calls: List[ToolCall] = Field(default_factory=list)
tool_calls: Optional[List[ToolCall]] = Field(default_factory=list)
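The message docstrings above spell out the per-role fields; for example, `UserMessage.context` is the hook Llama Stack uses to pass RAG context. A small illustrative sketch:

```python
from llama_stack.apis.inference import SystemMessage, UserMessage

system = SystemMessage(content="You are a concise assistant.")
user = UserMessage(
    content="Summarize the attached notes.",
    # Optional RAG context; normally injected by Llama Stack itself.
    context="Notes: llamas are camelids native to South America.",
)
```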
Message = register_schema(
@ -129,19 +177,35 @@ class ToolResponse(BaseModel):
return v
@json_schema_type
class ToolChoice(Enum):
"""Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model.
:cvar auto: The model may use tools if it determines that is appropriate.
:cvar required: The model must use tools.
"""
auto = "auto"
required = "required"
@json_schema_type
class TokenLogProbs(BaseModel):
"""Log probabilities for generated tokens.
:param logprobs_by_token: Dictionary mapping tokens to their log probabilities
"""
logprobs_by_token: Dict[str, float]
@json_schema_type
class ChatCompletionResponseEventType(Enum):
"""Types of events that can occur during chat completion.
:cvar start: Inference has started
:cvar complete: Inference is complete and a full response is available
:cvar progress: Inference is in progress and a partial response is available
"""
start = "start"
complete = "complete"
progress = "progress"
@ -149,7 +213,13 @@ class ChatCompletionResponseEventType(Enum):
@json_schema_type
class ChatCompletionResponseEvent(BaseModel):
"""Chat completion response event."""
"""An event during chat completion generation.
:param event_type: Type of the event
:param delta: Content generated since last event. This can be one or more tokens, or a tool call.
:param logprobs: Optional log probabilities for generated tokens
:param stop_reason: Optional reason why generation stopped, if complete
"""
event_type: ChatCompletionResponseEventType
delta: ContentDelta
@ -157,22 +227,37 @@ class ChatCompletionResponseEvent(BaseModel):
stop_reason: Optional[StopReason] = None
@json_schema_type
class ResponseFormatType(Enum):
"""Types of formats for structured (guided) decoding.
:cvar json_schema: Response should conform to a JSON schema. In a Python SDK, this is often a `pydantic` model.
:cvar grammar: Response should conform to a BNF grammar
"""
json_schema = "json_schema"
grammar = "grammar"
@json_schema_type
class JsonSchemaResponseFormat(BaseModel):
type: Literal[ResponseFormatType.json_schema.value] = (
ResponseFormatType.json_schema.value
)
"""Configuration for JSON schema-guided response generation.
:param type: Must be "json_schema" to identify this format type
:param json_schema: The JSON schema the response should conform to. In a Python SDK, this is often a `pydantic` model.
"""
type: Literal[ResponseFormatType.json_schema.value] = ResponseFormatType.json_schema.value
json_schema: Dict[str, Any]
@json_schema_type
class GrammarResponseFormat(BaseModel):
"""Configuration for grammar-guided response generation.
:param type: Must be "grammar" to identify this format type
:param bnf: The BNF grammar specification the response should conform to
"""
type: Literal[ResponseFormatType.grammar.value] = ResponseFormatType.grammar.value
bnf: Dict[str, Any]
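A short sketch of the `json_schema` response format, assuming the classes above are importable from `llama_stack.apis.inference` and using a hypothetical pydantic model as the schema:

```python
from typing import List

from pydantic import BaseModel

from llama_stack.apis.inference import JsonSchemaResponseFormat


class Recipe(BaseModel):
    # Hypothetical schema used only for illustration.
    name: str
    steps: List[str]


# Guide decoding so the completion conforms to Recipe's JSON schema.
response_format = JsonSchemaResponseFormat(json_schema=Recipe.model_json_schema())
```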
@ -186,20 +271,24 @@ ResponseFormat = register_schema(
)
@json_schema_type
# This is an internally used class
class CompletionRequest(BaseModel):
model: str
content: InterleavedContent
sampling_params: Optional[SamplingParams] = SamplingParams()
response_format: Optional[ResponseFormat] = None
stream: Optional[bool] = False
logprobs: Optional[LogProbConfig] = None
@json_schema_type
class CompletionResponse(BaseModel):
"""Completion response."""
"""Response from a completion request.
:param content: The generated completion text
:param stop_reason: Reason why generation stopped
:param logprobs: Optional log probabilities for generated tokens
"""
content: str
stop_reason: StopReason
@ -208,80 +297,95 @@ class CompletionResponse(BaseModel):
@json_schema_type
class CompletionResponseStreamChunk(BaseModel):
"""streamed completion response."""
"""A chunk of a streamed completion response.
:param delta: New content generated since last chunk. This can be one or more tokens.
:param stop_reason: Optional reason why generation stopped, if complete
:param logprobs: Optional log probabilities for generated tokens
"""
delta: str
stop_reason: Optional[StopReason] = None
logprobs: Optional[List[TokenLogProbs]] = None
@json_schema_type
class BatchCompletionRequest(BaseModel):
model: str
content_batch: List[InterleavedContent]
sampling_params: Optional[SamplingParams] = SamplingParams()
response_format: Optional[ResponseFormat] = None
logprobs: Optional[LogProbConfig] = None
class SystemMessageBehavior(Enum):
"""Config for how to override the default system prompt.
:cvar append: Appends the provided system message to the default system prompt:
https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_2/#-function-definitions-in-the-system-prompt-
:cvar replace: Replaces the default system prompt with the provided system message. The system message can include the string
'{{function_definitions}}' to indicate where the function definitions should be inserted.
"""
append = "append"
replace = "replace"
@json_schema_type
class BatchCompletionResponse(BaseModel):
"""Batch completion response."""
class ToolConfig(BaseModel):
"""Configuration for tool use.
batch: List[CompletionResponse]
:param tool_choice: (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
:param tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model.
- `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
- `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag.
- `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls.
:param system_message_behavior: (Optional) Config for how to override the default system prompt.
- `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt.
- `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. The system message can include the string
'{{function_definitions}}' to indicate where the function definitions should be inserted.
"""
tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
system_message_behavior: SystemMessageBehavior = Field(default=SystemMessageBehavior.append)
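A sketch of `SystemMessageBehavior.replace`: the caller supplies the entire system prompt and marks where the auto-generated tool definitions should be spliced in (import path assumed from this module):

```python
from llama_stack.apis.inference import SystemMessage, SystemMessageBehavior, ToolConfig

tool_config = ToolConfig(system_message_behavior=SystemMessageBehavior.replace)
system = SystemMessage(
    # '{{function_definitions}}' marks where tool definitions are inserted.
    content="You are a terse assistant.\n{{function_definitions}}"
)
```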
# This is an internally used class
@json_schema_type
class ChatCompletionRequest(BaseModel):
model: str
messages: List[Message]
sampling_params: Optional[SamplingParams] = SamplingParams()
# zero-shot tool definitions as input to the model
tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
response_format: Optional[ResponseFormat] = None
tool_config: Optional[ToolConfig] = Field(default_factory=ToolConfig)
response_format: Optional[ResponseFormat] = None
stream: Optional[bool] = False
logprobs: Optional[LogProbConfig] = None
@json_schema_type
class ChatCompletionResponseStreamChunk(BaseModel):
"""SSE-stream of these events."""
class ChatCompletionResponseStreamChunk(MetricResponseMixin, BaseModel):
"""A chunk of a streamed chat completion response.
:param event: The event containing the new content
"""
event: ChatCompletionResponseEvent
@json_schema_type
class ChatCompletionResponse(BaseModel):
"""Chat completion response."""
class ChatCompletionResponse(MetricResponseMixin, BaseModel):
"""Response from a chat completion request.
:param completion_message: The complete response message
:param logprobs: Optional log probabilities for generated tokens
"""
completion_message: CompletionMessage
logprobs: Optional[List[TokenLogProbs]] = None
@json_schema_type
class BatchChatCompletionRequest(BaseModel):
model: str
messages_batch: List[List[Message]]
sampling_params: Optional[SamplingParams] = SamplingParams()
# zero-shot tool definitions as input to the model
tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
logprobs: Optional[LogProbConfig] = None
@json_schema_type
class BatchChatCompletionResponse(BaseModel):
batch: List[ChatCompletionResponse]
@json_schema_type
class EmbeddingsResponse(BaseModel):
"""Response containing generated embeddings.
:param embeddings: List of embedding vectors, one per input content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}
"""
embeddings: List[List[float]]
@ -292,6 +396,13 @@ class ModelStore(Protocol):
@runtime_checkable
@trace_protocol
class Inference(Protocol):
"""Llama Stack Inference API for generating completions, chat completions, and embeddings.
This API provides the raw interface to the underlying models. Two kinds of models are supported:
- LLM models: these models generate "raw" and "chat" (conversational) completions.
- Embedding models: these models generate embeddings to be used for semantic search.
"""
model_store: ModelStore
@webmethod(route="/inference/completion", method="POST")
@ -303,7 +414,19 @@ class Inference(Protocol):
response_format: Optional[ResponseFormat] = None,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]: ...
) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]:
"""Generate a completion for the given content using the specified model.
:param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
:param content: The content to generate a completion for
:param sampling_params: (Optional) Parameters to control the sampling strategy
:param response_format: (Optional) Grammar specification for guided (structured) decoding
:param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False.
:param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
:returns: If stream=False, returns a CompletionResponse with the full completion.
If stream=True, returns an SSE event stream of CompletionResponseStreamChunk
"""
...
@webmethod(route="/inference/chat-completion", method="POST")
async def chat_completion(
@ -311,20 +434,50 @@ class Inference(Protocol):
model_id: str,
messages: List[Message],
sampling_params: Optional[SamplingParams] = SamplingParams(),
# zero-shot tool definitions as input to the model
tools: Optional[List[ToolDefinition]] = None,
tool_choice: Optional[ToolChoice] = ToolChoice.auto,
tool_prompt_format: Optional[ToolPromptFormat] = None,
response_format: Optional[ResponseFormat] = None,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
) -> Union[
ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]
]: ...
tool_config: Optional[ToolConfig] = None,
) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]]:
"""Generate a chat completion for the given messages using the specified model.
:param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
:param messages: List of messages in the conversation
:param sampling_params: Parameters to control the sampling strategy
:param tools: (Optional) List of tool definitions available to the model
:param tool_choice: (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
.. deprecated::
Use tool_config instead.
:param tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model.
- `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
- `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag.
- `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls.
.. deprecated::
Use tool_config instead.
:param response_format: (Optional) Grammar specification for guided (structured) decoding. There are two options:
- `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format.
- `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it.
:param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False.
:param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
:param tool_config: (Optional) Configuration for tool use.
:returns: If stream=False, returns a ChatCompletionResponse with the full completion.
If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk
"""
...
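On the client side, the streaming variant of this endpoint yields `ChatCompletionResponseStreamChunk` events. A minimal sketch reusing the `client` and `INFERENCE_MODEL` from the earlier script; the exact attribute layout of text deltas is an assumption:

```python
response = client.inference.chat_completion(
    model_id=INFERENCE_MODEL,
    messages=[{"role": "user", "content": "Write a haiku about llamas."}],
    stream=True,
)
for chunk in response:
    # Each chunk wraps a ChatCompletionResponseEvent; text deltas are assumed
    # to expose a `.text` attribute.
    delta = chunk.event.delta
    print(getattr(delta, "text", ""), end="", flush=True)
print()
```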
@webmethod(route="/inference/embeddings", method="POST")
async def embeddings(
self,
model_id: str,
contents: List[InterleavedContent],
) -> EmbeddingsResponse: ...
) -> EmbeddingsResponse:
"""Generate embeddings for content pieces using the specified model.
:param model_id: The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint.
:param contents: List of contents to generate embeddings for. Note that content can be multimodal. The behavior depends on the model and provider. Some models may only support text.
:returns: An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}
"""
...
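A corresponding client-side sketch for embeddings, assuming the Python client mirrors the `/inference/embeddings` route and using a placeholder embedding model identifier:

```python
result = client.inference.embeddings(
    model_id="all-MiniLM-L6-v2",  # placeholder; use a registered embedding model
    contents=["What is Llama Stack?", "How do agents use tools?"],
)
# One embedding vector per input; dimensionality is model-specific.
print(len(result.embeddings), len(result.embeddings[0]))
```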

View file

@ -62,7 +62,7 @@ class Models(Protocol):
@webmethod(route="/models", method="GET")
async def list_models(self) -> ListModelsResponse: ...
@webmethod(route="/models/{model_id}", method="GET")
@webmethod(route="/models/{model_id:path}", method="GET")
async def get_model(
self,
model_id: str,
@ -78,7 +78,7 @@ class Models(Protocol):
model_type: Optional[ModelType] = None,
) -> Model: ...
@webmethod(route="/models/{model_id}", method="DELETE")
@webmethod(route="/models/{model_id:path}", method="DELETE")
async def unregister_model(
self,
model_id: str,

View file

@ -89,9 +89,7 @@ class QATFinetuningConfig(BaseModel):
AlgorithmConfig = register_schema(
Annotated[
Union[LoraFinetuningConfig, QATFinetuningConfig], Field(discriminator="type")
],
Annotated[Union[LoraFinetuningConfig, QATFinetuningConfig], Field(discriminator="type")],
name="AlgorithmConfig",
)
@ -204,14 +202,10 @@ class PostTraining(Protocol):
async def get_training_jobs(self) -> ListPostTrainingJobsResponse: ...
@webmethod(route="/post-training/job/status", method="GET")
async def get_training_job_status(
self, job_uuid: str
) -> Optional[PostTrainingJobStatusResponse]: ...
async def get_training_job_status(self, job_uuid: str) -> Optional[PostTrainingJobStatusResponse]: ...
@webmethod(route="/post-training/job/cancel", method="POST")
async def cancel_training_job(self, job_uuid: str) -> None: ...
@webmethod(route="/post-training/job/artifacts", method="GET")
async def get_training_job_artifacts(
self, job_uuid: str
) -> Optional[PostTrainingJobArtifactsResponse]: ...
async def get_training_job_artifacts(self, job_uuid: str) -> Optional[PostTrainingJobArtifactsResponse]: ...

View file

@ -6,11 +6,9 @@
from enum import Enum
from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel, Field
@json_schema_type
class ResourceType(Enum):
model = "model"
shield = "shield"
@ -25,9 +23,7 @@ class ResourceType(Enum):
class Resource(BaseModel):
"""Base class for all Llama Stack resources"""
identifier: str = Field(
description="Unique identifier for this resource in llama stack"
)
identifier: str = Field(description="Unique identifier for this resource in llama stack")
provider_resource_id: str = Field(
description="Unique identifier for this resource in the provider",
@ -36,6 +32,4 @@ class Resource(BaseModel):
provider_id: str = Field(description="ID of the provider that owns this resource")
type: ResourceType = Field(
description="Type of resource (e.g. 'model', 'shield', 'vector_db', etc.)"
)
type: ResourceType = Field(description="Type of resource (e.g. 'model', 'shield', 'vector_db', etc.)")

View file

@ -12,8 +12,8 @@ from typing import (
Literal,
Optional,
Protocol,
runtime_checkable,
Union,
runtime_checkable,
)
from llama_models.schema_utils import json_schema_type, register_schema, webmethod
@ -43,9 +43,7 @@ class AggregationFunctionType(Enum):
@json_schema_type
class LLMAsJudgeScoringFnParams(BaseModel):
type: Literal[ScoringFnParamsType.llm_as_judge.value] = (
ScoringFnParamsType.llm_as_judge.value
)
type: Literal[ScoringFnParamsType.llm_as_judge.value] = ScoringFnParamsType.llm_as_judge.value
judge_model: str
prompt_template: Optional[str] = None
judge_score_regexes: Optional[List[str]] = Field(
@ -60,9 +58,7 @@ class LLMAsJudgeScoringFnParams(BaseModel):
@json_schema_type
class RegexParserScoringFnParams(BaseModel):
type: Literal[ScoringFnParamsType.regex_parser.value] = (
ScoringFnParamsType.regex_parser.value
)
type: Literal[ScoringFnParamsType.regex_parser.value] = ScoringFnParamsType.regex_parser.value
parsing_regexes: Optional[List[str]] = Field(
description="Regex to extract the answer from generated response",
default_factory=list,
@ -112,9 +108,7 @@ class CommonScoringFnFields(BaseModel):
@json_schema_type
class ScoringFn(CommonScoringFnFields, Resource):
type: Literal[ResourceType.scoring_function.value] = (
ResourceType.scoring_function.value
)
type: Literal[ResourceType.scoring_function.value] = ResourceType.scoring_function.value
@property
def scoring_fn_id(self) -> str:
@ -140,10 +134,8 @@ class ScoringFunctions(Protocol):
@webmethod(route="/scoring-functions", method="GET")
async def list_scoring_functions(self) -> ListScoringFunctionsResponse: ...
@webmethod(route="/scoring-functions/{scoring_fn_id}", method="GET")
async def get_scoring_function(
self, scoring_fn_id: str, /
) -> Optional[ScoringFn]: ...
@webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="GET")
async def get_scoring_function(self, scoring_fn_id: str, /) -> Optional[ScoringFn]: ...
@webmethod(route="/scoring-functions", method="POST")
async def register_scoring_function(

View file

@ -48,7 +48,7 @@ class Shields(Protocol):
@webmethod(route="/shields", method="GET")
async def list_shields(self) -> ListShieldsResponse: ...
@webmethod(route="/shields/{identifier}", method="GET")
@webmethod(route="/shields/{identifier:path}", method="GET")
async def get_shield(self, identifier: str) -> Optional[Shield]: ...
@webmethod(route="/shields", method="POST")

View file

@ -5,11 +5,9 @@
# the root directory of this source tree.
from enum import Enum
from typing import Any, Dict, List, Optional, Protocol, Union
from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel
from llama_stack.apis.inference import Message

View file

@ -13,10 +13,11 @@ from typing import (
Literal,
Optional,
Protocol,
runtime_checkable,
Union,
runtime_checkable,
)
from llama_models.llama3.api.datatypes import Primitive
from llama_models.schema_utils import json_schema_type, register_schema, webmethod
from pydantic import BaseModel, Field
from typing_extensions import Annotated
@ -76,7 +77,7 @@ class EventCommon(BaseModel):
trace_id: str
span_id: str
timestamp: datetime
attributes: Optional[Dict[str, Any]] = Field(default_factory=dict)
attributes: Optional[Dict[str, Primitive]] = Field(default_factory=dict)
@json_schema_type
@ -94,6 +95,30 @@ class MetricEvent(EventCommon):
unit: str
# This is a short term solution to allow inference API to return metrics
# The ideal way to do this is to have a way for all response types to include metrics
# and all metric events logged to the telemetry API to be included with the response
# To do this, we will need to augment all response types with a metrics field.
# We have hit a blocker from stainless SDK that prevents us from doing this.
# The blocker is that if we were to augment the response types that have a data field
# in them like so
# class ListModelsResponse(BaseModel):
# metrics: Optional[List[MetricEvent]] = None
# data: List[Models]
# ...
# The client SDK will need to access the data by using a .data field, which is not
# ergonomic. Stainless SDK does support unwrapping the response type, but it
# requires that the response type have only a single field.
# We will need a way in the client SDK to signal that the metrics are needed
# and if they are needed, the client SDK has to return the full response type
# without unwrapping it.
class MetricResponseMixin(BaseModel):
metrics: Optional[List[MetricEvent]] = None
@json_schema_type
class StructuredLogType(Enum):
SPAN_START = "span_start"
@ -102,9 +127,7 @@ class StructuredLogType(Enum):
@json_schema_type
class SpanStartPayload(BaseModel):
type: Literal[StructuredLogType.SPAN_START.value] = (
StructuredLogType.SPAN_START.value
)
type: Literal[StructuredLogType.SPAN_START.value] = StructuredLogType.SPAN_START.value
name: str
parent_span_id: Optional[str] = None
@ -190,9 +213,7 @@ class QuerySpanTreeResponse(BaseModel):
@runtime_checkable
class Telemetry(Protocol):
@webmethod(route="/telemetry/events", method="POST")
async def log_event(
self, event: Event, ttl_seconds: int = DEFAULT_TTL_DAYS * 86400
) -> None: ...
async def log_event(self, event: Event, ttl_seconds: int = DEFAULT_TTL_DAYS * 86400) -> None: ...
@webmethod(route="/telemetry/traces", method="GET")
async def query_traces(
@ -203,13 +224,13 @@ class Telemetry(Protocol):
order_by: Optional[List[str]] = None,
) -> QueryTracesResponse: ...
@webmethod(route="/telemetry/traces/{trace_id}", method="GET")
@webmethod(route="/telemetry/traces/{trace_id:path}", method="GET")
async def get_trace(self, trace_id: str) -> Trace: ...
@webmethod(route="/telemetry/traces/{trace_id}/spans/{span_id}", method="GET")
@webmethod(route="/telemetry/traces/{trace_id:path}/spans/{span_id:path}", method="GET")
async def get_span(self, trace_id: str, span_id: str) -> Span: ...
@webmethod(route="/telemetry/spans/{span_id}/tree", method="GET")
@webmethod(route="/telemetry/spans/{span_id:path}/tree", method="GET")
async def get_span_tree(
self,
span_id: str,

View file

@ -4,5 +4,5 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .tools import * # noqa: F401 F403
from .rag_tool import * # noqa: F401 F403
from .tools import * # noqa: F401 F403

View file

@ -11,7 +11,7 @@ from llama_models.schema_utils import json_schema_type, register_schema, webmeth
from pydantic import BaseModel, Field
from typing_extensions import Annotated, Protocol, runtime_checkable
from llama_stack.apis.common.content_types import InterleavedContent, URL
from llama_stack.apis.common.content_types import URL, InterleavedContent
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
@ -64,9 +64,7 @@ RAGQueryGeneratorConfig = register_schema(
class RAGQueryConfig(BaseModel):
# This config defines how a query is generated using the messages
# for memory bank retrieval.
query_generator_config: RAGQueryGeneratorConfig = Field(
default=DefaultRAGQueryGeneratorConfig()
)
query_generator_config: RAGQueryGeneratorConfig = Field(default=DefaultRAGQueryGeneratorConfig())
max_tokens_in_context: int = 4096
max_chunks: int = 5

View file

@ -11,7 +11,7 @@ from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel, Field
from typing_extensions import Protocol, runtime_checkable
from llama_stack.apis.common.content_types import InterleavedContent, URL
from llama_stack.apis.common.content_types import URL, InterleavedContent
from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
@ -101,7 +101,7 @@ class ToolGroups(Protocol):
"""Register a tool group"""
...
@webmethod(route="/toolgroups/{toolgroup_id}", method="GET")
@webmethod(route="/toolgroups/{toolgroup_id:path}", method="GET")
async def get_tool_group(
self,
toolgroup_id: str,
@ -117,13 +117,13 @@ class ToolGroups(Protocol):
"""List tools with optional tool group"""
...
@webmethod(route="/tools/{tool_name}", method="GET")
@webmethod(route="/tools/{tool_name:path}", method="GET")
async def get_tool(
self,
tool_name: str,
) -> Tool: ...
@webmethod(route="/toolgroups/{toolgroup_id}", method="DELETE")
@webmethod(route="/toolgroups/{toolgroup_id:path}", method="DELETE")
async def unregister_toolgroup(
self,
toolgroup_id: str,
@ -150,8 +150,6 @@ class ToolRuntime(Protocol):
) -> List[ToolDef]: ...
@webmethod(route="/tool-runtime/invoke", method="POST")
async def invoke_tool(
self, tool_name: str, kwargs: Dict[str, Any]
) -> ToolInvocationResult:
async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> ToolInvocationResult:
"""Run a tool with the given arguments"""
...

View file

@ -46,7 +46,7 @@ class VectorDBs(Protocol):
@webmethod(route="/vector-dbs", method="GET")
async def list_vector_dbs(self) -> ListVectorDBsResponse: ...
@webmethod(route="/vector-dbs/{vector_db_id}", method="GET")
@webmethod(route="/vector-dbs/{vector_db_id:path}", method="GET")
async def get_vector_db(
self,
vector_db_id: str,
@ -62,5 +62,5 @@ class VectorDBs(Protocol):
provider_vector_db_id: Optional[str] = None,
) -> VectorDB: ...
@webmethod(route="/vector-dbs/{vector_db_id}", method="DELETE")
@webmethod(route="/vector-dbs/{vector_db_id:path}", method="DELETE")
async def unregister_vector_db(self, vector_db_id: str) -> None: ...

View file

@ -16,11 +16,9 @@ from pathlib import Path
from typing import Dict, List, Optional
import httpx
from llama_models.datatypes import Model
from llama_models.sku_list import LlamaDownloadInfo
from pydantic import BaseModel, ConfigDict
from rich.console import Console
from rich.progress import (
BarColumn,
@ -147,9 +145,7 @@ class ParallelDownloader:
"follow_redirects": True,
}
async def retry_with_exponential_backoff(
self, task: DownloadTask, func, *args, **kwargs
):
async def retry_with_exponential_backoff(self, task: DownloadTask, func, *args, **kwargs):
last_exception = None
for attempt in range(task.max_retries):
try:
@ -166,13 +162,9 @@ class ParallelDownloader:
continue
raise last_exception
async def get_file_info(
self, client: httpx.AsyncClient, task: DownloadTask
) -> None:
async def get_file_info(self, client: httpx.AsyncClient, task: DownloadTask) -> None:
async def _get_info():
response = await client.head(
task.url, headers={"Accept-Encoding": "identity"}, **self.client_options
)
response = await client.head(task.url, headers={"Accept-Encoding": "identity"}, **self.client_options)
response.raise_for_status()
return response
@ -201,14 +193,10 @@ class ParallelDownloader:
return False
return os.path.getsize(task.output_file) == task.total_size
async def download_chunk(
self, client: httpx.AsyncClient, task: DownloadTask, start: int, end: int
) -> None:
async def download_chunk(self, client: httpx.AsyncClient, task: DownloadTask, start: int, end: int) -> None:
async def _download_chunk():
headers = {"Range": f"bytes={start}-{end}"}
async with client.stream(
"GET", task.url, headers=headers, **self.client_options
) as response:
async with client.stream("GET", task.url, headers=headers, **self.client_options) as response:
response.raise_for_status()
with open(task.output_file, "ab") as file:
@ -225,8 +213,7 @@ class ParallelDownloader:
await self.retry_with_exponential_backoff(task, _download_chunk)
except Exception as e:
raise DownloadError(
f"Failed to download chunk {start}-{end} after "
f"{task.max_retries} attempts: {str(e)}"
f"Failed to download chunk {start}-{end} after {task.max_retries} attempts: {str(e)}"
) from e
async def prepare_download(self, task: DownloadTask) -> None:
@ -244,9 +231,7 @@ class ParallelDownloader:
# Check if file is already downloaded
if os.path.exists(task.output_file):
if self.verify_file_integrity(task):
self.console.print(
f"[green]Already downloaded {task.output_file}[/green]"
)
self.console.print(f"[green]Already downloaded {task.output_file}[/green]")
self.progress.update(task.task_id, completed=task.total_size)
return
@ -259,9 +244,7 @@ class ParallelDownloader:
current_pos = task.downloaded_size
while current_pos < task.total_size:
chunk_end = min(
current_pos + chunk_size - 1, task.total_size - 1
)
chunk_end = min(current_pos + chunk_size - 1, task.total_size - 1)
chunks.append((current_pos, chunk_end))
current_pos = chunk_end + 1
@ -273,18 +256,12 @@ class ParallelDownloader:
raise DownloadError(f"Download failed: {str(e)}") from e
except Exception as e:
self.progress.update(
task.task_id, description=f"[red]Failed: {task.output_file}[/red]"
)
raise DownloadError(
f"Download failed for {task.output_file}: {str(e)}"
) from e
self.progress.update(task.task_id, description=f"[red]Failed: {task.output_file}[/red]")
raise DownloadError(f"Download failed for {task.output_file}: {str(e)}") from e
def has_disk_space(self, tasks: List[DownloadTask]) -> bool:
try:
total_remaining_size = sum(
task.total_size - task.downloaded_size for task in tasks
)
total_remaining_size = sum(task.total_size - task.downloaded_size for task in tasks)
dir_path = os.path.dirname(os.path.abspath(tasks[0].output_file))
free_space = shutil.disk_usage(dir_path).free
@ -314,9 +291,7 @@ class ParallelDownloader:
with self.progress:
for task in tasks:
desc = f"Downloading {Path(task.output_file).name}"
task.task_id = self.progress.add_task(
desc, total=task.total_size, completed=task.downloaded_size
)
task.task_id = self.progress.add_task(desc, total=task.total_size, completed=task.downloaded_size)
semaphore = asyncio.Semaphore(self.max_concurrent_downloads)
@ -332,9 +307,7 @@ class ParallelDownloader:
if failed_tasks:
self.console.print("\n[red]Some downloads failed:[/red]")
for task, error in failed_tasks:
self.console.print(
f"[red]- {Path(task.output_file).name}: {error}[/red]"
)
self.console.print(f"[red]- {Path(task.output_file).name}: {error}[/red]")
raise DownloadError(f"{len(failed_tasks)} downloads failed")
@ -396,11 +369,7 @@ def _meta_download(
output_file = str(output_dir / f)
url = meta_url.replace("*", f"{info.folder}/{f}")
total_size = info.pth_size if "consolidated" in f else 0
tasks.append(
DownloadTask(
url=url, output_file=output_file, total_size=total_size, max_retries=3
)
)
tasks.append(DownloadTask(url=url, output_file=output_file, total_size=total_size, max_retries=3))
# Initialize and run parallel downloader
downloader = ParallelDownloader(max_concurrent_downloads=max_concurrent_downloads)
@ -446,14 +415,10 @@ def _download_from_manifest(manifest_file: str, max_concurrent_downloads: int):
os.makedirs(output_dir, exist_ok=True)
if any(output_dir.iterdir()):
console.print(
f"[yellow]Output directory {output_dir} is not empty.[/yellow]"
)
console.print(f"[yellow]Output directory {output_dir} is not empty.[/yellow]")
while True:
resp = input(
"Do you want to (C)ontinue download or (R)estart completely? (continue/restart): "
)
resp = input("Do you want to (C)ontinue download or (R)estart completely? (continue/restart): ")
if resp.lower() in ["restart", "r"]:
shutil.rmtree(output_dir)
os.makedirs(output_dir, exist_ok=True)
@ -471,9 +436,7 @@ def _download_from_manifest(manifest_file: str, max_concurrent_downloads: int):
]
# Initialize and run parallel downloader
downloader = ParallelDownloader(
max_concurrent_downloads=max_concurrent_downloads
)
downloader = ParallelDownloader(max_concurrent_downloads=max_concurrent_downloads)
asyncio.run(downloader.download_all(tasks))

View file

@ -8,7 +8,6 @@ import argparse
import json
from llama_models.sku_list import resolve_model
from termcolor import colored
from llama_stack.cli.subcommand import Subcommand

View file

@ -38,7 +38,7 @@ class ModelList(Subcommand):
headers = [
"Model Descriptor",
"Hugging Face Repo",
"Model ID",
"Context Length",
]

View file

@ -11,7 +11,6 @@ from llama_stack.cli.model.download import ModelDownload
from llama_stack.cli.model.list import ModelList
from llama_stack.cli.model.prompt_format import ModelPromptFormat
from llama_stack.cli.model.verify_download import ModelVerifyDownload
from llama_stack.cli.subcommand import Subcommand
@ -26,6 +25,8 @@ class ModelParser(Subcommand):
description="Work with llama models",
)
self.parser.set_defaults(func=lambda args: self.parser.print_help())
subparsers = self.parser.add_subparsers(title="model_subcommands")
# Add sub-commands

View file

@ -8,7 +8,7 @@ import argparse
import textwrap
from io import StringIO
from llama_models.datatypes import CoreModelId, is_multimodal, model_family, ModelFamily
from llama_models.datatypes import CoreModelId, ModelFamily, is_multimodal, model_family
from llama_stack.cli.subcommand import Subcommand
@ -47,33 +47,20 @@ class ModelPromptFormat(Subcommand):
# Only Llama 3.1 and 3.2 are supported
supported_model_ids = [
m
for m in CoreModelId
if model_family(m) in {ModelFamily.llama3_1, ModelFamily.llama3_2}
m for m in CoreModelId if model_family(m) in {ModelFamily.llama3_1, ModelFamily.llama3_2}
]
model_str = "\n".join([m.value for m in supported_model_ids])
try:
model_id = CoreModelId(args.model_name)
except ValueError:
self.parser.error(
f"{args.model_name} is not a valid Model. Choose one from --\n{model_str}"
)
self.parser.error(f"{args.model_name} is not a valid Model. Choose one from --\n{model_str}")
if model_id not in supported_model_ids:
self.parser.error(
f"{model_id} is not a valid Model. Choose one from --\n {model_str}"
)
self.parser.error(f"{model_id} is not a valid Model. Choose one from --\n {model_str}")
llama_3_1_file = (
importlib.resources.files("llama_models") / "llama3_1/prompt_format.md"
)
llama_3_2_text_file = (
importlib.resources.files("llama_models") / "llama3_2/text_prompt_format.md"
)
llama_3_2_vision_file = (
importlib.resources.files("llama_models")
/ "llama3_2/vision_prompt_format.md"
)
llama_3_1_file = importlib.resources.files("llama_models") / "llama3_1/prompt_format.md"
llama_3_2_text_file = importlib.resources.files("llama_models") / "llama3_2/text_prompt_format.md"
llama_3_2_vision_file = importlib.resources.files("llama_models") / "llama3_2/vision_prompt_format.md"
if model_family(model_id) == ModelFamily.llama3_1:
with importlib.resources.as_file(llama_3_1_file) as f:
content = f.open("r").read()

View file

@ -9,7 +9,6 @@ from typing import Any, Dict, Optional
from llama_models.datatypes import CheckpointQuantizationFormat
from llama_models.llama3.api.datatypes import SamplingParams
from llama_models.sku_list import LlamaDownloadInfo
from pydantic import BaseModel, ConfigDict, Field
@ -17,16 +16,12 @@ class PromptGuardModel(BaseModel):
"""Make a 'fake' Model-like object for Prompt Guard. Eventually this will be removed."""
model_id: str = "Prompt-Guard-86M"
description: str = (
"Prompt Guard. NOTE: this model will not be provided via `llama` CLI soon."
)
description: str = "Prompt Guard. NOTE: this model will not be provided via `llama` CLI soon."
is_featured: bool = False
huggingface_repo: str = "meta-llama/Prompt-Guard-86M"
max_seq_length: int = 2048
is_instruct_model: bool = False
quantization_format: CheckpointQuantizationFormat = (
CheckpointQuantizationFormat.bf16
)
quantization_format: CheckpointQuantizationFormat = CheckpointQuantizationFormat.bf16
arch_args: Dict[str, Any] = Field(default_factory=dict)
recommended_sampling_params: Optional[SamplingParams] = None

View file

@ -21,8 +21,12 @@ from prompt_toolkit.validation import Validator
from termcolor import cprint
from llama_stack.cli.table import print_table
from llama_stack.distribution.build import build_image, ImageType
from llama_stack.distribution.build import (
SERVER_DEPENDENCIES,
ImageType,
build_image,
get_provider_dependencies,
)
from llama_stack.distribution.datatypes import (
BuildConfig,
DistributionSpec,
@ -35,7 +39,6 @@ from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR
from llama_stack.distribution.utils.dynamic import instantiate_class_type
from llama_stack.providers.datatypes import Api
TEMPLATES_PATH = Path(__file__).parent.parent.parent / "templates"
@ -52,9 +55,7 @@ def available_templates_specs() -> Dict[str, BuildConfig]:
return template_specs
def run_stack_build_command(
parser: argparse.ArgumentParser, args: argparse.Namespace
) -> None:
def run_stack_build_command(args: argparse.Namespace) -> None:
if args.list_templates:
return _run_template_list_cmd()
@ -74,18 +75,11 @@ def run_stack_build_command(
build_config.image_type = args.image_type
else:
cprint(
f"Please specify a image-type (docker | conda | venv) for {args.template}",
f"Please specify a image-type (container | conda | venv) for {args.template}",
color="red",
)
return
_run_stack_build_command_from_build_config(
build_config,
image_name=image_name,
template_name=args.template,
)
return
if not args.config and not args.template:
elif not args.config and not args.template:
name = prompt(
"> Enter a name for your Llama Stack (e.g. my-local-stack): ",
validator=Validator.from_callable(
@ -95,10 +89,10 @@ def run_stack_build_command(
)
image_type = prompt(
"> Enter the image type you want your Llama Stack to be built as (docker or conda or venv): ",
"> Enter the image type you want your Llama Stack to be built as (container or conda or venv): ",
validator=Validator.from_callable(
lambda x: x in ["docker", "conda", "venv"],
error_message="Invalid image type, please enter conda or docker or venv",
lambda x: x in ["container", "conda", "venv"],
error_message="Invalid image type, please enter conda or container or venv",
),
default="conda",
)
@ -132,11 +126,7 @@ def run_stack_build_command(
providers = dict()
for api, providers_for_api in get_provider_registry().items():
available_providers = [
x
for x in providers_for_api.keys()
if x not in ("remote", "remote::sample")
]
available_providers = [x for x in providers_for_api.keys() if x not in ("remote", "remote::sample")]
api_provider = prompt(
"> Enter provider for API {}: ".format(api.value),
completer=WordCompleter(available_providers),
@ -159,9 +149,7 @@ def run_stack_build_command(
description=description,
)
build_config = BuildConfig(
image_type=image_type, distribution_spec=distribution_spec
)
build_config = BuildConfig(image_type=image_type, distribution_spec=distribution_spec)
else:
with open(args.config, "r") as f:
try:
@ -180,8 +168,20 @@ def run_stack_build_command(
)
return
if args.print_deps_only:
print(f"# Dependencies for {args.template or args.config or image_name}")
normal_deps, special_deps = get_provider_dependencies(build_config.distribution_spec.providers)
normal_deps += SERVER_DEPENDENCIES
print(f"uv pip install {' '.join(normal_deps)}")
for special_dep in special_deps:
print(f"uv pip install {special_dep}")
return
_run_stack_build_command_from_build_config(
build_config, image_name=image_name, config_path=args.config
build_config,
image_name=image_name,
config_path=args.config,
template_name=args.template,
)
@ -195,9 +195,7 @@ def _generate_run_config(
"""
apis = list(build_config.distribution_spec.providers.keys())
run_config = StackRunConfig(
container_image=(
image_name if build_config.image_type == ImageType.container.value else None
),
container_image=(image_name if build_config.image_type == ImageType.container.value else None),
image_name=image_name,
apis=apis,
providers={},
@ -217,13 +215,9 @@ def _generate_run_config(
if p.deprecation_error:
raise InvalidProviderError(p.deprecation_error)
config_type = instantiate_class_type(
provider_registry[Api(api)][provider_type].config_class
)
config_type = instantiate_class_type(provider_registry[Api(api)][provider_type].config_class)
if hasattr(config_type, "sample_run_config"):
config = config_type.sample_run_config(
__distro_dir__=f"distributions/{image_name}"
)
config = config_type.sample_run_config(__distro_dir__=f"distributions/{image_name}")
else:
config = {}
@ -258,9 +252,7 @@ def _run_stack_build_command_from_build_config(
image_name = f"distribution-{template_name}"
else:
if not image_name:
raise ValueError(
"Please specify an image name when building a docker image without a template"
)
raise ValueError("Please specify an image name when building a container image without a template")
elif build_config.image_type == ImageType.conda.value:
if not image_name:
raise ValueError("Please specify an image name when building a conda image")
@ -288,10 +280,7 @@ def _run_stack_build_command_from_build_config(
if template_name:
# copy run.yaml from template to build_dir instead of generating it again
template_path = (
importlib.resources.files("llama_stack")
/ f"templates/{template_name}/run.yaml"
)
template_path = importlib.resources.files("llama_stack") / f"templates/{template_name}/run.yaml"
with importlib.resources.as_file(template_path) as path:
run_config_file = build_dir / f"{template_name}-run.yaml"
shutil.copy(path, run_config_file)

View file

@ -63,10 +63,16 @@ environment is active, you must specify a name.
),
default=None,
)
self.parser.add_argument(
"--print-deps-only",
default=False,
action="store_true",
help="Print the dependencies for the stack only, without building the stack",
)
def _run_stack_build_command(self, args: argparse.Namespace) -> None:
# always keep implementation completely silo-ed away from CLI so CLI
# can be fast to load and reduces dependencies
from ._build import run_stack_build_command
return run_stack_build_command(self.parser, args)
return run_stack_build_command(args)

View file

@ -21,15 +21,19 @@ class StackListProviders(Subcommand):
self._add_arguments()
self.parser.set_defaults(func=self._run_providers_list_cmd)
def _add_arguments(self):
from llama_stack.distribution.datatypes import Api
@property
def providable_apis(self):
from llama_stack.distribution.distribution import providable_apis
api_values = [a.value for a in Api]
return [api.value for api in providable_apis()]
def _add_arguments(self):
self.parser.add_argument(
"api",
type=str,
choices=api_values,
help="API to list providers for (one of: {})".format(api_values),
choices=self.providable_apis,
nargs="?",
help="API to list providers for. List all if not specified.",
)
def _run_providers_list_cmd(self, args: argparse.Namespace) -> None:
@ -37,20 +41,29 @@ class StackListProviders(Subcommand):
from llama_stack.distribution.distribution import Api, get_provider_registry
all_providers = get_provider_registry()
providers_for_api = all_providers[Api(args.api)]
if args.api:
providers = [(args.api, all_providers[Api(args.api)])]
else:
providers = [(k.value, prov) for k, prov in all_providers.items()]
providers = [p for api, p in providers if api in self.providable_apis]
# eventually, this should query a registry at llama.meta.com/llamastack/distributions
headers = [
"API Type",
"Provider Type",
"PIP Package Dependencies",
]
rows = []
for spec in providers_for_api.values():
if spec.provider_type == "sample":
specs = [spec for p in providers for spec in p.values()]
for spec in specs:
if spec.is_sample:
continue
rows.append(
[
spec.api.value,
spec.provider_type,
",".join(spec.pip_packages),
]
@ -59,4 +72,5 @@ class StackListProviders(Subcommand):
rows,
headers,
separate_rows=True,
sort_by=(0, 1),
)

View file

@ -55,6 +55,23 @@ class StackRun(Subcommand):
default=[],
metavar="KEY=VALUE",
)
self.parser.add_argument(
"--tls-keyfile",
type=str,
help="Path to TLS key file for HTTPS",
)
self.parser.add_argument(
"--tls-certfile",
type=str,
help="Path to TLS certificate file for HTTPS",
)
self.parser.add_argument(
"--image-type",
type=str,
help="Image Type used during the build. This can be either conda or container or venv.",
choices=["conda", "container", "venv"],
default="conda",
)
def _run_stack_run_cmd(self, args: argparse.Namespace) -> None:
import importlib.resources
@ -82,31 +99,21 @@ class StackRun(Subcommand):
if not config_file.exists() and not has_yaml_suffix:
# check if this is a template
config_file = (
Path(REPO_ROOT) / "llama_stack" / "templates" / args.config / "run.yaml"
)
config_file = Path(REPO_ROOT) / "llama_stack" / "templates" / args.config / "run.yaml"
if config_file.exists():
template_name = args.config
if not config_file.exists() and not has_yaml_suffix:
# check if it's a build config saved to conda dir
config_file = Path(
BUILDS_BASE_DIR / ImageType.conda.value / f"{args.config}-run.yaml"
)
config_file = Path(BUILDS_BASE_DIR / ImageType.conda.value / f"{args.config}-run.yaml")
if not config_file.exists() and not has_yaml_suffix:
# check if it's a build config saved to container dir
config_file = Path(
BUILDS_BASE_DIR / ImageType.container.value / f"{args.config}-run.yaml"
)
config_file = Path(BUILDS_BASE_DIR / ImageType.container.value / f"{args.config}-run.yaml")
if not config_file.exists() and not has_yaml_suffix:
# check if it's a build config saved to ~/.llama dir
config_file = Path(
DISTRIBS_BASE_DIR
/ f"llamastack-{args.config}"
/ f"{args.config}-run.yaml"
)
config_file = Path(DISTRIBS_BASE_DIR / f"llamastack-{args.config}" / f"{args.config}-run.yaml")
if not config_file.exists():
self.parser.error(
@ -118,18 +125,11 @@ class StackRun(Subcommand):
config_dict = yaml.safe_load(config_file.read_text())
config = parse_and_maybe_upgrade_config(config_dict)
if config.container_image:
script = (
importlib.resources.files("llama_stack")
/ "distribution/start_container.sh"
)
image_name = (
f"distribution-{template_name}"
if template_name
else config.container_image
)
if args.image_type == ImageType.container.value or config.container_image:
script = importlib.resources.files("llama_stack") / "distribution/start_container.sh"
image_name = f"distribution-{template_name}" if template_name else config.container_image
run_args = [script, image_name]
else:
elif args.image_type == ImageType.conda.value:
current_conda_env = os.environ.get("CONDA_DEFAULT_ENV")
image_name = args.image_name or current_conda_env
if not image_name:
@ -140,12 +140,12 @@ class StackRun(Subcommand):
return
def get_conda_prefix(env_name):
# Conda "base" environment does not end with "base" in the
# prefix, so should be handled separately.
if env_name == "base":
return os.environ.get("CONDA_PREFIX")
# Get conda environments info
conda_env_info = json.loads(
subprocess.check_output(
["conda", "info", "--envs", "--json"]
).decode()
)
conda_env_info = json.loads(subprocess.check_output(["conda", "info", "--envs", "--json"]).decode())
envs = conda_env_info["envs"]
for envpath in envs:
if envpath.endswith(env_name):
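
Because the old and new lines are interleaved in this hunk, here is the resulting helper consolidated into one runnable sketch; it assumes the `conda` executable is on PATH:

```python
import json
import os
import subprocess

def get_conda_prefix(env_name: str):
    # The "base" environment's prefix does not end with "base", so it is
    # read from CONDA_PREFIX instead of the environment listing.
    if env_name == "base":
        return os.environ.get("CONDA_PREFIX")
    # Ask conda for the environment prefixes and match on the suffix.
    conda_env_info = json.loads(subprocess.check_output(["conda", "info", "--envs", "--json"]).decode())
    for envpath in conda_env_info["envs"]:
        if envpath.endswith(env_name):
            return envpath
    return None
```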
@ -169,14 +169,20 @@ class StackRun(Subcommand):
)
return
script = (
importlib.resources.files("llama_stack")
/ "distribution/start_conda_env.sh"
)
script = importlib.resources.files("llama_stack") / "distribution/start_conda_env.sh"
run_args = [
script,
image_name,
]
else:
# else must be venv since that is the only valid option left.
current_venv = os.environ.get("VIRTUAL_ENV")
venv = args.image_name or current_venv
script = importlib.resources.files("llama_stack") / "distribution/start_venv.sh"
run_args = [
script,
venv,
]
run_args.extend([str(config_file), str(args.port)])
if args.disable_ipv6:
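
Taken together, the hunks in this file select a launcher script from the image type (or a `container_image` set in the run config). A consolidated sketch of that dispatch, with a simplified signature and assuming the `llama_stack` package is importable; the script paths come from the hunks above:

```python
import importlib.resources

def pick_start_script(image_type: str, container_image: str | None = None):
    # Simplified stand-in for the branching above: container wins if either the
    # flag or the run config asks for it, then conda, otherwise venv.
    files = importlib.resources.files("llama_stack")
    if image_type == "container" or container_image:
        return files / "distribution/start_container.sh"
    if image_type == "conda":
        return files / "distribution/start_conda_env.sh"
    # venv is the only remaining valid option
    return files / "distribution/start_venv.sh"
```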
@ -198,4 +204,7 @@ class StackRun(Subcommand):
return
run_args.extend(["--env", f"{key}={value}"])
if args.tls_keyfile and args.tls_certfile:
run_args.extend(["--tls-keyfile", args.tls_keyfile, "--tls-certfile", args.tls_certfile])
run_with_pty(run_args)

View file

@ -31,6 +31,8 @@ class StackParser(Subcommand):
version=f"{version('llama-stack')}",
)
self.parser.set_defaults(func=lambda args: self.parser.print_help())
subparsers = self.parser.add_subparsers(title="stack_subcommands")
# Add sub-commands

View file

@ -6,6 +6,7 @@
import re
import textwrap
from typing import Iterable
from termcolor import cprint
@ -22,11 +23,7 @@ def format_row(row, col_widths):
if line.strip() == "":
lines.append("")
else:
lines.extend(
textwrap.wrap(
line, width, break_long_words=False, replace_whitespace=False
)
)
lines.extend(textwrap.wrap(line, width, break_long_words=False, replace_whitespace=False))
return lines
wrapped = [wrap(item, width) for item, width in zip(row, col_widths)]
@ -43,11 +40,15 @@ def format_row(row, col_widths):
return "\n".join(lines)
def print_table(rows, headers=None, separate_rows: bool = False):
def print_table(rows, headers=None, separate_rows: bool = False, sort_by: Iterable[int] = tuple()):
def itemlen(item):
return max([len(line) for line in strip_ansi_colors(item).split("\n")])
rows = [[x or "" for x in row] for row in rows]
if sort_by:
rows.sort(key=lambda x: tuple(x[i] for i in sort_by))
if not headers:
col_widths = [max(itemlen(item) for item in col) for col in zip(*rows)]
else:

View file

@ -8,6 +8,7 @@ from datetime import datetime
import pytest
import yaml
from llama_stack.distribution.configure import (
LLAMA_STACK_RUN_CONFIG_VERSION,
parse_and_maybe_upgrade_config,
@ -41,9 +42,7 @@ def up_to_date_config():
- provider_id: provider1
provider_type: inline::meta-reference
config: {{}}
""".format(
version=LLAMA_STACK_RUN_CONFIG_VERSION, built_at=datetime.now().isoformat()
)
""".format(version=LLAMA_STACK_RUN_CONFIG_VERSION, built_at=datetime.now().isoformat())
)
@ -83,9 +82,7 @@ def old_config():
telemetry:
provider_type: noop
config: {{}}
""".format(
built_at=datetime.now().isoformat()
)
""".format(built_at=datetime.now().isoformat())
)
@ -108,10 +105,7 @@ def test_parse_and_maybe_upgrade_config_up_to_date(up_to_date_config):
def test_parse_and_maybe_upgrade_config_old_format(old_config):
result = parse_and_maybe_upgrade_config(old_config)
assert result.version == LLAMA_STACK_RUN_CONFIG_VERSION
assert all(
api in result.providers
for api in ["inference", "safety", "memory", "telemetry"]
)
assert all(api in result.providers for api in ["inference", "safety", "memory", "telemetry"])
safety_provider = result.providers["safety"][0]
assert safety_provider.provider_type == "meta-reference"
assert "llama_guard_shield" in safety_provider.config

Some files were not shown because too many files have changed in this diff