forked from phoenix-oss/llama-stack-mirror

Compare commits: kvant ... eval_api_f — 85 commits

SHA1:
7854885e5a, cbb53af701, bc0cd07008, 7f12ea290f, 97e7717c9b, 81bc051411, 5038f0e376, c2eb47d7e6,
2723b05164, 64388de068, 3f8c7a584a, 45f6d5cd08, a54d757ade, c1d18283d2, 0048274ec0, 42447729e4,
a92756a4b7, 08c0c5505e, bf135f38b1, 205a50f10b, 24d48b3692, 913e6eb50f, 820b9a00c7, 85cad639ca,
d994499f09, f107e3229b, 5e817cd56a, 398319fe7a, 238cdc4e69, b98497ee56, e860c536da, a69759613a,
a8b0467ec3, 5c0888c29a, 46f2ba5910, ade3391170, 452b2b1284, 66cd83fb58, 62abe2899a, cb492eba37,
1860751655, c80d1f906b, 035b2dcb60, d34b70e3ab, d9264a0925, 63f1525165, 5cf7779b8f, a6fa3aa5a2,
f2d93324e9, 28b8c1c815, 6f5df08ebf, a568bf3f9d, 2c9d624910, 72ccdc19a8, 5cb0ad7d7f, 39f4dfbf50,
c7d741d89e, cba4842a87, 0e2a13da9c, 7606e49dbc, a6095820af, 89885fd2fa, 78ec3d98f6, 8b80a77fae,
8a6fa41a93, 0df33049e3, b4d118fc5c, 772339bebf, 4f6f0f6a91, 4cc1958af9, 09039eca57, 790b2d5cc0,
a3173e8284, 18de4cd08a, 8942071b3b, f840018088, 31e3409909, 1d80ec7f81, 0abedd070c, 817331e76e,
0e47c65051, 02aa9a1e85, 0e8a53ab69, 8592c2b48a, bc551e6459
932 changed files with 81352 additions and 291202 deletions
@@ -1,6 +0,0 @@ (file removed)
[run]
omit =
*/tests/*
*/llama_stack/providers/*
*/llama_stack/templates/*
.venv/*
2 .github/CODEOWNERS (vendored)
@@ -2,4 +2,4 @@
 
 # These owners will be the default owners for everything in
 # the repo. Unless a later match takes precedence,
-* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning
+* @ashwinb @yanxi0830 @hardikjshah @dltn @raghotham @dineshyv @vladimirivic @sixianyi0721 @ehhuang @terrytangyuan @SLR722
10 .github/PULL_REQUEST_TEMPLATE.md (vendored)
@@ -1,8 +1,10 @@
 # What does this PR do?
-<!-- Provide a short summary of what this PR does and why. Link to relevant issues if applicable. -->
+[Provide a short summary of what this PR does and why. Link to relevant issues if applicable.]
 
-<!-- If resolving an issue, uncomment and update the line below -->
+[//]: # (If resolving an issue, uncomment and update the line below)
-<!-- Closes #[issue-number] -->
+[//]: # (Closes #[issue-number])
 
 ## Test Plan
-<!-- Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.* -->
+[Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.*]
+
+[//]: # (## Documentation)
2 .github/TRIAGERS.md (vendored)
@@ -1,2 +1,2 @@
 # This file documents Triage members in the Llama Stack community
-@bbrowning @booxter @franciscojavierarceo @leseb
+@franciscojavierarceo @leseb
26 .github/actions/setup-ollama/action.yml (vendored)
@@ -1,26 +0,0 @@ (file removed)
name: Setup Ollama
description: Start Ollama and cache model
inputs:
models:
description: Comma-separated list of models to pull
default: "llama3.2:3b-instruct-fp16,all-minilm:latest"
runs:
using: "composite"
steps:
- name: Install and start Ollama
shell: bash
run: |
# the ollama installer also starts the ollama service
curl -fsSL https://ollama.com/install.sh | sh

# Do NOT cache models - pulling the cache is actually slower than just pulling the model.
# It takes ~45 seconds to pull the models from the cache and unpack it, but only 30 seconds to
# pull them directly.
# Maybe this is because the cache is being pulled at the same time by all the matrix jobs?
- name: Pull requested models
if: inputs.models != ''
shell: bash
run: |
for model in $(echo "${{ inputs.models }}" | tr ',' ' '); do
ollama pull "$model"
done
22 .github/actions/setup-runner/action.yml (vendored)
@@ -1,22 +0,0 @@ (file removed)
name: Setup runner
description: Prepare a runner for the tests (install uv, python, project dependencies, etc.)
runs:
using: "composite"
steps:
- name: Install uv
uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
with:
python-version: "3.10"
activate-environment: true
version: 0.7.6

- name: Install dependencies
shell: bash
run: |
uv sync --all-groups
uv pip install ollama faiss-cpu
# always test against the latest version of the client
# TODO: this is not necessarily a good idea. we need to test against both published and latest
# to find out backwards compatibility issues.
uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
uv pip install -e .
1 .github/workflows/Dockerfile (vendored)
@@ -1 +0,0 @@ (file removed)
FROM localhost:5000/distribution-kvant:dev
@@ -15,13 +15,13 @@ jobs:
 pull-requests: write # for peter-evans/create-pull-request to create a PR
 runs-on: ubuntu-latest
 steps:
-- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+- uses: actions/checkout@v4
 with:
 ref: main
 fetch-depth: 0
 - run: |
 python ./scripts/gen-changelog.py
-- uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # v7.0.8
+- uses: peter-evans/create-pull-request@v7
 with:
 title: 'docs: update CHANGELOG.md for ${{ github.ref_name }}'
 commit-message: 'docs: update CHANGELOG.md for ${{ github.ref_name }}'
73 .github/workflows/ci-playground.yaml (vendored)
@@ -1,73 +0,0 @@ (file removed)
name: Build and Push playground container
run-name: Build and Push playground container
on:
workflow_dispatch:
#schedule:
# - cron: "0 10 * * *"
push:
branches:
- main
- kvant
tags:
- 'v*'
pull_request:
branches:
- main
- kvant
env:
IMAGE: git.kvant.cloud/${{github.repository}}-playground
jobs:
build-playground:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Set current time
uses: https://github.com/gerred/actions/current-time@master
id: current_time

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

- name: Login to git.kvant.cloud registry
uses: docker/login-action@v3
with:
registry: git.kvant.cloud
username: ${{ vars.ORG_PACKAGE_WRITER_USERNAME }}
password: ${{ secrets.ORG_PACKAGE_WRITER_TOKEN }}

- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
# list of Docker images to use as base name for tags
images: |
${{env.IMAGE}}
# generate Docker tags based on the following events/attributes
tags: |
type=schedule
type=ref,event=branch
type=ref,event=pr
type=ref,event=tag
type=semver,pattern={{version}}

- name: Build and push to gitea registry
uses: docker/build-push-action@v6
with:
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
context: .
file: llama_stack/distribution/ui/Containerfile
provenance: mode=max
sbom: true
build-args: |
BUILD_DATE=${{ steps.current_time.outputs.time }}
cache-from: |
type=registry,ref=${{ env.IMAGE }}:buildcache
type=registry,ref=${{ env.IMAGE }}:${{ github.ref_name }}
type=registry,ref=${{ env.IMAGE }}:main
cache-to: type=registry,ref=${{ env.IMAGE }}:buildcache,mode=max,image-manifest=true
98 .github/workflows/ci.yaml (vendored)
@@ -1,98 +0,0 @@ (file removed)
name: Build and Push container
run-name: Build and Push container
on:
workflow_dispatch:
#schedule:
# - cron: "0 10 * * *"
push:
branches:
- main
- kvant
tags:
- 'v*'
pull_request:
branches:
- main
- kvant
env:
IMAGE: git.kvant.cloud/${{github.repository}}
jobs:
build:
runs-on: ubuntu-latest
services:
registry:
image: registry:2
ports:
- 5000:5000
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Set current time
uses: https://github.com/gerred/actions/current-time@master
id: current_time

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
with:
driver-opts: network=host

- name: Login to git.kvant.cloud registry
uses: docker/login-action@v3
with:
registry: git.kvant.cloud
username: ${{ vars.ORG_PACKAGE_WRITER_USERNAME }}
password: ${{ secrets.ORG_PACKAGE_WRITER_TOKEN }}

- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
# list of Docker images to use as base name for tags
images: |
${{env.IMAGE}}
# generate Docker tags based on the following events/attributes
tags: |
type=schedule
type=ref,event=branch
type=ref,event=pr
type=ref,event=tag
type=semver,pattern={{version}}

- name: Install uv
uses: https://github.com/astral-sh/setup-uv@v5
with:
# Install a specific version of uv.
version: "0.7.8"

- name: Build
env:
USE_COPY_NOT_MOUNT: true
LLAMA_STACK_DIR: .
run: |
uvx --from . llama stack build --template kvant --image-type container

# docker tag distribution-kvant:dev ${{env.IMAGE}}:kvant
# docker push ${{env.IMAGE}}:kvant

docker tag distribution-kvant:dev localhost:5000/distribution-kvant:dev
docker push localhost:5000/distribution-kvant:dev

- name: Build and push to gitea registry
uses: docker/build-push-action@v6
with:
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
context: .github/workflows
provenance: mode=max
sbom: true
build-args: |
BUILD_DATE=${{ steps.current_time.outputs.time }}
cache-from: |
type=registry,ref=${{ env.IMAGE }}:buildcache
type=registry,ref=${{ env.IMAGE }}:${{ github.ref_name }}
type=registry,ref=${{ env.IMAGE }}:main
cache-to: type=registry,ref=${{ env.IMAGE }}:buildcache,mode=max,image-manifest=true
@@ -140,7 +140,7 @@ jobs:
 #######################
 - name: "Checkout 'meta-llama/llama-stack' repository"
 id: checkout_repo
-uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+uses: actions/checkout@v4
 with:
 ref: ${{ inputs.branch }}
 
@@ -302,7 +302,7 @@
 - name: "PR - Test Summary"
 id: pr_test_summary_create
 if: github.event_name == 'pull_request_target'
-uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86 # v2.4
+uses: test-summary/action@v2
 with:
 paths: "${{ github.workspace }}/merged-test-results.xml"
 output: test-summary.md
@@ -310,7 +310,7 @@
 - name: "PR - Upload Test Summary"
 id: pr_test_summary_upload
 if: github.event_name == 'pull_request_target'
-uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+uses: actions/upload-artifact@v4
 with:
 name: test-summary
 path: test-summary.md
@@ -320,7 +320,7 @@
 - name: "PR - Update comment"
 id: pr_update_comment
 if: github.event_name == 'pull_request_target'
-uses: thollander/actions-comment-pull-request@24bffb9b452ba05a4f3f77933840a6a841d1b32b # v3.0.1
+uses: thollander/actions-comment-pull-request@v3
 with:
 filePath: test-summary.md
 
@@ -350,6 +350,6 @@
 - name: "Manual - Test Summary"
 id: manual_test_summary
 if: always() && github.event_name == 'workflow_dispatch'
-uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86 # v2.4
+uses: test-summary/action@v2
 with:
 paths: "${{ github.workspace }}/merged-test-results.xml"
101 .github/workflows/integration-tests.yml (vendored, new file)
@@ -0,0 +1,101 @@ (file added)
name: Integration Tests

on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
paths:
- 'distributions/**'
- 'llama_stack/**'
- 'tests/integration/**'
- 'uv.lock'
- 'pyproject.toml'
- 'requirements.txt'
- '.github/workflows/integration-tests.yml' # This workflow

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
test-matrix:
runs-on: ubuntu-latest
strategy:
matrix:
# Listing tests manually since some of them currently fail
# TODO: generate matrix list from tests/integration when fixed
test-type: [inference, datasets, inspect, scoring, post_training, providers]
fail-fast: false # we want to run all tests regardless of failure

steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Install uv
uses: astral-sh/setup-uv@v5
with:
python-version: "3.10"

- name: Install Ollama
run: |
curl -fsSL https://ollama.com/install.sh | sh

- name: Pull Ollama image
run: |
ollama pull llama3.2:3b-instruct-fp16

- name: Start Ollama in background
run: |
nohup ollama run llama3.2:3b-instruct-fp16 > ollama.log 2>&1 &

- name: Set Up Environment and Install Dependencies
run: |
uv sync --extra dev --extra test
uv pip install ollama faiss-cpu
# always test against the latest version of the client
uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
uv pip install -e .
llama stack build --template ollama --image-type venv

- name: Wait for Ollama to start
run: |
echo "Waiting for Ollama..."
for i in {1..30}; do
if curl -s http://localhost:11434 | grep -q "Ollama is running"; then
echo "Ollama is running!"
exit 0
fi
sleep 1
done
echo "Ollama failed to start"
ollama ps
ollama.log
exit 1

- name: Start Llama Stack server in background
env:
INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
run: |
source .venv/bin/activate
nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv > server.log 2>&1 &

- name: Wait for Llama Stack server to be ready
run: |
echo "Waiting for Llama Stack server..."
for i in {1..30}; do
if curl -s http://localhost:8321/v1/health | grep -q "OK"; then
echo "Llama Stack server is up!"
exit 0
fi
sleep 1
done
echo "Llama Stack server failed to start"
cat server.log
exit 1

- name: Run Integration Tests
env:
INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
run: |
uv run pytest -v tests/integration/${{ matrix.test-type }} --stack-config=ollama --text-model="meta-llama/Llama-3.2-3B-Instruct" --embedding-model=all-MiniLM-L6-v2
33 .github/workflows/pre-commit.yml (vendored, new file)
@@ -0,0 +1,33 @@ (file added)
name: Pre-commit

on:
pull_request:
push:
branches: [main]

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
pre-commit:
runs-on: ubuntu-latest

steps:
- name: Checkout code
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
cache: pip
cache-dependency-path: |
**/requirements*.txt
.pre-commit-config.yaml

- uses: pre-commit/action@v3.0.1

- name: Verify if there are any diff files after pre-commit
run: |
git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1)
83 .github/workflows/providers-build.yml (vendored, new file)
@@ -0,0 +1,83 @@ (file added)
name: Test Llama Stack Build

on:
push:
branches:
- main
paths:
- 'llama_stack/cli/stack/build.py'
- 'llama_stack/cli/stack/_build.py'
- 'llama_stack/distribution/build.*'
- 'llama_stack/distribution/*.sh'
- '.github/workflows/providers-build.yml'
pull_request:
paths:
- 'llama_stack/cli/stack/build.py'
- 'llama_stack/cli/stack/_build.py'
- 'llama_stack/distribution/build.*'
- 'llama_stack/distribution/*.sh'
- '.github/workflows/providers-build.yml'

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
generate-matrix:
runs-on: ubuntu-latest
outputs:
templates: ${{ steps.set-matrix.outputs.templates }}
steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Generate Template List
id: set-matrix
run: |
templates=$(ls llama_stack/templates/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
echo "templates=$templates" >> "$GITHUB_OUTPUT"

build:
needs: generate-matrix
runs-on: ubuntu-latest
strategy:
matrix:
template: ${{ fromJson(needs.generate-matrix.outputs.templates) }}
image-type: [venv, container]
fail-fast: false # We want to run all jobs even if some fail

steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.10'

- name: Install uv
uses: astral-sh/setup-uv@v5
with:
python-version: "3.10"

- name: Install LlamaStack
run: |
uv venv
source .venv/bin/activate
uv pip install -e .

- name: Print build dependencies
run: |
uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test --print-deps-only

- name: Run Llama Stack Build
run: |
# USE_COPY_NOT_MOUNT is set to true since mounting is not supported by docker buildx, we use COPY instead
# LLAMA_STACK_DIR is set to the current directory so we are building from the source
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test

- name: Print dependencies in the image
if: matrix.image-type == 'venv'
run: |
source test/bin/activate
uv pip list
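For reference, the "Generate Template List" step in the workflow above builds the job matrix by listing template directories and emitting them as a JSON array. The sketch below shows that same pipeline run locally; the names in the sample output are illustrative assumptions, not the real template list.

```sh
# Illustrative: emulate the generate-matrix step from the workflow above.
# awk keeps the template directory name for every */build.yaml match, and
# jq turns the raw lines into a compact JSON array for the build matrix.
templates=$(ls llama_stack/templates/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
echo "templates=$templates"
# e.g. templates=["ollama","starter"]   (example names only)
```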
@@ -20,6 +20,6 @@ jobs:
 runs-on: ubuntu-latest
 steps:
 - name: Check PR Title's semantic conformance
-uses: amannn/action-semantic-pull-request@0723387faaf9b38adef4775cd42cfd5155ed6017 # v5.5.3
+uses: amannn/action-semantic-pull-request@v5
 env:
 GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -22,7 +22,7 @@ jobs:
 runs-on: ubuntu-latest
 steps:
 - name: Stale Action
-uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
+uses: actions/stale@v9
 with:
 stale-issue-label: 'stale'
 stale-issue-message: >
@@ -20,7 +20,7 @@ jobs:
 matrix:
 provider: [fireworks, together]
 steps:
-- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+- uses: actions/checkout@v4
 with:
 ref: ${{ github.event.inputs.commit_sha }}
 
@@ -6,6 +6,7 @@ on:
 pull_request:
 branches: [ main ]
 paths:
+- 'distributions/**'
 - 'llama_stack/**'
 - 'tests/unit/**'
 - 'uv.lock'
@@ -30,11 +31,17 @@
 - "3.12"
 - "3.13"
 steps:
-- name: Checkout repository
-uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+- uses: actions/checkout@v4
 
-- name: Install dependencies
-uses: ./.github/actions/setup-runner
+- name: Set up Python ${{ matrix.python }}
+uses: actions/setup-python@v5
+with:
+python-version: ${{ matrix.python }}
+
+- uses: astral-sh/setup-uv@v5
+with:
+python-version: ${{ matrix.python }}
+enable-cache: false
 
 - name: Run unit tests
 run: |
@@ -42,7 +49,7 @@
 
 - name: Upload test results
 if: always()
-uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+uses: actions/upload-artifact@v4
 with:
 name: test-results-${{ matrix.python }}
 path: |
@@ -14,8 +14,6 @@ on:
 - 'docs/**'
 - 'pyproject.toml'
 - '.github/workflows/update-readthedocs.yml'
-tags:
-- '*'
 pull_request:
 branches:
 - main
@@ -35,10 +33,18 @@
 TOKEN: ${{ secrets.READTHEDOCS_TOKEN }}
 steps:
 - name: Checkout repository
-uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+uses: actions/checkout@v4
 
-- name: Install dependencies
-uses: ./.github/actions/setup-runner
+- name: Set up Python
+uses: actions/setup-python@v5
+with:
+python-version: '3.11'
+
+- name: Install the latest version of uv
+uses: astral-sh/setup-uv@v5
+
+- name: Sync with uv
+run: uv sync --extra docs
 
 - name: Build HTML
 run: |
@@ -55,10 +61,7 @@
 
 response=$(curl -X POST \
 -H "Content-Type: application/json" \
--d "{
-\"token\": \"$TOKEN\",
-\"version\": \"$GITHUB_REF_NAME\"
-}" \
+-d "{\"token\": \"$TOKEN\"}" \
 https://readthedocs.org/api/v2/webhook/llama-stack/289768/)
 
 echo "Response: $response"
26 .github/workflows_upstream/install-script-ci.yml (vendored)
@@ -1,26 +0,0 @@ (file removed)
name: Installer CI

on:
pull_request:
paths:
- 'install.sh'
push:
paths:
- 'install.sh'
schedule:
- cron: '0 2 * * *' # every day at 02:00 UTC

jobs:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
- name: Run ShellCheck on install.sh
run: shellcheck install.sh
smoke-test:
needs: lint
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
- name: Run installer end-to-end
run: ./install.sh
@@ -1,132 +0,0 @@ (file removed)
name: Integration Auth Tests

on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
paths:
- 'distributions/**'
- 'llama_stack/**'
- 'tests/integration/**'
- 'uv.lock'
- 'pyproject.toml'
- 'requirements.txt'
- '.github/workflows/integration-auth-tests.yml' # This workflow

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
test-matrix:
runs-on: ubuntu-latest
strategy:
matrix:
auth-provider: [oauth2_token]
fail-fast: false # we want to run all tests regardless of failure

steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

- name: Install dependencies
uses: ./.github/actions/setup-runner

- name: Build Llama Stack
run: |
llama stack build --template ollama --image-type venv

- name: Install minikube
if: ${{ matrix.auth-provider == 'kubernetes' }}
uses: medyagh/setup-minikube@cea33675329b799adccc9526aa5daccc26cd5052 # v0.0.19

- name: Start minikube
if: ${{ matrix.auth-provider == 'oauth2_token' }}
run: |
minikube start
kubectl get pods -A

- name: Configure Kube Auth
if: ${{ matrix.auth-provider == 'oauth2_token' }}
run: |
kubectl create namespace llama-stack
kubectl create serviceaccount llama-stack-auth -n llama-stack
kubectl create rolebinding llama-stack-auth-rolebinding --clusterrole=admin --serviceaccount=llama-stack:llama-stack-auth -n llama-stack
kubectl create token llama-stack-auth -n llama-stack > llama-stack-auth-token
cat <<EOF | kubectl apply -f -
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: allow-anonymous-openid
rules:
- nonResourceURLs: ["/openid/v1/jwks"]
verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: allow-anonymous-openid
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: allow-anonymous-openid
subjects:
- kind: User
name: system:anonymous
apiGroup: rbac.authorization.k8s.io
EOF

- name: Set Kubernetes Config
if: ${{ matrix.auth-provider == 'oauth2_token' }}
run: |
echo "KUBERNETES_API_SERVER_URL=$(kubectl get --raw /.well-known/openid-configuration| jq -r .jwks_uri)" >> $GITHUB_ENV
echo "KUBERNETES_CA_CERT_PATH=$(kubectl config view --minify -o jsonpath='{.clusters[0].cluster.certificate-authority}')" >> $GITHUB_ENV
echo "KUBERNETES_ISSUER=$(kubectl get --raw /.well-known/openid-configuration| jq -r .issuer)" >> $GITHUB_ENV
echo "KUBERNETES_AUDIENCE=$(kubectl create token llama-stack-auth -n llama-stack --duration=1h | cut -d. -f2 | base64 -d | jq -r '.aud[0]')" >> $GITHUB_ENV

- name: Set Kube Auth Config and run server
env:
INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
if: ${{ matrix.auth-provider == 'oauth2_token' }}
run: |
run_dir=$(mktemp -d)
cat <<'EOF' > $run_dir/run.yaml
version: '2'
image_name: kube
apis: []
providers: {}
server:
port: 8321
EOF
yq eval '.server.auth = {"provider_type": "${{ matrix.auth-provider }}"}' -i $run_dir/run.yaml
yq eval '.server.auth.config = {"tls_cafile": "${{ env.KUBERNETES_CA_CERT_PATH }}", "issuer": "${{ env.KUBERNETES_ISSUER }}", "audience": "${{ env.KUBERNETES_AUDIENCE }}"}' -i $run_dir/run.yaml
yq eval '.server.auth.config.jwks = {"uri": "${{ env.KUBERNETES_API_SERVER_URL }}"}' -i $run_dir/run.yaml
cat $run_dir/run.yaml

nohup uv run llama stack run $run_dir/run.yaml --image-type venv > server.log 2>&1 &

- name: Wait for Llama Stack server to be ready
run: |
echo "Waiting for Llama Stack server..."
for i in {1..30}; do
if curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://localhost:8321/v1/health | grep -q "OK"; then
echo "Llama Stack server is up!"
if grep -q "Enabling authentication with provider: ${{ matrix.auth-provider }}" server.log; then
echo "Llama Stack server is configured to use ${{ matrix.auth-provider }} auth"
exit 0
else
echo "Llama Stack server is not configured to use ${{ matrix.auth-provider }} auth"
cat server.log
exit 1
fi
fi
sleep 1
done
echo "Llama Stack server failed to start"
cat server.log
exit 1

- name: Test auth
run: |
curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers|jq
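The "Set Kubernetes Config" step in the removed workflow above derives the token audience by decoding the JWT payload inline. The sketch below isolates that idea; it is illustrative only, assumes kubectl access to the same cluster, and mirrors the workflow's own one-liner (strict decoders may additionally need base64url padding restored).

```sh
# Illustrative: a Kubernetes service-account token is a JWT, so its second
# dot-separated field is the base64url-encoded payload; the .aud claim lists
# the intended audiences (claim names per RFC 7519).
token=$(kubectl create token llama-stack-auth -n llama-stack --duration=1h)
payload=$(echo "$token" | cut -d. -f2)
echo "$payload" | base64 -d 2>/dev/null | jq -r '.aud[0]'
```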
116 .github/workflows_upstream/integration-tests.yml (vendored)
@@ -1,116 +0,0 @@ (file removed)
name: Integration Tests

on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
paths:
- 'llama_stack/**'
- 'tests/integration/**'
- 'uv.lock'
- 'pyproject.toml'
- 'requirements.txt'
- '.github/workflows/integration-tests.yml' # This workflow

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
test-matrix:
runs-on: ubuntu-latest
strategy:
matrix:
# Listing tests manually since some of them currently fail
# TODO: generate matrix list from tests/integration when fixed
test-type: [agents, inference, datasets, inspect, scoring, post_training, providers, tool_runtime]
client-type: [library, http]
fail-fast: false # we want to run all tests regardless of failure

steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

- name: Install dependencies
uses: ./.github/actions/setup-runner

- name: Setup ollama
uses: ./.github/actions/setup-ollama

- name: Build Llama Stack
run: |
llama stack build --template ollama --image-type venv

- name: Start Llama Stack server in background
if: matrix.client-type == 'http'
env:
INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
run: |
LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv &

- name: Wait for Llama Stack server to be ready
if: matrix.client-type == 'http'
run: |
echo "Waiting for Llama Stack server..."
for i in {1..30}; do
if curl -s http://localhost:8321/v1/health | grep -q "OK"; then
echo "Llama Stack server is up!"
exit 0
fi
sleep 1
done
echo "Llama Stack server failed to start"
cat server.log
exit 1

- name: Verify Ollama status is OK
if: matrix.client-type == 'http'
run: |
echo "Verifying Ollama status..."
ollama_status=$(curl -s -L http://127.0.0.1:8321/v1/providers/ollama|jq --raw-output .health.status)
echo "Ollama status: $ollama_status"
if [ "$ollama_status" != "OK" ]; then
echo "Ollama health check failed"
exit 1
fi

- name: Check Storage and Memory Available Before Tests
if: ${{ always() }}
run: |
free -h
df -h

- name: Run Integration Tests
env:
INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
run: |
if [ "${{ matrix.client-type }}" == "library" ]; then
stack_config="ollama"
else
stack_config="http://localhost:8321"
fi
uv run pytest -s -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \
-k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \
--text-model="meta-llama/Llama-3.2-3B-Instruct" \
--embedding-model=all-MiniLM-L6-v2

- name: Check Storage and Memory Available After Tests
if: ${{ always() }}
run: |
free -h
df -h

- name: Write ollama logs to file
if: ${{ always() }}
run: |
sudo journalctl -u ollama.service > ollama.log

- name: Upload all logs to artifacts
if: ${{ always() }}
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}
path: |
*.log
retention-days: 1
45 .github/workflows_upstream/pre-commit.yml (vendored)
@@ -1,45 +0,0 @@ (file removed)
name: Pre-commit

on:
pull_request:
push:
branches: [main]

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
pre-commit:
runs-on: ubuntu-latest

steps:
- name: Checkout code
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

- name: Set up Python
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.11'
cache: pip
cache-dependency-path: |
**/requirements*.txt
.pre-commit-config.yaml

- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
env:
SKIP: no-commit-to-branch
RUFF_OUTPUT_FORMAT: github

- name: Verify if there are any diff files after pre-commit
run: |
git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1)

- name: Verify if there are any new files after pre-commit
run: |
unstaged_files=$(git ls-files --others --exclude-standard)
if [ -n "$unstaged_files" ]; then
echo "There are uncommitted new files, run pre-commit locally and commit again"
echo "$unstaged_files"
exit 1
fi
147 .github/workflows_upstream/providers-build.yml (vendored)
@@ -1,147 +0,0 @@ (file removed)
name: Test Llama Stack Build

on:
push:
branches:
- main
paths:
- 'llama_stack/cli/stack/build.py'
- 'llama_stack/cli/stack/_build.py'
- 'llama_stack/distribution/build.*'
- 'llama_stack/distribution/*.sh'
- '.github/workflows/providers-build.yml'
pull_request:
paths:
- 'llama_stack/cli/stack/build.py'
- 'llama_stack/cli/stack/_build.py'
- 'llama_stack/distribution/build.*'
- 'llama_stack/distribution/*.sh'
- '.github/workflows/providers-build.yml'

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
generate-matrix:
runs-on: ubuntu-latest
outputs:
templates: ${{ steps.set-matrix.outputs.templates }}
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

- name: Generate Template List
id: set-matrix
run: |
templates=$(ls llama_stack/templates/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
echo "templates=$templates" >> "$GITHUB_OUTPUT"

build:
needs: generate-matrix
runs-on: ubuntu-latest
strategy:
matrix:
template: ${{ fromJson(needs.generate-matrix.outputs.templates) }}
image-type: [venv, container]
fail-fast: false # We want to run all jobs even if some fail

steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

- name: Install dependencies
uses: ./.github/actions/setup-runner

- name: Print build dependencies
run: |
uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test --print-deps-only

- name: Run Llama Stack Build
run: |
# USE_COPY_NOT_MOUNT is set to true since mounting is not supported by docker buildx, we use COPY instead
# LLAMA_STACK_DIR is set to the current directory so we are building from the source
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test

- name: Print dependencies in the image
if: matrix.image-type == 'venv'
run: |
uv pip list

build-single-provider:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

- name: Install dependencies
uses: ./.github/actions/setup-runner

- name: Build a single provider
run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --image-type venv --image-name test --providers inference=remote::ollama

build-custom-container-distribution:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

- name: Install dependencies
uses: ./.github/actions/setup-runner

- name: Build a single provider
run: |
yq -i '.image_type = "container"' llama_stack/templates/starter/build.yaml
yq -i '.image_name = "test"' llama_stack/templates/starter/build.yaml
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config llama_stack/templates/starter/build.yaml

- name: Inspect the container image entrypoint
run: |
IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
echo "Entrypoint: $entrypoint"
if [ "$entrypoint" != "[python -m llama_stack.distribution.server.server --config /app/run.yaml]" ]; then
echo "Entrypoint is not correct"
exit 1
fi

build-ubi9-container-distribution:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

- name: Install dependencies
uses: ./.github/actions/setup-runner

- name: Pin template to UBI9 base
run: |
yq -i '
.image_type = "container" |
.image_name = "ubi9-test" |
.distribution_spec.container_image = "registry.access.redhat.com/ubi9:latest"
' llama_stack/templates/starter/build.yaml

- name: Build dev container (UBI9)
env:
USE_COPY_NOT_MOUNT: "true"
LLAMA_STACK_DIR: "."
run: |
uv run llama stack build --config llama_stack/templates/starter/build.yaml

- name: Inspect UBI9 image
run: |
IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
echo "Entrypoint: $entrypoint"
if [ "$entrypoint" != "[python -m llama_stack.distribution.server.server --config /app/run.yaml]" ]; then
echo "Entrypoint is not correct"
exit 1
fi

echo "Checking /etc/os-release in $IMAGE_ID"
docker run --rm --entrypoint sh "$IMAGE_ID" -c \
'source /etc/os-release && echo "$ID"' \
| grep -qE '^(rhel|ubi)$' \
|| { echo "Base image is not UBI 9!"; exit 1; }
@@ -1,71 +0,0 @@ (file removed)
name: Test External Providers

on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
paths:
- 'llama_stack/**'
- 'tests/integration/**'
- 'uv.lock'
- 'pyproject.toml'
- 'requirements.txt'
- '.github/workflows/test-external-providers.yml' # This workflow

jobs:
test-external-providers:
runs-on: ubuntu-latest
strategy:
matrix:
image-type: [venv]
# We don't do container yet, it's tricky to install a package from the host into the
# container and point 'uv pip install' to the correct path...
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

- name: Install dependencies
uses: ./.github/actions/setup-runner

- name: Apply image type to config file
run: |
yq -i '.image_type = "${{ matrix.image-type }}"' tests/external-provider/llama-stack-provider-ollama/custom-distro.yaml
cat tests/external-provider/llama-stack-provider-ollama/custom-distro.yaml

- name: Setup directory for Ollama custom provider
run: |
mkdir -p tests/external-provider/llama-stack-provider-ollama/src/
cp -a llama_stack/providers/remote/inference/ollama/ tests/external-provider/llama-stack-provider-ollama/src/llama_stack_provider_ollama

- name: Create provider configuration
run: |
mkdir -p /home/runner/.llama/providers.d/remote/inference
cp tests/external-provider/llama-stack-provider-ollama/custom_ollama.yaml /home/runner/.llama/providers.d/remote/inference/custom_ollama.yaml

- name: Build distro from config file
run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external-provider/llama-stack-provider-ollama/custom-distro.yaml

- name: Start Llama Stack server in background
if: ${{ matrix.image-type }} == 'venv'
env:
INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
run: |
uv run pip list
nohup uv run --active llama stack run tests/external-provider/llama-stack-provider-ollama/run.yaml --image-type ${{ matrix.image-type }} > server.log 2>&1 &

- name: Wait for Llama Stack server to be ready
run: |
for i in {1..30}; do
if ! grep -q "remote::custom_ollama from /home/runner/.llama/providers.d/remote/inference/custom_ollama.yaml" server.log; then
echo "Waiting for Llama Stack server to load the provider..."
sleep 1
else
echo "Provider loaded"
exit 0
fi
done
echo "Provider failed to load"
cat server.log
exit 1
2 .gitignore (vendored)
@@ -6,7 +6,6 @@ dev_requirements.txt
 build
 .DS_Store
 llama_stack/configs/*
-.cursor/
 xcuserdata/
 *.hmap
 .DS_Store
@@ -24,4 +23,3 @@ venv/
 pytest-report.xml
 .coverage
 .python-version
-data
@@ -15,18 +15,6 @@ repos:
 args: ['--maxkb=1000']
 - id: end-of-file-fixer
 exclude: '^(.*\.svg)$'
-- id: no-commit-to-branch
-- id: check-yaml
-args: ["--unsafe"]
-- id: detect-private-key
-- id: requirements-txt-fixer
-- id: mixed-line-ending
-args: [--fix=lf] # Forces to replace line ending by LF (line feed)
-- id: check-executables-have-shebangs
-- id: check-json
-- id: check-shebang-scripts-are-executable
-- id: check-symlinks
-- id: check-toml
 
 - repo: https://github.com/Lucas-C/pre-commit-hooks
 rev: v1.5.4
@@ -53,7 +41,7 @@
 - black==24.3.0
 
 - repo: https://github.com/astral-sh/uv-pre-commit
-rev: 0.7.8
+rev: 0.6.3
 hooks:
 - id: uv-lock
 - id: uv-export
@@ -61,7 +49,6 @@
 "--frozen",
 "--no-hashes",
 "--no-emit-project",
-"--no-default-groups",
 "--output-file=requirements.txt"
 ]
 
@@ -89,29 +76,24 @@
 - id: distro-codegen
 name: Distribution Template Codegen
 additional_dependencies:
-- uv==0.7.8
+- uv==0.6.0
-entry: uv run --group codegen ./scripts/distro_codegen.py
+entry: uv run --extra codegen ./scripts/distro_codegen.py
 language: python
 pass_filenames: false
 require_serial: true
 files: ^llama_stack/templates/.*$|^llama_stack/providers/.*/inference/.*/models\.py$
 
+- repo: local
+hooks:
 - id: openapi-codegen
 name: API Spec Codegen
 additional_dependencies:
-- uv==0.7.8
+- uv==0.6.2
-entry: sh -c 'uv run ./docs/openapi_generator/run_openapi_generator.sh > /dev/null'
+entry: sh -c 'uv run --with ".[dev]" ./docs/openapi_generator/run_openapi_generator.sh > /dev/null'
 language: python
 pass_filenames: false
 require_serial: true
 files: ^llama_stack/apis/|^docs/openapi_generator/
-- id: check-workflows-use-hashes
-name: Check GitHub Actions use SHA-pinned actions
-entry: ./scripts/check-workflows-use-hashes.sh
-language: system
-pass_filenames: false
-require_serial: true
-always_run: true
-files: ^\.github/workflows/.*\.ya?ml$
 
 ci:
 autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks
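The kvant side of the hunk above carries a local check-workflows-use-hashes hook whose entry point is ./scripts/check-workflows-use-hashes.sh. That script's body is not part of this diff; the sketch below is only an illustration of what such a check could look like, flagging any `uses:` reference in workflow files that is not pinned to a full 40-character commit SHA.

```sh
#!/usr/bin/env bash
# Hypothetical sketch of a "check workflows use SHA-pinned actions" hook; the
# real scripts/check-workflows-use-hashes.sh referenced above may differ.
set -u
bad=$(grep -rhoE 'uses:[[:space:]]*[^[:space:]]+@[^[:space:]]+' .github/workflows/ 2>/dev/null \
      | grep -vE '@[0-9a-f]{40}([[:space:]]|$)' || true)
if [ -n "$bad" ]; then
  echo "Actions not pinned to a full commit SHA:"
  echo "$bad"
  exit 1
fi
```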
@@ -5,21 +5,28 @@
 # Required
 version: 2
 
-# Build documentation in the "docs/" directory with Sphinx
-sphinx:
-configuration: docs/source/conf.py
-
 # Set the OS, Python version and other tools you might need
 build:
 os: ubuntu-22.04
 tools:
 python: "3.12"
-jobs:
+# You can also specify other tool versions:
-pre_create_environment:
+# nodejs: "19"
-- asdf plugin add uv
+# rust: "1.64"
-- asdf install uv latest
+# golang: "1.19"
-- asdf global uv latest
+
-create_environment:
+# Build documentation in the "docs/" directory with Sphinx
-- uv venv "${READTHEDOCS_VIRTUALENV_PATH}"
+sphinx:
-install:
+configuration: docs/source/conf.py
-- UV_PROJECT_ENVIRONMENT="${READTHEDOCS_VIRTUALENV_PATH}" uv sync --frozen --group docs
+
+# Optionally build your docs in additional formats such as PDF and ePub
+# formats:
+# - pdf
+# - epub
+
+# Optional but recommended, declare the Python requirements required
+# to build your documentation
+# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
+python:
+install:
+- requirements: docs/requirements.txt
635 CHANGELOG.md
@@ -1,148 +1,41 @@
 # Changelog
 
-# v0.2.7
-Published on: 2025-05-16T20:38:10Z
-
-## Highlights
-
-This is a small update. But a couple highlights:
-
-* feat: function tools in OpenAI Responses by @bbrowning in https://github.com/meta-llama/llama-stack/pull/2094, getting closer to ready. Streaming is the next missing piece.
-* feat: Adding support for customizing chunk context in RAG insertion and querying by @franciscojavierarceo in https://github.com/meta-llama/llama-stack/pull/2134
-* feat: scaffolding for Llama Stack UI by @ehhuang in https://github.com/meta-llama/llama-stack/pull/2149, more to come in the coming releases.
-
----
-
-# v0.2.6
-Published on: 2025-05-12T18:06:52Z
-
----
-
-# v0.2.5
-Published on: 2025-05-04T20:16:49Z
-
----
-
-# v0.2.4
-Published on: 2025-04-29T17:26:01Z
-
-## Highlights
-
-* One-liner to install and run Llama Stack yay! by @reluctantfuturist in https://github.com/meta-llama/llama-stack/pull/1383
-* support for NVIDIA NeMo datastore by @raspawar in https://github.com/meta-llama/llama-stack/pull/1852
-* (yuge!) Kubernetes authentication by @leseb in https://github.com/meta-llama/llama-stack/pull/1778
-* (yuge!) OpenAI Responses API by @bbrowning in https://github.com/meta-llama/llama-stack/pull/1989
-* add api.llama provider, llama-guard-4 model by @ashwinb in https://github.com/meta-llama/llama-stack/pull/2058
-
----
-
-# v0.2.3
-Published on: 2025-04-25T22:46:21Z
-
-## Highlights
-
-* OpenAI compatible inference endpoints and client-SDK support. `client.chat.completions.create()` now works.
-* significant improvements and functionality added to the nVIDIA distribution
-* many improvements to the test verification suite.
-* new inference providers: Ramalama, IBM WatsonX
-* many improvements to the Playground UI
-
----
-
-# v0.2.2
-Published on: 2025-04-13T01:19:49Z
-
-## Main changes
-
-- Bring Your Own Provider (@leseb) - use out-of-tree provider code to execute the distribution server
-- OpenAI compatible inference API in progress (@bbrowning)
-- Provider verifications (@ehhuang)
-- Many updates and fixes to playground
-- Several llama4 related fixes
-
----
-
-# v0.2.1
-Published on: 2025-04-05T23:13:00Z
-
----
-
-# v0.2.0
-Published on: 2025-04-05T19:04:29Z
-
-## Llama 4 Support
-
-Checkout more at https://www.llama.com
-
----
-
-# v0.1.9
-Published on: 2025-03-29T00:52:23Z
-
-### Build and Test Agents
-* Agents: Entire document context with attachments
-* RAG: Documentation with sqlite-vec faiss comparison
-* Getting started: Fixes to getting started notebook.
-
-### Agent Evals and Model Customization
-* (**New**) Post-training: Add nemo customizer
-
-### Better Engineering
-* Moved sqlite-vec to non-blocking calls
-* Don't return a payload on file delete
-
----
-
 # v0.1.8
 Published on: 2025-03-24T01:28:50Z
 
 # v0.1.8 Release Notes
 
 ### Build and Test Agents
 * Safety: Integrated NVIDIA as a safety provider.
 * VectorDB: Added Qdrant as an inline provider.
 * Agents: Added support for multiple tool groups in agents.
|
||||||
* Agents: Simplified imports for Agents in client package
|
* Agents: Simplified imports for Agents in client package
|
||||||
|
|
||||||
|
|
||||||
### Agent Evals and Model Customization
|
### Agent Evals and Model Customization
|
||||||
* Introduced DocVQA and IfEval benchmarks.
|
* Introduced DocVQA and IfEval benchmarks.
|
||||||
|
|
||||||
### Deploying and Monitoring Agents
|
### Deploying and Monitoring Agents
|
||||||
* Introduced a Containerfile and image workflow for the Playground.
|
* Introduced a Containerfile and image workflow for the Playground.
|
||||||
* Implemented support for Bearer (API Key) authentication.
|
* Implemented support for Bearer (API Key) authentication.
|
||||||
* Added attribute-based access control for resources.
|
* Added attribute-based access control for resources.
|
||||||
* Fixes on docker deployments: use --pull always and standardized the default port to 8321
|
* Fixes on docker deployments: use --pull always and standardized the default port to 8321
|
||||||
* Deprecated: /v1/inspect/providers use /v1/providers/ instead
|
* Deprecated: /v1/inspect/providers use /v1/providers/ instead
|
||||||
|
|
||||||
### Better Engineering
|
### Better Engineering
|
||||||
* Consolidated scripts under the ./scripts directory.
|
* Consolidated scripts under the ./scripts directory.
|
||||||
* Addressed mypy violations in various modules.
|
* Addressed mypy violations in various modules.
|
||||||
* Added Dependabot scans for Python dependencies.
|
* Added Dependabot scans for Python dependencies.
|
||||||
* Implemented a scheduled workflow to update the changelog automatically.
|
* Implemented a scheduled workflow to update the changelog automatically.
|
||||||
* Enforced concurrency to reduce CI loads.
|
* Enforced concurrency to reduce CI loads.
|
||||||
|
|
||||||
|
|
||||||
### New Contributors
|
### New Contributors
|
||||||
* @cmodi-meta made their first contribution in https://github.com/meta-llama/llama-stack/pull/1650
|
* @cmodi-meta made their first contribution in https://github.com/meta-llama/llama-stack/pull/1650
|
||||||
* @jeffmaury made their first contribution in https://github.com/meta-llama/llama-stack/pull/1671
|
* @jeffmaury made their first contribution in https://github.com/meta-llama/llama-stack/pull/1671
|
||||||
* @derekhiggins made their first contribution in https://github.com/meta-llama/llama-stack/pull/1698
|
* @derekhiggins made their first contribution in https://github.com/meta-llama/llama-stack/pull/1698
|
||||||
* @Bobbins228 made their first contribution in https://github.com/meta-llama/llama-stack/pull/1745
|
* @Bobbins228 made their first contribution in https://github.com/meta-llama/llama-stack/pull/1745
|
||||||
|
|
||||||
**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.7...v0.1.8
|
**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.7...v0.1.8
|
||||||
|
|
||||||
---
|
---
|
||||||
|
@ -150,73 +43,73 @@ Published on: 2025-03-24T01:28:50Z
|
||||||
# v0.1.7
|
# v0.1.7
|
||||||
Published on: 2025-03-14T22:30:51Z
|
Published on: 2025-03-14T22:30:51Z
|
||||||
|
|
||||||
## 0.1.7 Release Notes
|
## 0.1.7 Release Notes
|
||||||
|
|
||||||
### Build and Test Agents
|
### Build and Test Agents
|
||||||
* Inference: ImageType is now refactored to LlamaStackImageType
|
* Inference: ImageType is now refactored to LlamaStackImageType
|
||||||
* Inference: Added tests to measure TTFT
|
* Inference: Added tests to measure TTFT
|
||||||
* Inference: Bring back usage metrics
|
* Inference: Bring back usage metrics
|
||||||
* Agents: Added endpoint for get agent, list agents and list sessions
|
* Agents: Added endpoint for get agent, list agents and list sessions
|
||||||
* Agents: Automated conversion of type hints in client tool for lite llm format
|
* Agents: Automated conversion of type hints in client tool for lite llm format
|
||||||
* Agents: Deprecated ToolResponseMessage in agent.resume API
|
* Agents: Deprecated ToolResponseMessage in agent.resume API
|
||||||
* Added Provider API for listing and inspecting provider info
|
* Added Provider API for listing and inspecting provider info
|
||||||
|
|
||||||
### Agent Evals and Model Customization
|
### Agent Evals and Model Customization
|
||||||
* Eval: Added new eval benchmarks Math 500 and BFCL v3
|
* Eval: Added new eval benchmarks Math 500 and BFCL v3
|
||||||
* Deploy and Monitoring of Agents
|
* Deploy and Monitoring of Agents
|
||||||
* Telemetry: Fix tracing to work across coroutines
|
* Telemetry: Fix tracing to work across coroutines
|
||||||
|
|
||||||
### Better Engineering
|
### Better Engineering
|
||||||
* Display code coverage for unit tests
|
* Display code coverage for unit tests
|
||||||
* Updated call sites (inference, tool calls, agents) to move to async non blocking calls
|
* Updated call sites (inference, tool calls, agents) to move to async non blocking calls
|
||||||
* Unit tests also run on Python 3.11, 3.12, and 3.13
|
* Unit tests also run on Python 3.11, 3.12, and 3.13
|
||||||
* Added ollama inference to Integration tests CI
|
* Added ollama inference to Integration tests CI
|
||||||
* Improved documentation across examples, testing, CLI, and the providers table
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
# v0.1.6
|
# v0.1.6
|
||||||
Published on: 2025-03-08T04:35:08Z
|
Published on: 2025-03-08T04:35:08Z
|
||||||
|
|
||||||
## 0.1.6 Release Notes
|
## 0.1.6 Release Notes
|
||||||
|
|
||||||
### Build and Test Agents
|
### Build and Test Agents
|
||||||
* Inference: Fixed support for inline vllm provider
|
* Inference: Fixed support for inline vllm provider
|
||||||
* (**New**) Agent: Build & Monitor Agent Workflows with Llama Stack + Anthropic's Best Practice [Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Agent_Workflows.ipynb)
|
* (**New**) Agent: Build & Monitor Agent Workflows with Llama Stack + Anthropic's Best Practice [Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Agent_Workflows.ipynb)
|
||||||
* (**New**) Agent: Revamped agent [documentation](https://llama-stack.readthedocs.io/en/latest/building_applications/agent.html) with more details and examples
|
* (**New**) Agent: Revamped agent [documentation](https://llama-stack.readthedocs.io/en/latest/building_applications/agent.html) with more details and examples
|
||||||
* Agent: Unify tools and Python SDK Agents API
|
* Agent: Unify tools and Python SDK Agents API
|
||||||
* Agent: AsyncAgent Python SDK wrapper supporting async client tool calls
|
* Agent: AsyncAgent Python SDK wrapper supporting async client tool calls
|
||||||
* Agent: Support python functions without @client_tool decorator as client tools
|
* Agent: Support python functions without @client_tool decorator as client tools
|
||||||
* Agent: deprecation for allow_resume_turn flag, and remove need to specify tool_prompt_format
|
* Agent: deprecation for allow_resume_turn flag, and remove need to specify tool_prompt_format
|
||||||
* VectorIO: MilvusDB support added
|
* VectorIO: MilvusDB support added
|
||||||
|
|
||||||
### Agent Evals and Model Customization
|
### Agent Evals and Model Customization
|
||||||
* (**New**) Agent: Llama Stack RAG Lifecycle [Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb)
|
* (**New**) Agent: Llama Stack RAG Lifecycle [Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb)
|
||||||
* Eval: Documentation for eval, scoring, adding new benchmarks
|
* Eval: Documentation for eval, scoring, adding new benchmarks
|
||||||
* Eval: Distribution template to run benchmarks on llama & non-llama models
|
* Eval: Distribution template to run benchmarks on llama & non-llama models
|
||||||
* Eval: Ability to register new custom LLM-as-judge scoring functions
|
* Eval: Ability to register new custom LLM-as-judge scoring functions
|
||||||
* (**New**) Looking for contributors for open benchmarks. See [documentation](https://llama-stack.readthedocs.io/en/latest/references/evals_reference/index.html#open-benchmark-contributing-guide) for details.
|
* (**New**) Looking for contributors for open benchmarks. See [documentation](https://llama-stack.readthedocs.io/en/latest/references/evals_reference/index.html#open-benchmark-contributing-guide) for details.
|
||||||
|
|
||||||
### Deploy and Monitoring of Agents
|
### Deploy and Monitoring of Agents
|
||||||
* Better support for different log levels across all components for better monitoring
|
* Better support for different log levels across all components for better monitoring
|
||||||
|
|
||||||
### Better Engineering
|
### Better Engineering
|
||||||
* Enhance OpenAPI spec to include Error types across all APIs
|
* Enhance OpenAPI spec to include Error types across all APIs
|
||||||
* Moved all tests to /tests and created unit tests to run on each PR
|
* Moved all tests to /tests and created unit tests to run on each PR
|
||||||
* Removed all dependencies on llama-models repo
|
* Removed all dependencies on llama-models repo
|
||||||
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
# v0.1.5.1
|
# v0.1.5.1
|
||||||
Published on: 2025-02-28T22:37:44Z
|
Published on: 2025-02-28T22:37:44Z
|
||||||
|
|
||||||
## 0.1.5.1 Release Notes
|
## 0.1.5.1 Release Notes
|
||||||
* Fixes for security risk in https://github.com/meta-llama/llama-stack/pull/1327 and https://github.com/meta-llama/llama-stack/pull/1328
|
* Fixes for security risk in https://github.com/meta-llama/llama-stack/pull/1327 and https://github.com/meta-llama/llama-stack/pull/1328
|
||||||
|
|
||||||
**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.5...v0.1.5.1
|
**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.5...v0.1.5.1
|
||||||
|
|
||||||
---
|
---
|
||||||
|
@ -224,176 +117,176 @@ Published on: 2025-02-28T22:37:44Z
|
||||||
# v0.1.5
|
# v0.1.5
|
||||||
Published on: 2025-02-28T18:14:01Z
|
Published on: 2025-02-28T18:14:01Z
|
||||||
|
|
||||||
## 0.1.5 Release Notes
|
## 0.1.5 Release Notes
|
||||||
### Build Agents
|
### Build Agents
|
||||||
* Inference: Support more non-llama models (openai, anthropic, gemini)
|
* Inference: Support more non-llama models (openai, anthropic, gemini)
|
||||||
* Inference: Can use the provider's model name in addition to the HF alias
|
* Inference: Can use the provider's model name in addition to the HF alias
|
||||||
* Inference: Fixed issues with calling tools that weren't specified in the prompt
|
* Inference: Fixed issues with calling tools that weren't specified in the prompt
|
||||||
* RAG: Improved system prompt for RAG and no more need for hard-coded rag-tool calling
|
* RAG: Improved system prompt for RAG and no more need for hard-coded rag-tool calling
|
||||||
* Embeddings: Added support for Nemo retriever embedding models
|
* Embeddings: Added support for Nemo retriever embedding models
|
||||||
* Tools: Added support for MCP tools in Ollama Distribution
|
* Tools: Added support for MCP tools in Ollama Distribution
|
||||||
* Distributions: Added new Groq distribution
|
* Distributions: Added new Groq distribution
|
||||||
|
|
||||||
### Customize Models
|
### Customize Models
|
||||||
* Save post-trained checkpoint in SafeTensor format to allow Ollama inference provider to use the post-trained model
|
* Save post-trained checkpoint in SafeTensor format to allow Ollama inference provider to use the post-trained model
|
||||||
|
|
||||||
### Monitor agents
|
### Monitor agents
|
||||||
* More comprehensive logging of agent steps including client tools
|
* More comprehensive logging of agent steps including client tools
|
||||||
* Telemetry inputs/outputs are now structured and queryable
|
* Telemetry inputs/outputs are now structured and queryable
|
||||||
* Ability to retrieve agents session, turn, step by ids
|
* Ability to retrieve agents session, turn, step by ids
|
||||||
|
|
||||||
### Better Engineering
|
### Better Engineering
|
||||||
* Moved executorch Swift code out of this repo into the llama-stack-client-swift repo, similar to kotlin
|
* Moved executorch Swift code out of this repo into the llama-stack-client-swift repo, similar to kotlin
|
||||||
* Move most logging to use logger instead of prints
|
* Move most logging to use logger instead of prints
|
||||||
* Completed text /chat-completion and /completion tests
|
* Completed text /chat-completion and /completion tests
|
||||||
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
# v0.1.4
|
# v0.1.4
|
||||||
Published on: 2025-02-25T00:02:43Z
|
Published on: 2025-02-25T00:02:43Z
|
||||||
|
|
||||||
## v0.1.4 Release Notes
|
## v0.1.4 Release Notes
|
||||||
Here are the key changes coming as part of this release:
|
Here are the key changes coming as part of this release:
|
||||||
|
|
||||||
### Build and Test Agents
|
### Build and Test Agents
|
||||||
* Inference: Added support for non-llama models
|
* Inference: Added support for non-llama models
|
||||||
* Inference: Added option to list all downloaded models and remove models
|
* Inference: Added option to list all downloaded models and remove models
|
||||||
* Agent: Introduce new api agents.resume_turn to include client side tool execution in the same turn
|
* Agent: Introduce new api agents.resume_turn to include client side tool execution in the same turn
|
||||||
* Agent: AgentConfig introduces new variable “tool_config” that allows for better tool configuration and system prompt overrides
|
* Agent: AgentConfig introduces new variable “tool_config” that allows for better tool configuration and system prompt overrides
|
||||||
* Agent: Added logging for agent step start and completion times
|
* Agent: Added logging for agent step start and completion times
|
||||||
* Agent: Added support for logging for tool execution metadata
|
* Agent: Added support for logging for tool execution metadata
|
||||||
* Embedding: Updated /inference/embeddings to support asymmetric models, truncation and variable sized outputs
|
* Embedding: Updated /inference/embeddings to support asymmetric models, truncation and variable sized outputs
|
||||||
* Embedding: Updated embedding models for Ollama, Together, and Fireworks with available defaults
|
* Embedding: Updated embedding models for Ollama, Together, and Fireworks with available defaults
|
||||||
* VectorIO: Improved performance of sqlite-vec using chunked writes
|
* VectorIO: Improved performance of sqlite-vec using chunked writes
|
||||||
### Agent Evals and Model Customization
|
### Agent Evals and Model Customization
|
||||||
* Deprecated api /eval-tasks. Use /eval/benchmark instead
|
* Deprecated api /eval-tasks. Use /eval/benchmark instead
|
||||||
* Added CPU training support for TorchTune
|
* Added CPU training support for TorchTune
|
||||||
### Deploy and Monitoring of Agents
|
### Deploy and Monitoring of Agents
|
||||||
* Consistent view of client and server tool calls in telemetry
|
* Consistent view of client and server tool calls in telemetry
|
||||||
### Better Engineering
|
### Better Engineering
|
||||||
* Made tests more data-driven for consistent evaluation
|
* Made tests more data-driven for consistent evaluation
|
||||||
* Fixed documentation links and improved API reference generation
|
* Fixed documentation links and improved API reference generation
|
||||||
* Various small fixes for build scripts and system reliability
|
* Various small fixes for build scripts and system reliability
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
# v0.1.3
|
# v0.1.3
|
||||||
Published on: 2025-02-14T20:24:32Z
|
Published on: 2025-02-14T20:24:32Z
|
||||||
|
|
||||||
## v0.1.3 Release
|
## v0.1.3 Release
|
||||||
|
|
||||||
Here are some key changes that are coming as part of this release.
|
Here are some key changes that are coming as part of this release.
|
||||||
|
|
||||||
### Build and Test Agents
|
### Build and Test Agents
|
||||||
Streamlined the initial development experience
|
Streamlined the initial development experience
|
||||||
- Added support for llama stack run --image-type venv
|
- Added support for llama stack run --image-type venv
|
||||||
- Enhanced vector store options with new sqlite-vec provider and improved Qdrant integration
|
- Enhanced vector store options with new sqlite-vec provider and improved Qdrant integration
|
||||||
- vLLM improvements for tool calling and logprobs
|
- vLLM improvements for tool calling and logprobs
|
||||||
- Better handling of sporadic code_interpreter tool calls
|
- Better handling of sporadic code_interpreter tool calls
|
||||||
|
|
||||||
### Agent Evals
|
### Agent Evals
|
||||||
Better benchmarking and Agent performance assessment
|
Better benchmarking and Agent performance assessment
|
||||||
- Renamed eval API /eval-task to /benchmarks
|
- Renamed eval API /eval-task to /benchmarks
|
||||||
- Improved documentation and notebooks for RAG and evals
|
- Improved documentation and notebooks for RAG and evals
|
||||||
|
|
||||||
### Deploy and Monitoring of Agents
|
### Deploy and Monitoring of Agents
|
||||||
Improved production readiness
|
Improved production readiness
|
||||||
- Added usage metrics collection for chat completions
|
- Added usage metrics collection for chat completions
|
||||||
- CLI improvements for provider information
|
- CLI improvements for provider information
|
||||||
- Improved error handling and system reliability
|
- Improved error handling and system reliability
|
||||||
- Better model endpoint handling and accessibility
|
- Better model endpoint handling and accessibility
|
||||||
- Improved signal handling on distro server
|
- Improved signal handling on distro server
|
||||||
|
|
||||||
### Better Engineering
|
### Better Engineering
|
||||||
Infrastructure and code quality improvements
|
Infrastructure and code quality improvements
|
||||||
- Faster text-based chat completion tests
|
- Faster text-based chat completion tests
|
||||||
- Improved testing for non-streaming agent apis
|
- Improved testing for non-streaming agent apis
|
||||||
- Standardized import formatting with ruff linter
|
- Standardized import formatting with ruff linter
|
||||||
- Added conventional commits standard
|
- Added conventional commits standard
|
||||||
- Fixed documentation parsing issues
|
- Fixed documentation parsing issues
|
||||||
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
# v0.1.2
|
# v0.1.2
|
||||||
Published on: 2025-02-07T22:06:49Z
|
Published on: 2025-02-07T22:06:49Z
|
||||||
|
|
||||||
# TL;DR
|
# TL;DR
|
||||||
- Several stabilizations to development flows after the switch to `uv`
|
- Several stabilizations to development flows after the switch to `uv`
|
||||||
- Migrated CI workflows to new OSS repo - [llama-stack-ops](https://github.com/meta-llama/llama-stack-ops)
|
- Migrated CI workflows to new OSS repo - [llama-stack-ops](https://github.com/meta-llama/llama-stack-ops)
|
||||||
- Added automated rebuilds for ReadTheDocs
|
- Added automated rebuilds for ReadTheDocs
|
||||||
- Llama Stack server supports HTTPS
|
- Llama Stack server supports HTTPS
|
||||||
- Added system prompt overrides support
|
- Added system prompt overrides support
|
||||||
- Several bug fixes and improvements to documentation (check out Kubernetes deployment guide by @terrytangyuan )
|
- Several bug fixes and improvements to documentation (check out Kubernetes deployment guide by @terrytangyuan )
|
||||||
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
# v0.1.1
|
# v0.1.1
|
||||||
Published on: 2025-02-02T02:29:24Z
|
Published on: 2025-02-02T02:29:24Z
|
||||||
|
|
||||||
A bunch of small / big improvements everywhere including support for Windows, switching to `uv` and many provider improvements.
|
A bunch of small / big improvements everywhere including support for Windows, switching to `uv` and many provider improvements.
|
||||||
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
# v0.1.0
|
# v0.1.0
|
||||||
Published on: 2025-01-24T17:47:47Z
|
Published on: 2025-01-24T17:47:47Z
|
||||||
|
|
||||||
We are excited to announce a stable API release of Llama Stack, which enables developers to build RAG applications and Agents using tools and safety shields, monitor those agents with telemetry, and evaluate them with scoring functions.
|
||||||
|
|
||||||
## Context
|
## Context
|
||||||
GenAI application developers need more than just an LLM - they need to integrate tools, connect with their data sources, establish guardrails, and ground the LLM responses effectively. Currently, developers must piece together various tools and APIs, complicating the development lifecycle and increasing costs. The result is that developers are spending more time on these integrations rather than focusing on the application logic itself. The bespoke coupling of components also makes it challenging to adopt state-of-the-art solutions in the rapidly evolving GenAI space. This is particularly difficult for open models like Llama, as best practices are not widely established in the open.
|
GenAI application developers need more than just an LLM - they need to integrate tools, connect with their data sources, establish guardrails, and ground the LLM responses effectively. Currently, developers must piece together various tools and APIs, complicating the development lifecycle and increasing costs. The result is that developers are spending more time on these integrations rather than focusing on the application logic itself. The bespoke coupling of components also makes it challenging to adopt state-of-the-art solutions in the rapidly evolving GenAI space. This is particularly difficult for open models like Llama, as best practices are not widely established in the open.
|
||||||
|
|
||||||
Llama Stack was created to provide developers with a comprehensive and coherent interface that simplifies AI application development and codifies best practices across the Llama ecosystem. Since our launch in September 2024, we have seen a huge uptick in interest in Llama Stack APIs by both AI developers and from partners building AI services with Llama models. Partners like Nvidia, Fireworks, and Ollama have collaborated with us to develop implementations across various APIs, including inference, memory, and safety.
|
Llama Stack was created to provide developers with a comprehensive and coherent interface that simplifies AI application development and codifies best practices across the Llama ecosystem. Since our launch in September 2024, we have seen a huge uptick in interest in Llama Stack APIs by both AI developers and from partners building AI services with Llama models. Partners like Nvidia, Fireworks, and Ollama have collaborated with us to develop implementations across various APIs, including inference, memory, and safety.
|
||||||
|
|
||||||
With Llama Stack, you can easily build a RAG agent which can also search the web, do complex math, and call custom tools. You can use telemetry to inspect those traces and convert telemetry into evals datasets. And with Llama Stack’s plugin architecture and prepackaged distributions, you can choose to run your agent anywhere - in the cloud with our partners, in your own environment using virtualenv, conda, or Docker, locally with Ollama, or even on mobile devices with our SDKs. Llama Stack offers unprecedented flexibility while also simplifying the developer experience.
|
||||||
|
|
||||||
## Release
|
## Release
|
||||||
After iterating on the APIs for the last 3 months, today we’re launching a stable release (V1) of the Llama Stack APIs and the corresponding llama-stack server and client packages (v0.1.0). We now have automated tests for providers. These tests make sure that all provider implementations are verified. Developers can now easily and reliably select distributions or providers based on their specific requirements.
|
||||||
|
|
||||||
There are example standalone apps in llama-stack-apps.
|
There are example standalone apps in llama-stack-apps.
|
||||||
|
|
||||||
|
|
||||||
## Key Features of this release
|
## Key Features of this release
|
||||||
|
|
||||||
- **Unified API Layer**
|
- **Unified API Layer**
|
||||||
- Inference: Run LLM models
|
- Inference: Run LLM models
|
||||||
- RAG: Store and retrieve knowledge for RAG
|
- RAG: Store and retrieve knowledge for RAG
|
||||||
- Agents: Build multi-step agentic workflows
|
- Agents: Build multi-step agentic workflows
|
||||||
- Tools: Register tools that can be called by the agent
|
- Tools: Register tools that can be called by the agent
|
||||||
- Safety: Apply content filtering and safety policies
|
- Safety: Apply content filtering and safety policies
|
||||||
- Evaluation: Test model and agent quality
|
- Evaluation: Test model and agent quality
|
||||||
- Telemetry: Collect and analyze usage data and complex agentic traces
|
- Telemetry: Collect and analyze usage data and complex agentic traces
|
||||||
- Post Training ( Coming Soon ): Fine tune models for specific use cases
|
- Post Training ( Coming Soon ): Fine tune models for specific use cases
|
||||||
|
|
||||||
- **Rich Provider Ecosystem**
|
- **Rich Provider Ecosystem**
|
||||||
- Local Development: Meta's Reference, Ollama
|
- Local Development: Meta's Reference, Ollama
|
||||||
- Cloud: Fireworks, Together, Nvidia, AWS Bedrock, Groq, Cerebras
|
- Cloud: Fireworks, Together, Nvidia, AWS Bedrock, Groq, Cerebras
|
||||||
- On-premises: Nvidia NIM, vLLM, TGI, Dell-TGI
|
- On-premises: Nvidia NIM, vLLM, TGI, Dell-TGI
|
||||||
- On-device: iOS and Android support
|
- On-device: iOS and Android support
|
||||||
|
|
||||||
- **Built for Production**
|
- **Built for Production**
|
||||||
- Pre-packaged distributions for common deployment scenarios
|
- Pre-packaged distributions for common deployment scenarios
|
||||||
- Backwards compatibility across model versions
|
- Backwards compatibility across model versions
|
||||||
- Comprehensive evaluation capabilities
|
- Comprehensive evaluation capabilities
|
||||||
- Full observability and monitoring
|
- Full observability and monitoring
|
||||||
|
|
||||||
- **Multiple developer interfaces**
|
- **Multiple developer interfaces**
|
||||||
- CLI: Command line interface
|
- CLI: Command line interface
|
||||||
- Python SDK
|
- Python SDK
|
||||||
- Swift iOS SDK
|
- Swift iOS SDK
|
||||||
- Kotlin Android SDK
|
- Kotlin Android SDK
|
||||||
|
|
||||||
- **Sample llama stack applications**
|
- **Sample llama stack applications**
|
||||||
- Python
|
- Python
|
||||||
- iOS
|
- iOS
|
||||||
- Android
|
- Android
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
@ -407,8 +300,8 @@ Published on: 2025-01-22T22:24:01Z
|
||||||
# v0.0.63
|
# v0.0.63
|
||||||
Published on: 2024-12-18T07:17:43Z
|
Published on: 2024-12-18T07:17:43Z
|
||||||
|
|
||||||
A small but important bug-fix release to update the URL datatype for the client-SDKs. The issue affected multimodal agentic turns especially.
|
A small but important bug-fix release to update the URL datatype for the client-SDKs. The issue affected multimodal agentic turns especially.
|
||||||
|
|
||||||
**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.0.62...v0.0.63
|
**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.0.62...v0.0.63
|
||||||
|
|
||||||
---
|
---
|
||||||
|
@ -444,39 +337,39 @@ Published on: 2024-11-22T00:36:09Z
|
||||||
# v0.0.53
|
# v0.0.53
|
||||||
Published on: 2024-11-20T22:18:00Z
|
Published on: 2024-11-20T22:18:00Z
|
||||||
|
|
||||||
🚀 Initial Release Notes for Llama Stack!
|
🚀 Initial Release Notes for Llama Stack!
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
- Resource-oriented design for models, shields, memory banks, datasets and eval tasks
|
- Resource-oriented design for models, shields, memory banks, datasets and eval tasks
|
||||||
- Persistence for registered objects with distribution
|
- Persistence for registered objects with distribution
|
||||||
- Ability to persist memory banks created for FAISS
|
- Ability to persist memory banks created for FAISS
|
||||||
- PostgreSQL KVStore implementation
|
- PostgreSQL KVStore implementation
|
||||||
- Environment variable placeholder support in run.yaml files
|
- Environment variable placeholder support in run.yaml files
|
||||||
- Comprehensive Zero-to-Hero notebooks and quickstart guides
|
- Comprehensive Zero-to-Hero notebooks and quickstart guides
|
||||||
- Support for quantized models in Ollama
|
- Support for quantized models in Ollama
|
||||||
- Vision model support for Together, Fireworks, Meta-Reference, Ollama, and vLLM
|
||||||
- Bedrock distribution with safety shields support
|
- Bedrock distribution with safety shields support
|
||||||
- Evals API with task registration and scoring functions
|
- Evals API with task registration and scoring functions
|
||||||
- MMLU and SimpleQA benchmark scoring functions
|
- MMLU and SimpleQA benchmark scoring functions
|
||||||
- Huggingface dataset provider integration for benchmarks
|
- Huggingface dataset provider integration for benchmarks
|
||||||
- Support for custom dataset registration from local paths
|
- Support for custom dataset registration from local paths
|
||||||
- Benchmark evaluation CLI tools with visualization tables
|
- Benchmark evaluation CLI tools with visualization tables
|
||||||
- RAG evaluation scoring functions and metrics
|
- RAG evaluation scoring functions and metrics
|
||||||
- Local persistence for datasets and eval tasks
|
- Local persistence for datasets and eval tasks
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
- Split safety into distinct providers (llama-guard, prompt-guard, code-scanner)
|
- Split safety into distinct providers (llama-guard, prompt-guard, code-scanner)
|
||||||
- Changed provider naming convention (`impls` → `inline`, `adapters` → `remote`)
|
- Changed provider naming convention (`impls` → `inline`, `adapters` → `remote`)
|
||||||
- Updated API signatures for dataset and eval task registration
|
- Updated API signatures for dataset and eval task registration
|
||||||
- Restructured folder organization for providers
|
- Restructured folder organization for providers
|
||||||
- Enhanced Docker build configuration
|
- Enhanced Docker build configuration
|
||||||
- Added version prefixing for REST API routes
|
- Added version prefixing for REST API routes
|
||||||
- Enhanced evaluation task registration workflow
|
- Enhanced evaluation task registration workflow
|
||||||
- Improved benchmark evaluation output formatting
|
- Improved benchmark evaluation output formatting
|
||||||
- Restructured evals folder organization for better modularity
|
- Restructured evals folder organization for better modularity
|
||||||
|
|
||||||
### Removed
|
### Removed
|
||||||
- `llama stack configure` command
|
- `llama stack configure` command
|
||||||
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
|
@ -88,7 +88,7 @@ BRAVE_SEARCH_API_KEY=

And then use this dotenv file when running client SDK tests via the following:

```bash
-uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py --text-model=meta-llama/Llama-3.1-8B-Instruct
+uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py
```
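For reference, the dotenv file is plain KEY=VALUE lines. The sketch below is illustrative only: `BRAVE_SEARCH_API_KEY` comes from the hunk context above, while the other keys are placeholders for whichever remote providers your tests actually exercise.

```bash
# .env (sketch; keep real keys out of version control)
BRAVE_SEARCH_API_KEY=
# Placeholders for provider keys you may need, depending on the tests you run:
FIREWORKS_API_KEY=<your-key>
TOGETHER_API_KEY=<your-key>
```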

## Pre-commit Hooks

@ -110,9 +110,21 @@ uv run pre-commit run --all-files

> [!CAUTION]
> Before pushing your changes, make sure that the pre-commit hooks have passed successfully.

-## Running tests
+## Running unit tests

-You can find the Llama Stack testing documentation [here](tests/README.md).
+You can run the unit tests by running:
+
+```bash
+source .venv/bin/activate
+./scripts/unit-tests.sh
+```
+
+If you'd like to run for a non-default version of Python (currently 3.10), pass the `PYTHON_VERSION` variable as follows:
+
+```
+source .venv/bin/activate
+PYTHON_VERSION=3.13 ./scripts/unit-tests.sh
+```

## Adding a new dependency to the project

@ -125,20 +137,11 @@ uv sync

## Coding Style

-* Comments should provide meaningful insights into the code. Avoid filler comments that simply
-  describe the next step, as they create unnecessary clutter, same goes for docstrings.
-* Prefer comments to clarify surprising behavior and/or relationships between parts of the code
-  rather than explain what the next line of code does.
-* Catching exceptions, prefer using a specific exception type rather than a broad catch-all like
-  `Exception`.
+* Comments should provide meaningful insights into the code. Avoid filler comments that simply describe the next step, as they create unnecessary clutter, same goes for docstrings.
+* Prefer comments to clarify surprising behavior and/or relationships between parts of the code rather than explain what the next line of code does.
+* Catching exceptions, prefer using a specific exception type rather than a broad catch-all like `Exception`.
* Error messages should be prefixed with "Failed to ..."
-* 4 spaces for indentation rather than tab
+* 4 spaces for indentation rather than tabs
-* When using `# noqa` to suppress a style or linter warning, include a comment explaining the
-  justification for bypassing the check.
-* When using `# type: ignore` to suppress a mypy warning, include a comment explaining the
-  justification for bypassing the check.
-* Don't use unicode characters in the codebase. ASCII-only is preferred for compatibility or
-  readability reasons.

## Common Tasks

@ -167,11 +170,14 @@ If you have made changes to a provider's configuration in any form (introducing

If you are making changes to the documentation at [https://llama-stack.readthedocs.io/en/latest/](https://llama-stack.readthedocs.io/en/latest/), you can use the following command to build the documentation and preview your changes. You will need [Sphinx](https://www.sphinx-doc.org/en/master/) and the readthedocs theme.

```bash
+cd docs
+uv sync --extra docs
+
# This rebuilds the documentation pages.
-uv run --group docs make -C docs/ html
+uv run make html

# This will start a local server (usually at http://127.0.0.1:8000) that automatically rebuilds and refreshes when you make changes to the documentation.
-uv run --group docs sphinx-autobuild docs/source docs/build/html --write-all
+uv run sphinx-autobuild source build/html --write-all
```

### Update API Documentation

@ -179,7 +185,7 @@ uv run --group docs sphinx-autobuild docs/source docs/build/html --write-all

If you modify or add new API endpoints, update the API documentation accordingly. You can do this by running the following command:

```bash
-uv run ./docs/openapi_generator/run_openapi_generator.sh
+uv run --with ".[dev]" ./docs/openapi_generator/run_openapi_generator.sh
```

The generated API documentation will be available in `docs/_static/`. Make sure to review the changes before committing.

@ -1,9 +1,8 @@
include pyproject.toml
+include distributions/dependencies.json
include llama_stack/models/llama/llama3/tokenizer.model
-include llama_stack/models/llama/llama4/tokenizer.model
include llama_stack/distribution/*.sh
include llama_stack/cli/scripts/*.sh
include llama_stack/templates/*/*.yaml
include llama_stack/providers/tests/test_cases/inference/*.json
include llama_stack/models/llama/*/*.md
-include llama_stack/tests/integration/*.jpg

118 README.md
@ -3,82 +3,11 @@

[](https://pypi.org/project/llama_stack/)
[](https://pypi.org/project/llama-stack/)
[](https://github.com/meta-llama/llama-stack/blob/main/LICENSE)
[](https://discord.gg/llama-stack)
[](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml?query=branch%3Amain)
[](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml?query=branch%3Amain)

-[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)
+[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb)
||||||
### ✨🎉 Llama 4 Support 🎉✨
|
|
||||||
We released [Version 0.2.0](https://github.com/meta-llama/llama-stack/releases/tag/v0.2.0) with support for the Llama 4 herd of models released by Meta.
|
|
||||||
|
|
||||||
<details>
|
|
||||||
|
|
||||||
<summary>👋 Click here to see how to run Llama 4 models on Llama Stack </summary>
|
|
||||||
|
|
||||||
\
|
|
||||||
*Note you need 8xH100 GPU-host to run these models*
|
|
||||||
|
|
||||||
```bash
|
|
||||||
pip install -U llama_stack
|
|
||||||
|
|
||||||
MODEL="Llama-4-Scout-17B-16E-Instruct"
|
|
||||||
# get meta url from llama.com
|
|
||||||
llama model download --source meta --model-id $MODEL --meta-url <META_URL>
|
|
||||||
|
|
||||||
# start a llama stack server
|
|
||||||
INFERENCE_MODEL=meta-llama/$MODEL llama stack build --run --template meta-reference-gpu
|
|
||||||
|
|
||||||
# install client to interact with the server
|
|
||||||
pip install llama-stack-client
|
|
||||||
```
|
|
||||||
### CLI
|
|
||||||
```bash
|
|
||||||
# Run a chat completion
|
|
||||||
llama-stack-client --endpoint http://localhost:8321 \
|
|
||||||
inference chat-completion \
|
|
||||||
--model-id meta-llama/$MODEL \
|
|
||||||
--message "write a haiku for meta's llama 4 models"
|
|
||||||
|
|
||||||
ChatCompletionResponse(
|
|
||||||
completion_message=CompletionMessage(content="Whispers in code born\nLlama's gentle, wise heartbeat\nFuture's soft unfold", role='assistant', stop_reason='end_of_turn', tool_calls=[]),
|
|
||||||
logprobs=None,
|
|
||||||
metrics=[Metric(metric='prompt_tokens', value=21.0, unit=None), Metric(metric='completion_tokens', value=28.0, unit=None), Metric(metric='total_tokens', value=49.0, unit=None)]
|
|
||||||
)
|
|
||||||
```
|
|
||||||
### Python SDK
|
|
||||||
```python
|
|
||||||
from llama_stack_client import LlamaStackClient
|
|
||||||
|
|
||||||
client = LlamaStackClient(base_url=f"http://localhost:8321")
|
|
||||||
|
|
||||||
model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
|
|
||||||
prompt = "Write a haiku about coding"
|
|
||||||
|
|
||||||
print(f"User> {prompt}")
|
|
||||||
response = client.inference.chat_completion(
|
|
||||||
model_id=model_id,
|
|
||||||
messages=[
|
|
||||||
{"role": "system", "content": "You are a helpful assistant."},
|
|
||||||
{"role": "user", "content": prompt},
|
|
||||||
],
|
|
||||||
)
|
|
||||||
print(f"Assistant> {response.completion_message.content}")
|
|
||||||
```
|
|
||||||
As more providers start supporting Llama 4, you can use them in Llama Stack as well. We are adding to the list. Stay tuned!
|
|
||||||
|
|
||||||
|
|
||||||
</details>
|
|
||||||
|
|
||||||
### 🚀 One-Line Installer 🚀
|
|
||||||
|
|
||||||
To try Llama Stack locally, run:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
curl -LsSf https://github.com/meta-llama/llama-stack/raw/main/install.sh | sh
|
|
||||||
```
|
|
||||||
|
|
||||||
-### Overview
-
Llama Stack standardizes the core building blocks that simplify AI application development. It codifies best practices across the Llama ecosystem. More specifically, it provides

@ -107,29 +36,25 @@ By reducing friction and complexity, Llama Stack empowers developers to focus on

### API Providers
Here is a list of the various API providers and available distributions that can help developers get started easily with Llama Stack.

| **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** | **Post Training** |
|
| **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** |
|
||||||
|:------------------------:|:----------------------:|:----------:|:-------------:|:----------:|:----------:|:-------------:|:-----------------:|
|
|:------------------------:|:----------------------:|:----------:|:-------------:|:----------:|:----------:|:-------------:|
|
||||||
| Meta Reference | Single Node | ✅ | ✅ | ✅ | ✅ | ✅ | |
|
| Meta Reference | Single Node | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||||
| SambaNova | Hosted | | ✅ | | ✅ | | |
|
| SambaNova | Hosted | | ✅ | | | |
|
||||||
| Cerebras | Hosted | | ✅ | | | | |
|
| Cerebras | Hosted | | ✅ | | | |
|
||||||
| Fireworks | Hosted | ✅ | ✅ | ✅ | | | |
|
| Fireworks | Hosted | ✅ | ✅ | ✅ | | |
|
||||||
| AWS Bedrock | Hosted | | ✅ | | ✅ | | |
|
| AWS Bedrock | Hosted | | ✅ | | ✅ | |
|
||||||
| Together | Hosted | ✅ | ✅ | | ✅ | | |
|
| Together | Hosted | ✅ | ✅ | | ✅ | |
|
||||||
| Groq | Hosted | | ✅ | | | | |
|
| Groq | Hosted | | ✅ | | | |
|
||||||
| Ollama | Single Node | | ✅ | | | | |
|
| Ollama | Single Node | | ✅ | | | |
|
||||||
| TGI | Hosted and Single Node | | ✅ | | | | |
|
| TGI | Hosted and Single Node | | ✅ | | | |
|
||||||
| NVIDIA NIM | Hosted and Single Node | | ✅ | | | | |
|
| NVIDIA NIM | Hosted and Single Node | | ✅ | | | |
|
||||||
| Chroma | Single Node | | | ✅ | | | |
|
| Chroma | Single Node | | | ✅ | | |
|
||||||
| PG Vector | Single Node | | | ✅ | | | |
|
| PG Vector | Single Node | | | ✅ | | |
|
||||||
| PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | | |
|
| PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | |
|
||||||
| vLLM | Hosted and Single Node | | ✅ | | | | |
|
| vLLM | Hosted and Single Node | | ✅ | | | |
|
||||||
| OpenAI | Hosted | | ✅ | | | | |
|
| OpenAI | Hosted | | ✅ | | | |
|
||||||
| Anthropic | Hosted | | ✅ | | | | |
|
| Anthropic | Hosted | | ✅ | | | |
|
||||||
| Gemini | Hosted | | ✅ | | | | |
|
| Gemini | Hosted | | ✅ | | | |
|
||||||
| watsonx | Hosted | | ✅ | | | | |
|
|
||||||
| HuggingFace | Single Node | | | | | | ✅ |
|
|
||||||
| TorchTune | Single Node | | | | | | ✅ |
|
|
||||||
| NVIDIA NEMO | Hosted | | | | | | ✅ |
|
|
||||||
|
|
||||||
|
|
||||||
### Distributions

@ -139,6 +64,7 @@ A Llama Stack Distribution (or "distro") is a pre-configured bundle of provider

| **Distribution** | **Llama Stack Docker** | Start This Distribution |
|:---------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------------------------------------:|
| Meta Reference | [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-gpu.html) |
+| Meta Reference Quantized | [llamastack/distribution-meta-reference-quantized-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-quantized-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-quantized-gpu.html) |
| SambaNova | [llamastack/distribution-sambanova](https://hub.docker.com/repository/docker/llamastack/distribution-sambanova/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/sambanova.html) |
| Cerebras | [llamastack/distribution-cerebras](https://hub.docker.com/repository/docker/llamastack/distribution-cerebras/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/cerebras.html) |
| Ollama | [llamastack/distribution-ollama](https://hub.docker.com/repository/docker/llamastack/distribution-ollama/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/ollama.html) |
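Once one of these distributions is running (they expose the server on port 8321 by default), a quick way to sanity-check it is the `llama-stack-client` CLI. This mirrors the chat-completion example that appears elsewhere in this README; the model id below is only an example and should be replaced with a model your distribution actually serves.

```bash
pip install llama-stack-client

# Example model id; substitute a model served by your running distribution.
llama-stack-client --endpoint http://localhost:8321 \
  inference chat-completion \
  --model-id meta-llama/Llama-3.1-8B-Instruct \
  --message "write a haiku about llama stack"
```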
1 distributions/bedrock/build.yaml  Symbolic link
@ -0,0 +1 @@
../../llama_stack/templates/bedrock/build.yaml

15 distributions/bedrock/compose.yaml  Normal file
@ -0,0 +1,15 @@
services:
  llamastack:
    image: distribution-bedrock
    volumes:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/llamastack-run-bedrock.yaml
    ports:
      - "8321:8321"
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-bedrock.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s

1 distributions/bedrock/run.yaml  Symbolic link
@ -0,0 +1 @@
../../llama_stack/templates/bedrock/run.yaml
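A minimal way to try the compose file above is sketched below. It assumes Docker Compose v2; Bedrock credentials and region are configured through the referenced run.yaml template (not shown in this diff), not through the compose file itself.

```bash
# Start the Bedrock distribution defined by the compose file above.
cd distributions/bedrock
docker compose up
# The Llama Stack server is then reachable on port 8321 (see the ports mapping above).
```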
1
distributions/cerebras/build.yaml
Symbolic link
1
distributions/cerebras/build.yaml
Symbolic link
|
@ -0,0 +1 @@
|
||||||
|
../../llama_stack/templates/cerebras/build.yaml
|
16
distributions/cerebras/compose.yaml
Normal file
16
distributions/cerebras/compose.yaml
Normal file
|
@ -0,0 +1,16 @@
|
||||||
|
services:
|
||||||
|
llamastack:
|
||||||
|
image: llamastack/distribution-cerebras
|
||||||
|
network_mode: "host"
|
||||||
|
volumes:
|
||||||
|
- ~/.llama:/root/.llama
|
||||||
|
- ./run.yaml:/root/llamastack-run-cerebras.yaml
|
||||||
|
ports:
|
||||||
|
- "8321:8321"
|
||||||
|
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-cerebras.yaml"
|
||||||
|
deploy:
|
||||||
|
restart_policy:
|
||||||
|
condition: on-failure
|
||||||
|
delay: 3s
|
||||||
|
max_attempts: 5
|
||||||
|
window: 60s
|
1
distributions/cerebras/run.yaml
Symbolic link
1
distributions/cerebras/run.yaml
Symbolic link
|
@ -0,0 +1 @@
|
||||||
|
../../llama_stack/templates/cerebras/run.yaml
|
distributions/dell-tgi/compose.yaml (new file, 50 lines)
services:
  text-generation-inference:
    image: registry.dell.huggingface.co/enterprise-dell-inference-meta-llama-meta-llama-3.1-8b-instruct
    network_mode: "host"
    volumes:
      - $HOME/.cache/huggingface:/data
    ports:
      - "5009:5009"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=0,1,2,3,4
      - NUM_SHARD=4
      - MAX_BATCH_PREFILL_TOKENS=32768
      - MAX_INPUT_TOKENS=8000
      - MAX_TOTAL_TOKENS=8192
    command: []
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # that's the closest analogue to --gpus; provide
              # an integer amount of devices or 'all'
              count: all
              # Devices are reserved using a list of capabilities, making
              # capabilities the only required field. A device MUST
              # satisfy all the requested capabilities for a successful
              # reservation.
              capabilities: [gpu]
    runtime: nvidia
  llamastack:
    depends_on:
      text-generation-inference:
        condition: service_healthy
    image: llamastack/distribution-tgi
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      # Link to TGI run.yaml file
      - ./run.yaml:/root/my-run.yaml
    ports:
      - "8321:8321"
    # Hack: wait for TGI server to start before starting docker
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
    restart_policy:
      condition: on-failure
      delay: 3s
      max_attempts: 5
      window: 60s
distributions/dell-tgi/run.yaml (new file, 44 lines)
version: '2'
image_name: local
container_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
  inference:
  - provider_id: tgi0
    provider_type: remote::tgi
    config:
      url: http://127.0.0.1:80
  safety:
  - provider_id: meta0
    provider_type: inline::llama-guard
    config:
      model: Llama-Guard-3-1B
      excluded_categories: []
  - provider_id: meta1
    provider_type: inline::prompt-guard
    config:
      model: Prompt-Guard-86M
  memory:
  - provider_id: meta0
    provider_type: inline::faiss
    config: {}
  agents:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config:
      persistence_store:
        namespace: null
        type: sqlite
        db_path: ~/.llama/runtime/kvstore.db
  telemetry:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config: {}

Modified: per-distribution dependency lists (JSON)
@@ -1,23 +1,19 @@
 {
   "bedrock": [
     "aiosqlite",
-    "autoevals",
     "blobfile",
     "boto3",
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "faiss-cpu",
     "fastapi",
     "fire",
     "httpx",
-    "langdetect",
     "matplotlib",
     "mcp",
     "nltk",
     "numpy",
-    "openai",
     "opentelemetry-exporter-otlp-proto-http",
     "opentelemetry-sdk",
     "pandas",
@@ -25,36 +21,29 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
     "scipy",
     "sentencepiece",
-    "sqlalchemy[asyncio]",
     "tqdm",
     "transformers",
-    "tree_sitter",
     "uvicorn"
   ],
   "cerebras": [
     "aiosqlite",
-    "autoevals",
     "blobfile",
     "cerebras_cloud_sdk",
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "faiss-cpu",
     "fastapi",
     "fire",
     "httpx",
-    "langdetect",
     "matplotlib",
     "nltk",
     "numpy",
-    "openai",
     "opentelemetry-exporter-otlp-proto-http",
     "opentelemetry-sdk",
     "pandas",
@@ -62,38 +51,31 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
     "scipy",
     "sentencepiece",
-    "sqlalchemy[asyncio]",
     "tqdm",
     "transformers",
-    "tree_sitter",
     "uvicorn",
     "sentence-transformers --no-deps",
     "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
   ],
   "ci-tests": [
     "aiosqlite",
-    "autoevals",
     "blobfile",
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "fastapi",
     "fire",
     "fireworks-ai",
     "httpx",
-    "langdetect",
     "matplotlib",
     "mcp",
     "nltk",
     "numpy",
-    "openai",
     "opentelemetry-exporter-otlp-proto-http",
     "opentelemetry-sdk",
     "pandas",
@@ -101,17 +83,14 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
     "scipy",
     "sentencepiece",
-    "sqlalchemy[asyncio]",
     "sqlite-vec",
     "tqdm",
     "transformers",
-    "tree_sitter",
     "uvicorn",
     "sentence-transformers --no-deps",
     "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
@@ -119,22 +98,18 @@
   "dell": [
     "aiohttp",
     "aiosqlite",
-    "autoevals",
     "blobfile",
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "faiss-cpu",
     "fastapi",
     "fire",
     "httpx",
     "huggingface_hub",
-    "langdetect",
     "matplotlib",
     "nltk",
     "numpy",
-    "openai",
     "opentelemetry-exporter-otlp-proto-http",
     "opentelemetry-sdk",
     "pandas",
@@ -142,39 +117,66 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
     "scipy",
     "sentencepiece",
-    "sqlalchemy[asyncio]",
     "tqdm",
     "transformers",
-    "tree_sitter",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
+  ],
+  "dev": [
+    "aiosqlite",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "datasets",
+    "fastapi",
+    "fire",
+    "fireworks-ai",
+    "httpx",
+    "litellm",
+    "matplotlib",
+    "mcp",
+    "nltk",
+    "numpy",
+    "opentelemetry-exporter-otlp-proto-http",
+    "opentelemetry-sdk",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pymongo",
+    "pypdf",
+    "redis",
+    "requests",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "sqlite-vec",
+    "tqdm",
+    "transformers",
     "uvicorn",
     "sentence-transformers --no-deps",
     "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
   ],
   "fireworks": [
     "aiosqlite",
-    "autoevals",
     "blobfile",
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "faiss-cpu",
     "fastapi",
     "fire",
     "fireworks-ai",
     "httpx",
-    "langdetect",
     "matplotlib",
     "mcp",
     "nltk",
     "numpy",
-    "openai",
     "opentelemetry-exporter-otlp-proto-http",
     "opentelemetry-sdk",
     "pandas",
@@ -182,37 +184,30 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
     "scipy",
     "sentencepiece",
-    "sqlalchemy[asyncio]",
     "tqdm",
     "transformers",
-    "tree_sitter",
     "uvicorn",
     "sentence-transformers --no-deps",
     "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
   ],
   "groq": [
     "aiosqlite",
-    "autoevals",
     "blobfile",
     "chardet",
     "datasets",
-    "emoji",
     "faiss-cpu",
     "fastapi",
     "fire",
     "httpx",
-    "langdetect",
     "litellm",
     "matplotlib",
     "nltk",
     "numpy",
-    "openai",
     "opentelemetry-exporter-otlp-proto-http",
     "opentelemetry-sdk",
     "pandas",
@@ -220,38 +215,31 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
     "scipy",
     "sentencepiece",
-    "sqlalchemy[asyncio]",
     "tqdm",
     "transformers",
-    "tree_sitter",
     "uvicorn"
   ],
   "hf-endpoint": [
     "aiohttp",
     "aiosqlite",
-    "autoevals",
     "blobfile",
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "faiss-cpu",
     "fastapi",
     "fire",
     "httpx",
     "huggingface_hub",
-    "langdetect",
     "matplotlib",
     "mcp",
     "nltk",
     "numpy",
-    "openai",
     "opentelemetry-exporter-otlp-proto-http",
     "opentelemetry-sdk",
     "pandas",
@@ -259,38 +247,31 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
     "scipy",
     "sentencepiece",
-    "sqlalchemy[asyncio]",
     "tqdm",
     "transformers",
-    "tree_sitter",
     "uvicorn"
   ],
   "hf-serverless": [
     "aiohttp",
     "aiosqlite",
-    "autoevals",
     "blobfile",
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "faiss-cpu",
     "fastapi",
     "fire",
     "httpx",
     "huggingface_hub",
-    "langdetect",
     "matplotlib",
     "mcp",
     "nltk",
     "numpy",
-    "openai",
     "opentelemetry-exporter-otlp-proto-http",
     "opentelemetry-sdk",
     "pandas",
@@ -298,95 +279,13 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
     "scipy",
     "sentencepiece",
-    "sqlalchemy[asyncio]",
     "tqdm",
     "transformers",
-    "tree_sitter",
-    "uvicorn",
-    "sentence-transformers --no-deps",
-    "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
-  ],
-  "kvant": [
-    "aiosqlite",
-    "autoevals",
-    "blobfile",
-    "chardet",
-    "chromadb-client",
-    "datasets",
-    "emoji",
-    "faiss-cpu",
-    "fastapi",
-    "fire",
-    "httpx",
-    "langdetect",
-    "matplotlib",
-    "mcp",
-    "nltk",
-    "numpy",
-    "openai",
-    "opentelemetry-exporter-otlp-proto-http",
-    "opentelemetry-sdk",
-    "pandas",
-    "pillow",
-    "psycopg2-binary",
-    "pymongo",
-    "pypdf",
-    "pythainlp",
-    "redis",
-    "requests",
-    "scikit-learn",
-    "scipy",
-    "sentencepiece",
-    "sqlalchemy[asyncio]",
-    "tqdm",
-    "transformers",
-    "tree_sitter",
-    "uvicorn",
-    "sentence-transformers --no-deps",
-    "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
-  ],
-  "llama_api": [
-    "aiosqlite",
-    "autoevals",
-    "blobfile",
-    "chardet",
-    "chromadb-client",
-    "datasets",
-    "emoji",
-    "fastapi",
-    "fire",
-    "httpx",
-    "langdetect",
-    "litellm",
-    "matplotlib",
-    "mcp",
-    "nltk",
-    "numpy",
-    "openai",
-    "opentelemetry-exporter-otlp-proto-http",
-    "opentelemetry-sdk",
-    "pandas",
-    "pillow",
-    "psycopg2-binary",
-    "pymongo",
-    "pypdf",
-    "pythainlp",
-    "redis",
-    "requests",
-    "scikit-learn",
-    "scipy",
-    "sentencepiece",
-    "sqlalchemy[asyncio]",
-    "sqlite-vec",
-    "tqdm",
-    "transformers",
-    "tree_sitter",
     "uvicorn",
     "sentence-transformers --no-deps",
     "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
@@ -394,25 +293,20 @@
   "meta-reference-gpu": [
     "accelerate",
     "aiosqlite",
-    "autoevals",
     "blobfile",
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "fairscale",
     "faiss-cpu",
     "fastapi",
-    "fbgemm-gpu-genai==1.1.2",
     "fire",
     "httpx",
-    "langdetect",
     "lm-format-enforcer",
     "matplotlib",
     "mcp",
     "nltk",
     "numpy",
-    "openai",
     "opentelemetry-exporter-otlp-proto-http",
     "opentelemetry-sdk",
     "pandas",
@@ -420,20 +314,55 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
     "scipy",
     "sentence-transformers",
     "sentencepiece",
-    "sqlalchemy[asyncio]",
     "torch",
-    "torchao==0.8.0",
     "torchvision",
     "tqdm",
     "transformers",
-    "tree_sitter",
+    "uvicorn",
+    "zmq"
+  ],
+  "meta-reference-quantized-gpu": [
+    "accelerate",
+    "aiosqlite",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "datasets",
+    "fairscale",
+    "faiss-cpu",
+    "fastapi",
+    "fbgemm-gpu",
+    "fire",
+    "httpx",
+    "lm-format-enforcer",
+    "matplotlib",
+    "mcp",
+    "nltk",
+    "numpy",
+    "opentelemetry-exporter-otlp-proto-http",
+    "opentelemetry-sdk",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pymongo",
+    "pypdf",
+    "redis",
+    "requests",
+    "scikit-learn",
+    "scipy",
+    "sentence-transformers",
+    "sentencepiece",
+    "torch",
+    "torchao==0.5.0",
+    "torchvision",
+    "tqdm",
+    "transformers",
     "uvicorn",
     "zmq"
   ],
@@ -442,7 +371,6 @@
     "aiosqlite",
     "blobfile",
     "chardet",
-    "datasets",
     "faiss-cpu",
     "fastapi",
     "fire",
@@ -463,7 +391,6 @@
     "scikit-learn",
     "scipy",
     "sentencepiece",
-    "sqlalchemy[asyncio]",
     "tqdm",
     "transformers",
     "uvicorn"
@@ -471,63 +398,19 @@
   "ollama": [
     "aiohttp",
     "aiosqlite",
-    "autoevals",
     "blobfile",
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "faiss-cpu",
     "fastapi",
     "fire",
     "httpx",
-    "langdetect",
     "matplotlib",
     "mcp",
     "nltk",
     "numpy",
     "ollama",
-    "openai",
-    "opentelemetry-exporter-otlp-proto-http",
-    "opentelemetry-sdk",
-    "pandas",
-    "peft",
-    "pillow",
-    "psycopg2-binary",
-    "pymongo",
-    "pypdf",
-    "pythainlp",
-    "redis",
-    "requests",
-    "scikit-learn",
-    "scipy",
-    "sentencepiece",
-    "sqlalchemy[asyncio]",
-    "torch",
-    "tqdm",
-    "transformers",
-    "tree_sitter",
-    "trl",
-    "uvicorn"
-  ],
-  "open-benchmark": [
-    "aiosqlite",
-    "autoevals",
-    "blobfile",
-    "chardet",
-    "chromadb-client",
-    "datasets",
-    "emoji",
-    "fastapi",
-    "fire",
-    "httpx",
-    "langdetect",
-    "litellm",
-    "matplotlib",
-    "mcp",
-    "nltk",
-    "numpy",
-    "openai",
     "opentelemetry-exporter-otlp-proto-http",
     "opentelemetry-sdk",
     "pandas",
@@ -535,38 +418,29 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
     "scipy",
     "sentencepiece",
-    "sqlalchemy[asyncio]",
-    "sqlite-vec",
-    "together",
     "tqdm",
     "transformers",
-    "tree_sitter",
     "uvicorn"
   ],
   "passthrough": [
     "aiosqlite",
-    "autoevals",
     "blobfile",
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "faiss-cpu",
     "fastapi",
     "fire",
     "httpx",
-    "langdetect",
     "matplotlib",
     "mcp",
     "nltk",
     "numpy",
-    "openai",
     "opentelemetry-exporter-otlp-proto-http",
     "opentelemetry-sdk",
     "pandas",
@@ -574,33 +448,27 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
     "scipy",
     "sentencepiece",
-    "sqlalchemy[asyncio]",
     "tqdm",
     "transformers",
-    "tree_sitter",
     "uvicorn",
     "sentence-transformers --no-deps",
     "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
   ],
   "remote-vllm": [
     "aiosqlite",
-    "autoevals",
     "blobfile",
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "faiss-cpu",
     "fastapi",
     "fire",
     "httpx",
-    "langdetect",
     "matplotlib",
     "mcp",
     "nltk",
@@ -613,16 +481,13 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
     "scipy",
     "sentencepiece",
-    "sqlalchemy[asyncio]",
     "tqdm",
     "transformers",
-    "tree_sitter",
     "uvicorn",
     "sentence-transformers --no-deps",
     "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
@@ -636,46 +501,7 @@
     "fastapi",
     "fire",
     "httpx",
-    "litellm",
     "matplotlib",
-    "mcp",
-    "nltk",
-    "numpy",
-    "opentelemetry-exporter-otlp-proto-http",
-    "opentelemetry-sdk",
-    "pandas",
-    "pillow",
-    "psycopg2-binary",
-    "pymongo",
-    "pypdf",
-    "redis",
-    "requests",
-    "scikit-learn",
-    "scipy",
-    "sentencepiece",
-    "sqlalchemy[asyncio]",
-    "tqdm",
-    "transformers",
-    "uvicorn",
-    "sentence-transformers --no-deps",
-    "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
-  ],
-  "starter": [
-    "aiosqlite",
-    "autoevals",
-    "blobfile",
-    "chardet",
-    "chromadb-client",
-    "datasets",
-    "emoji",
-    "fastapi",
-    "fire",
-    "fireworks-ai",
-    "httpx",
-    "langdetect",
-    "litellm",
-    "matplotlib",
-    "mcp",
     "nltk",
     "numpy",
     "openai",
@@ -686,41 +512,31 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
     "scipy",
     "sentencepiece",
-    "sqlalchemy[asyncio]",
-    "sqlite-vec",
     "tqdm",
     "transformers",
-    "tree_sitter",
-    "uvicorn",
-    "sentence-transformers --no-deps",
-    "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
+    "uvicorn"
   ],
   "tgi": [
     "aiohttp",
     "aiosqlite",
-    "autoevals",
     "blobfile",
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "faiss-cpu",
     "fastapi",
     "fire",
     "httpx",
     "huggingface_hub",
-    "langdetect",
     "matplotlib",
     "mcp",
     "nltk",
     "numpy",
-    "openai",
     "opentelemetry-exporter-otlp-proto-http",
     "opentelemetry-sdk",
     "pandas",
@@ -728,38 +544,31 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
     "scipy",
     "sentencepiece",
-    "sqlalchemy[asyncio]",
     "tqdm",
     "transformers",
-    "tree_sitter",
     "uvicorn",
     "sentence-transformers --no-deps",
     "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
   ],
   "together": [
     "aiosqlite",
-    "autoevals",
     "blobfile",
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "faiss-cpu",
     "fastapi",
     "fire",
     "httpx",
-    "langdetect",
     "matplotlib",
     "mcp",
     "nltk",
     "numpy",
-    "openai",
     "opentelemetry-exporter-otlp-proto-http",
     "opentelemetry-sdk",
     "pandas",
@@ -767,79 +576,32 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
     "scipy",
     "sentencepiece",
-    "sqlalchemy[asyncio]",
     "together",
     "tqdm",
     "transformers",
-    "tree_sitter",
-    "uvicorn",
-    "sentence-transformers --no-deps",
-    "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
-  ],
-  "verification": [
-    "aiosqlite",
-    "autoevals",
-    "blobfile",
-    "chardet",
-    "chromadb-client",
-    "datasets",
-    "emoji",
-    "fastapi",
-    "fire",
-    "httpx",
-    "langdetect",
-    "litellm",
-    "matplotlib",
-    "mcp",
-    "nltk",
-    "numpy",
-    "openai",
-    "opentelemetry-exporter-otlp-proto-http",
-    "opentelemetry-sdk",
-    "pandas",
-    "pillow",
-    "psycopg2-binary",
-    "pymongo",
-    "pypdf",
-    "pythainlp",
-    "redis",
-    "requests",
-    "scikit-learn",
-    "scipy",
-    "sentencepiece",
-    "sqlalchemy[asyncio]",
-    "sqlite-vec",
-    "tqdm",
-    "transformers",
-    "tree_sitter",
     "uvicorn",
     "sentence-transformers --no-deps",
     "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
   ],
   "vllm-gpu": [
     "aiosqlite",
-    "autoevals",
     "blobfile",
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "faiss-cpu",
     "fastapi",
     "fire",
     "httpx",
-    "langdetect",
     "matplotlib",
     "mcp",
     "nltk",
     "numpy",
-    "openai",
     "opentelemetry-exporter-otlp-proto-http",
     "opentelemetry-sdk",
     "pandas",
@@ -847,58 +609,16 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
     "scipy",
     "sentencepiece",
-    "sqlalchemy[asyncio]",
     "tqdm",
     "transformers",
-    "tree_sitter",
     "uvicorn",
     "vllm",
     "sentence-transformers --no-deps",
     "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
-  ],
-  "watsonx": [
-    "aiosqlite",
-    "autoevals",
-    "blobfile",
-    "chardet",
-    "datasets",
-    "emoji",
-    "faiss-cpu",
-    "fastapi",
-    "fire",
-    "httpx",
-    "ibm_watson_machine_learning",
-    "langdetect",
-    "matplotlib",
-    "mcp",
-    "nltk",
-    "numpy",
-    "openai",
-    "opentelemetry-exporter-otlp-proto-http",
-    "opentelemetry-sdk",
-    "pandas",
-    "pillow",
-    "psycopg2-binary",
-    "pymongo",
-    "pypdf",
-    "pythainlp",
-    "redis",
-    "requests",
-    "scikit-learn",
-    "scipy",
-    "sentencepiece",
-    "sqlalchemy[asyncio]",
-    "tqdm",
-    "transformers",
-    "tree_sitter",
-    "uvicorn",
-    "sentence-transformers --no-deps",
-    "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
   ]
 }
distributions/fireworks/build.yaml (new symbolic link)
-> ../../llama_stack/templates/fireworks/build.yaml

distributions/fireworks/compose.yaml (new file, 14 lines)
services:
  llamastack:
    image: llamastack/distribution-fireworks
    ports:
      - "8321:8321"
    environment:
      - FIREWORKS_API_KEY=${FIREWORKS_API_KEY}
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --template fireworks"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s

distributions/fireworks/run.yaml (new symbolic link)
-> ../../llama_stack/templates/fireworks/run.yaml
distributions/meta-reference-gpu/build.yaml (new symbolic link)
-> ../../llama_stack/templates/meta-reference-gpu/build.yaml

distributions/meta-reference-gpu/compose.yaml (new file, 34 lines)
services:
  llamastack:
    image: llamastack/distribution-meta-reference-gpu
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/my-run.yaml
    ports:
      - "8321:8321"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=0
    command: []
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # that's the closest analogue to --gpus; provide
              # an integer amount of devices or 'all'
              count: 1
              # Devices are reserved using a list of capabilities, making
              # capabilities the only required field. A device MUST
              # satisfy all the requested capabilities for a successful
              # reservation.
              capabilities: [gpu]
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s
    runtime: nvidia
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"

distributions/meta-reference-gpu/run-with-safety.yaml (new symbolic link)
-> ../../llama_stack/templates/meta-reference-gpu/run-with-safety.yaml

distributions/meta-reference-gpu/run.yaml (new symbolic link)
-> ../../llama_stack/templates/meta-reference-gpu/run.yaml
distributions/meta-reference-quantized-gpu/build.yaml (new symbolic link)
-> ../../llama_stack/templates/meta-reference-quantized-gpu/build.yaml

distributions/meta-reference-quantized-gpu/compose.yaml (new file, 35 lines)
services:
  llamastack:
    image: llamastack/distribution-meta-reference-quantized-gpu
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/my-run.yaml
    ports:
      - "8321:8321"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=0
    command: []
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # that's the closest analogue to --gpus; provide
              # an integer amount of devices or 'all'
              count: 1
              # Devices are reserved using a list of capabilities, making
              # capabilities the only required field. A device MUST
              # satisfy all the requested capabilities for a successful
              # reservation.
              capabilities: [gpu]
    runtime: nvidia
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s

distributions/meta-reference-quantized-gpu/run.yaml (new file, 58 lines)
version: '2'
image_name: local
container_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
  inference:
  - provider_id: meta0
    provider_type: inline::meta-reference-quantized
    config:
      model: Llama3.2-3B-Instruct:int4-qlora-eo8
      quantization:
        type: int4
      torch_seed: null
      max_seq_len: 2048
      max_batch_size: 1
  - provider_id: meta1
    provider_type: inline::meta-reference-quantized
    config:
      # not a quantized model !
      model: Llama-Guard-3-1B
      quantization: null
      torch_seed: null
      max_seq_len: 2048
      max_batch_size: 1
  safety:
  - provider_id: meta0
    provider_type: inline::llama-guard
    config:
      model: Llama-Guard-3-1B
      excluded_categories: []
  - provider_id: meta1
    provider_type: inline::prompt-guard
    config:
      model: Prompt-Guard-86M
  memory:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config: {}
  agents:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config:
      persistence_store:
        namespace: null
        type: sqlite
        db_path: ~/.llama/runtime/kvstore.db
  telemetry:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config: {}
distributions/ollama/build.yaml (new symbolic link)
-> ../../llama_stack/templates/ollama/build.yaml

distributions/ollama/compose.yaml (new file, 71 lines)
services:
  ollama:
    image: ollama/ollama:latest
    network_mode: ${NETWORK_MODE:-bridge}
    volumes:
      - ~/.ollama:/root/.ollama
    ports:
      - "11434:11434"
    environment:
      OLLAMA_DEBUG: 1
    command: []
    deploy:
      resources:
        limits:
          memory: 8G  # Set maximum memory
        reservations:
          memory: 8G  # Set minimum memory reservation
    # healthcheck:
    #   # ugh, no CURL in ollama image
    #   test: ["CMD", "curl", "-f", "http://ollama:11434"]
    #   interval: 10s
    #   timeout: 5s
    #   retries: 5

  ollama-init:
    image: ollama/ollama:latest
    depends_on:
      - ollama
      # condition: service_healthy
    network_mode: ${NETWORK_MODE:-bridge}
    environment:
      - OLLAMA_HOST=ollama
      - INFERENCE_MODEL=${INFERENCE_MODEL}
      - SAFETY_MODEL=${SAFETY_MODEL:-}
    volumes:
      - ~/.ollama:/root/.ollama
      - ./pull-models.sh:/pull-models.sh
    entrypoint: ["/pull-models.sh"]

  llamastack:
    depends_on:
      ollama:
        condition: service_started
      ollama-init:
        condition: service_started
    image: ${LLAMA_STACK_IMAGE:-llamastack/distribution-ollama}
    network_mode: ${NETWORK_MODE:-bridge}
    volumes:
      - ~/.llama:/root/.llama
      # Link to ollama run.yaml file
      - ~/local/llama-stack/:/app/llama-stack-source
      - ./run${SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
    ports:
      - "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
    environment:
      - INFERENCE_MODEL=${INFERENCE_MODEL}
      - SAFETY_MODEL=${SAFETY_MODEL:-}
      - OLLAMA_URL=http://ollama:11434
    entrypoint: >
      python -m llama_stack.distribution.server.server /root/my-run.yaml \
      --port ${LLAMA_STACK_PORT:-8321}
    deploy:
      restart_policy:
        condition: on-failure
        delay: 10s
        max_attempts: 3
        window: 60s

volumes:
  ollama:
  ollama-init:
  llamastack:

distributions/ollama/pull-models.sh (new executable file, 18 lines)
#!/bin/sh

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

echo "Preloading (${INFERENCE_MODEL}, ${SAFETY_MODEL})..."
for model in ${INFERENCE_MODEL} ${SAFETY_MODEL}; do
  echo "Preloading $model..."
  if ! ollama run "$model"; then
    echo "Failed to pull and run $model"
    exit 1
  fi
done

echo "All models pulled successfully"

distributions/ollama/run-with-safety.yaml (new symbolic link)
-> ../../llama_stack/templates/ollama/run-with-safety.yaml

distributions/ollama/run.yaml (new symbolic link)
-> ../../llama_stack/templates/ollama/run.yaml
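The Ollama compose file and pull script above are driven entirely by environment variables (INFERENCE_MODEL, SAFETY_MODEL, LLAMA_STACK_PORT, NETWORK_MODE). A minimal sketch of how one might bring this distribution up, assuming the model tag used here is only a placeholder and not a value taken from this diff:

    # run from the directory that contains distributions/ollama/compose.yaml
    export INFERENCE_MODEL=llama3.2:3b        # placeholder model tag
    export LLAMA_STACK_PORT=8321              # matches the compose default
    docker compose up

docker compose substitutes these variables into the service definitions, the ollama-init service runs pull-models.sh to preload the listed models, and the llamastack service then serves on the chosen port; setting SAFETY_MODEL additionally switches the mounted config to run-with-safety.yaml via the ${SAFETY_MODEL:+-with-safety} expansion.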
distributions/ramalama/faiss_store.db (new binary file; contents not shown)
distributions/remote-nvidia/build.yaml (new symbolic link)
-> ../../llama_stack/templates/nvidia/build.yaml

distributions/remote-nvidia/compose.yaml (new file, 19 lines)
services:
  llamastack:
    image: distribution-nvidia:dev
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/llamastack-run-nvidia.yaml
    ports:
      - "8321:8321"
    environment:
      - INFERENCE_MODEL=${INFERENCE_MODEL:-Llama3.1-8B-Instruct}
      - NVIDIA_API_KEY=${NVIDIA_API_KEY:-}
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml-config /root/llamastack-run-nvidia.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s

distributions/remote-nvidia/run.yaml (new symbolic link)
-> ../../llama_stack/templates/nvidia/run.yaml
distributions/remote-vllm/build.yaml (new symbolic link)
-> ../../llama_stack/templates/remote-vllm/build.yaml

distributions/remote-vllm/compose.yaml (new file, 99 lines)
services:
  vllm-inference:
    image: vllm/vllm-openai:latest
    volumes:
      - $HOME/.cache/huggingface:/root/.cache/huggingface
    network_mode: ${NETWORK_MODE:-bridged}
    ports:
      - "${VLLM_INFERENCE_PORT:-5100}:${VLLM_INFERENCE_PORT:-5100}"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=${VLLM_INFERENCE_GPU:-0}
      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
    command: >
      --gpu-memory-utilization 0.75
      --model ${VLLM_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      --enforce-eager
      --max-model-len 8192
      --max-num-seqs 16
      --port ${VLLM_INFERENCE_PORT:-5100}
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_INFERENCE_PORT:-5100}/v1/health"]
      interval: 30s
      timeout: 10s
      retries: 5
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia

  # A little trick:
  # if VLLM_SAFETY_MODEL is set, we will create a service for the safety model
  # otherwise, the entry will end in a hyphen which gets ignored by docker compose
  vllm-${VLLM_SAFETY_MODEL:+safety}:
    image: vllm/vllm-openai:latest
    volumes:
      - $HOME/.cache/huggingface:/root/.cache/huggingface
    network_mode: ${NETWORK_MODE:-bridged}
    ports:
      - "${VLLM_SAFETY_PORT:-5101}:${VLLM_SAFETY_PORT:-5101}"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=${VLLM_SAFETY_GPU:-1}
      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
    command: >
      --gpu-memory-utilization 0.75
      --model ${VLLM_SAFETY_MODEL}
      --enforce-eager
      --max-model-len 8192
      --max-num-seqs 16
      --port ${VLLM_SAFETY_PORT:-5101}
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_SAFETY_PORT:-5101}/v1/health"]
      interval: 30s
      timeout: 10s
      retries: 5
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia
  llamastack:
    depends_on:
      - vllm-inference:
          condition: service_healthy
      - vllm-${VLLM_SAFETY_MODEL:+safety}:
          condition: service_healthy
    image: llamastack/distribution-remote-vllm:test-0.0.52rc3
    volumes:
      - ~/.llama:/root/.llama
      - ./run${VLLM_SAFETY_MODEL:+-with-safety}.yaml:/root/llamastack-run-remote-vllm.yaml
    network_mode: ${NETWORK_MODE:-bridged}
    environment:
      - VLLM_URL=http://vllm-inference:${VLLM_INFERENCE_PORT:-5100}/v1
      - VLLM_SAFETY_URL=http://vllm-safety:${VLLM_SAFETY_PORT:-5101}/v1
      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      - MAX_TOKENS=${MAX_TOKENS:-4096}
      - SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm}
      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
    ports:
      - "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
    # Hack: wait for vLLM server to start before starting docker
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 8321"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s

volumes:
  vllm-inference:
  vllm-safety:
  llamastack:

distributions/remote-vllm/run-with-safety.yaml (new symbolic link)
-> ../../llama_stack/templates/remote-vllm/run-with-safety.yaml

distributions/remote-vllm/run.yaml (new symbolic link)
-> ../../llama_stack/templates/remote-vllm/run.yaml
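As with the Ollama distribution, the remote-vllm compose file is parameterized through environment variables. A rough sketch, assuming HF_TOKEN is a valid Hugging Face token (the value below is only a placeholder) and using the default model names shown in the compose file:

    export HF_TOKEN=hf_xxx                                    # placeholder token
    export VLLM_SAFETY_MODEL=meta-llama/Llama-Guard-3-1B      # optional
    docker compose up

Setting VLLM_SAFETY_MODEL is what instantiates the optional vllm-safety service and switches the mounted config to run-with-safety.yaml; leaving it unset collapses that service entry, as the comment in the compose file explains.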
distributions/runpod/build.yaml (new file, 9 lines)
name: runpod
distribution_spec:
  description: Use Runpod for running LLM inference
  providers:
    inference: remote::runpod
    memory: meta-reference
    safety: meta-reference
    agents: meta-reference
    telemetry: meta-reference
distributions/sambanova/build.yaml
Normal file
1
distributions/sambanova/build.yaml
Normal file
|
@ -0,0 +1 @@
|
||||||
|
../../llama_stack/templates/sambanova/build.yaml
|
16
distributions/sambanova/compose.yaml
Normal file
16
distributions/sambanova/compose.yaml
Normal file
|
@ -0,0 +1,16 @@
|
||||||
|
services:
|
||||||
|
llamastack:
|
||||||
|
image: llamastack/distribution-sambanova
|
||||||
|
network_mode: "host"
|
||||||
|
volumes:
|
||||||
|
- ~/.llama:/root/.llama
|
||||||
|
- ./run.yaml:/root/llamastack-run-sambanova.yaml
|
||||||
|
ports:
|
||||||
|
- "5000:5000"
|
||||||
|
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-sambanova.yaml"
|
||||||
|
deploy:
|
||||||
|
restart_policy:
|
||||||
|
condition: on-failure
|
||||||
|
delay: 3s
|
||||||
|
max_attempts: 5
|
||||||
|
window: 60s
|
1
distributions/sambanova/run.yaml
Normal file
1
distributions/sambanova/run.yaml
Normal file
|
@ -0,0 +1 @@
|
||||||
|
../../llama_stack/templates/sambanova/run.yaml
|
distributions/tgi/build.yaml (new symbolic link)
-> ../../llama_stack/templates/tgi/build.yaml

distributions/tgi/compose.yaml (new file, 103 lines)
services:
  tgi-inference:
    image: ghcr.io/huggingface/text-generation-inference:latest
    volumes:
      - $HOME/.cache/huggingface:/data
    network_mode: ${NETWORK_MODE:-bridged}
    ports:
      - "${TGI_INFERENCE_PORT:-8080}:${TGI_INFERENCE_PORT:-8080}"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=${TGI_INFERENCE_GPU:-0}
      - HF_TOKEN=$HF_TOKEN
      - HF_HOME=/data
      - HF_DATASETS_CACHE=/data
      - HF_MODULES_CACHE=/data
      - HF_HUB_CACHE=/data
    command: >
      --dtype bfloat16
      --usage-stats off
      --sharded false
      --model-id ${TGI_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      --port ${TGI_INFERENCE_PORT:-8080}
      --cuda-memory-fraction 0.75
    healthcheck:
      test: ["CMD", "curl", "-f", "http://tgi-inference:${TGI_INFERENCE_PORT:-8080}/health"]
      interval: 5s
      timeout: 5s
      retries: 30
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia

  tgi-${TGI_SAFETY_MODEL:+safety}:
    image: ghcr.io/huggingface/text-generation-inference:latest
    volumes:
      - $HOME/.cache/huggingface:/data
    network_mode: ${NETWORK_MODE:-bridged}
    ports:
      - "${TGI_SAFETY_PORT:-8081}:${TGI_SAFETY_PORT:-8081}"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=${TGI_SAFETY_GPU:-1}
      - HF_TOKEN=$HF_TOKEN
      - HF_HOME=/data
      - HF_DATASETS_CACHE=/data
      - HF_MODULES_CACHE=/data
      - HF_HUB_CACHE=/data
    command: >
      --dtype bfloat16
      --usage-stats off
      --sharded false
      --model-id ${TGI_SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
      --port ${TGI_SAFETY_PORT:-8081}
      --cuda-memory-fraction 0.75
    healthcheck:
      test: ["CMD", "curl", "-f", "http://tgi-safety:${TGI_SAFETY_PORT:-8081}/health"]
      interval: 5s
      timeout: 5s
      retries: 30
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia

  llamastack:
    depends_on:
      tgi-inference:
        condition: service_healthy
      tgi-${TGI_SAFETY_MODEL:+safety}:
        condition: service_healthy
    image: llamastack/distribution-tgi:test-0.0.52rc3
    network_mode: ${NETWORK_MODE:-bridged}
    volumes:
      - ~/.llama:/root/.llama
      - ./run${TGI_SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
    ports:
      - "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
    # Hack: wait for TGI server to start before starting docker
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
    restart_policy:
      condition: on-failure
      delay: 3s
      max_attempts: 5
      window: 60s
    environment:
      - TGI_URL=http://tgi-inference:${TGI_INFERENCE_PORT:-8080}
      - SAFETY_TGI_URL=http://tgi-safety:${TGI_SAFETY_PORT:-8081}
      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}

volumes:
  tgi-inference:
  tgi-safety:
  llamastack:

distributions/tgi/run-with-safety.yaml (new symbolic link)
-> ../../llama_stack/templates/tgi/run-with-safety.yaml

distributions/tgi/run.yaml (new symbolic link)
-> ../../llama_stack/templates/tgi/run.yaml
distributions/together/build.yaml (new symbolic link)
-> ../../llama_stack/templates/together/build.yaml

distributions/together/compose.yaml (new file, 14 lines)
services:
  llamastack:
    image: llamastack/distribution-together
    ports:
      - "8321:8321"
    environment:
      - TOGETHER_API_KEY=${TOGETHER_API_KEY}
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --template together"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s

distributions/together/run.yaml (new symbolic link)
-> ../../llama_stack/templates/together/run.yaml
1
distributions/vllm-gpu/build.yaml
Symbolic link
1
distributions/vllm-gpu/build.yaml
Symbolic link
|
@ -0,0 +1 @@
|
||||||
|
../../llama_stack/templates/inline-vllm/build.yaml
|
distributions/vllm-gpu/compose.yaml (normal file, 35 lines)
@@ -0,0 +1,35 @@
+services:
+  llamastack:
+    image: llamastack/distribution-inline-vllm
+    network_mode: "host"
+    volumes:
+      - ~/.llama:/root/.llama
+      - ./run.yaml:/root/my-run.yaml
+    ports:
+      - "8321:8321"
+    devices:
+      - nvidia.com/gpu=all
+    environment:
+      - CUDA_VISIBLE_DEVICES=0
+    command: []
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              # that's the closest analogue to --gpus; provide
+              # an integer amount of devices or 'all'
+              count: 1
+              # Devices are reserved using a list of capabilities, making
+              # capabilities the only required field. A device MUST
+              # satisfy all the requested capabilities for a successful
+              # reservation.
+              capabilities: [gpu]
+    runtime: nvidia
+    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
+    deploy:
+      restart_policy:
+        condition: on-failure
+        delay: 3s
+        max_attempts: 5
+        window: 60s
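Because this distribution runs vLLM inline, the container needs direct GPU access through the nvidia runtime configured above. A launch sketch, assuming a single-GPU host and assuming nvidia-smi is available inside the image:

```bash
# Sketch: start the inline-vLLM distribution on a GPU host.
export CUDA_VISIBLE_DEVICES=0   # mirrors the environment entry in the compose file
docker compose up -d

# Assumed check: confirm the llamastack service can see the GPU through the nvidia runtime.
docker compose exec llamastack nvidia-smi
```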
distributions/vllm-gpu/run.yaml (normal file, 66 lines)
@@ -0,0 +1,66 @@
+version: '2'
+image_name: local
+container_image: null
+conda_env: local
+apis:
+- shields
+- agents
+- models
+- memory
+- memory_banks
+- inference
+- safety
+providers:
+  inference:
+  - provider_id: vllm-inference
+    provider_type: inline::vllm
+    config:
+      model: Llama3.2-3B-Instruct
+      tensor_parallel_size: 1
+      gpu_memory_utilization: 0.4
+      enforce_eager: true
+      max_tokens: 4096
+  - provider_id: vllm-inference-safety
+    provider_type: inline::vllm
+    config:
+      model: Llama-Guard-3-1B
+      tensor_parallel_size: 1
+      gpu_memory_utilization: 0.2
+      enforce_eager: true
+      max_tokens: 4096
+  safety:
+  - provider_id: meta0
+    provider_type: inline::llama-guard
+    config:
+      model: Llama-Guard-3-1B
+      excluded_categories: []
+  # Uncomment to use prompt guard
+  # - provider_id: meta1
+  #   provider_type: inline::prompt-guard
+  #   config:
+  #     model: Prompt-Guard-86M
+  memory:
+  - provider_id: meta0
+    provider_type: inline::meta-reference
+    config: {}
+  # Uncomment to use pgvector
+  # - provider_id: pgvector
+  #   provider_type: remote::pgvector
+  #   config:
+  #     host: 127.0.0.1
+  #     port: 5432
+  #     db: postgres
+  #     user: postgres
+  #     password: mysecretpassword
+  agents:
+  - provider_id: meta0
+    provider_type: inline::meta-reference
+    config:
+      persistence_store:
+        namespace: null
+        type: sqlite
+        db_path: ~/.llama/runtime/agents_store.db
+  telemetry:
+  - provider_id: meta0
+    provider_type: inline::meta-reference
+    config: {}
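Note that the two inline vLLM providers split a single GPU (gpu_memory_utilization of 0.4 for inference and 0.2 for the safety model), which is why both can load side by side with enforce_eager enabled. Outside Docker, the same file can be handed to the server entrypoint the compose files above already use; a sketch, assuming it is run from a checkout where the config sits at distributions/vllm-gpu/run.yaml:

```bash
# Sketch: run the server directly against this config, mirroring the compose entrypoints above.
python -m llama_stack.distribution.server.server --yaml_config ./distributions/vllm-gpu/run.yaml
```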
docs/_static/css/my_theme.css (vendored, 17 lines)
@@ -16,20 +16,3 @@
 .hide-title h1 {
   display: none;
 }
-
-h2, h3, h4 {
-  font-weight: normal;
-}
-html[data-theme="dark"] .rst-content div[class^="highlight"] {
-  background-color: #0b0b0b;
-}
-pre {
-  white-space: pre-wrap !important;
-  word-break: break-all;
-}
-[data-theme="dark"] .mermaid {
-  background-color: #f4f4f6 !important;
-  border-radius: 6px;
-  padding: 0.5em;
-}
-
docs/_static/js/detect_theme.js (vendored, 32 lines)
@@ -1,32 +0,0 @@
-document.addEventListener("DOMContentLoaded", function () {
-  const prefersDark = window.matchMedia("(prefers-color-scheme: dark)").matches;
-  const htmlElement = document.documentElement;
-
-  // Check if theme is saved in localStorage
-  const savedTheme = localStorage.getItem("sphinx-rtd-theme");
-
-  if (savedTheme) {
-    // Use the saved theme preference
-    htmlElement.setAttribute("data-theme", savedTheme);
-    document.body.classList.toggle("dark", savedTheme === "dark");
-  } else {
-    // Fall back to system preference
-    const theme = prefersDark ? "dark" : "light";
-    htmlElement.setAttribute("data-theme", theme);
-    document.body.classList.toggle("dark", theme === "dark");
-    // Save initial preference
-    localStorage.setItem("sphinx-rtd-theme", theme);
-  }
-
-  // Listen for theme changes from the existing toggle
-  const observer = new MutationObserver(function (mutations) {
-    mutations.forEach(function (mutation) {
-      if (mutation.attributeName === "data-theme") {
-        const currentTheme = htmlElement.getAttribute("data-theme");
-        localStorage.setItem("sphinx-rtd-theme", currentTheme);
-      }
-    });
-  });
-
-  observer.observe(htmlElement, { attributes: true });
-});
docs/_static/llama-stack-spec.html (vendored, 6486 lines): diff suppressed because it is too large.
docs/_static/llama-stack-spec.yaml (vendored, 4765 lines): diff suppressed because it is too large.
Three binary image files not shown (previously 33 KiB, 37 KiB, and 56 KiB).
One further diff suppressed because it is too large; two more suppressed because one or more lines are too long.
@@ -1,35 +1,35 @@
 @ECHO OFF

 pushd %~dp0

 REM Command file for Sphinx documentation

 if "%SPHINXBUILD%" == "" (
 	set SPHINXBUILD=sphinx-build
 )
 set SOURCEDIR=.
 set BUILDDIR=_build

 %SPHINXBUILD% >NUL 2>NUL
 if errorlevel 9009 (
 	echo.
 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
 	echo.installed, then set the SPHINXBUILD environment variable to point
 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
 	echo.may add the Sphinx directory to PATH.
 	echo.
 	echo.If you don't have Sphinx installed, grab it from
 	echo.https://www.sphinx-doc.org/
 	exit /b 1
 )

 if "%1" == "" goto help

 %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
 goto end

 :help
 %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

 :end
 popd
File diff suppressed because it is too large
@@ -840,6 +840,7 @@
 "    \"memory_optimizations.rst\",\n",
 "    \"chat.rst\",\n",
 "    \"llama3.rst\",\n",
+"    \"datasets.rst\",\n",
 "    \"qat_finetune.rst\",\n",
 "    \"lora_finetune.rst\",\n",
 "]\n",
@@ -1585,6 +1586,7 @@
 "    \"memory_optimizations.rst\",\n",
 "    \"chat.rst\",\n",
 "    \"llama3.rst\",\n",
+"    \"datasets.rst\",\n",
 "    \"qat_finetune.rst\",\n",
 "    \"lora_finetune.rst\",\n",
 "]\n",
@@ -44,14 +44,13 @@ def main(output_dir: str):
     if return_type_errors:
         print("\nAPI Method Return Type Validation Errors:\n")
         for error in return_type_errors:
-            print(error, file=sys.stderr)
+            print(error)
         sys.exit(1)
     now = str(datetime.now())
     print(
         "Converting the spec to YAML (openapi.yaml) and HTML (openapi.html) at " + now
     )
     print("")
-
     spec = Specification(
         LlamaStack,
         Options(
@@ -6,7 +6,6 @@

 import hashlib
 import ipaddress
-import types
 import typing
 from dataclasses import make_dataclass
 from typing import Any, Dict, Set, Union
@@ -180,7 +179,7 @@ class ContentBuilder:
         "Creates the content subtree for a request or response."

         def is_iterator_type(t):
-            return "StreamChunk" in str(t) or "OpenAIResponseObjectStream" in str(t)
+            return "StreamChunk" in str(t)

         def get_media_type(t):
             if is_generic_list(t):
@@ -190,7 +189,7 @@ class ContentBuilder:
             else:
                 return "application/json"

-        if typing.get_origin(payload_type) in (typing.Union, types.UnionType):
+        if typing.get_origin(payload_type) is typing.Union:
             media_types = []
             item_types = []
             for x in typing.get_args(payload_type):
@@ -520,7 +519,7 @@ class Generator:
         )

     def _build_extra_tag_groups(
-        self, extra_types: Dict[str, Dict[str, type]]
+        self, extra_types: Dict[str, List[type]]
     ) -> Dict[str, List[Tag]]:
         """
         Creates a dictionary of tag group captions as keys, and tag lists as values.
@@ -533,8 +532,9 @@ class Generator:
         for category_name, category_items in extra_types.items():
             tag_list: List[Tag] = []

-            for name, extra_type in category_items.items():
-                schema = self.schema_builder.classdef_to_schema(extra_type)
+            for extra_type in category_items:
+                name = python_type_to_name(extra_type)
+                schema = self.schema_builder.classdef_to_named_schema(name, extra_type)
                 tag_list.append(self._build_type_tag(name, schema))

             if tag_list:
@@ -759,7 +759,7 @@ class Generator:
         )

         return Operation(
-            tags=[getattr(op.defining_class, "API_NAMESPACE", op.defining_class.__name__)],
+            tags=[op.defining_class.__name__],
             summary=None,
             # summary=doc_string.short_description,
             description=description,
@@ -805,8 +805,6 @@ class Generator:
         operation_tags: List[Tag] = []
         for cls in endpoint_classes:
             doc_string = parse_type(cls)
-            if hasattr(cls, "API_NAMESPACE") and cls.API_NAMESPACE != cls.__name__:
-                continue
             operation_tags.append(
                 Tag(
                     name=cls.__name__,
@@ -865,7 +863,7 @@ class Generator:
         for caption, extra_tag_group in extra_tag_groups.items():
             tag_groups.append(
                 TagGroup(
-                    name=caption,
+                    name=self.options.map(caption),
                     tags=sorted(tag.name for tag in extra_tag_group),
                 )
             )
@@ -132,18 +132,7 @@ def _validate_api_method_return_type(method) -> str | None:

     return_type = hints['return']
     if is_optional_type(return_type):
-        return "returns Optional type where a return value is mandatory"
-
-
-def _validate_api_method_doesnt_return_list(method) -> str | None:
-    hints = get_type_hints(method)
-
-    if 'return' not in hints:
-        return "has no return type annotation"
-
-    return_type = hints['return']
-    if get_origin(return_type) is list:
-        return "returns a list where a PaginatedResponse or List*Response object is expected"
+        return "returns Optional type"


 def _validate_api_delete_method_returns_none(method) -> str | None:
@@ -154,84 +143,15 @@ def _validate_api_delete_method_returns_none(method) -> str | None:

     return_type = hints['return']
     if return_type is not None and return_type is not type(None):
-        return "does not return None where None is mandatory"
-
-
-def _validate_list_parameters_contain_data(method) -> str | None:
-    hints = get_type_hints(method)
-
-    if 'return' not in hints:
-        return "has no return type annotation"
-
-    return_type = hints['return']
-    if not inspect.isclass(return_type):
-        return
-
-    if not return_type.__name__.startswith('List'):
-        return
-
-    if 'data' not in return_type.model_fields:
-        return "does not have a mandatory data attribute containing the list of objects"
-
-
-def _validate_has_ellipsis(method) -> str | None:
-    source = inspect.getsource(method)
-    if "..." not in source and not "NotImplementedError" in source:
-        return "does not contain ellipsis (...) in its implementation"
-
-
-def _validate_has_return_in_docstring(method) -> str | None:
-    source = inspect.getsource(method)
-    return_type = method.__annotations__.get('return')
-    if return_type is not None and return_type != type(None) and ":returns:" not in source:
-        return "does not have a ':returns:' in its docstring"
-
-
-def _validate_has_params_in_docstring(method) -> str | None:
-    source = inspect.getsource(method)
-    sig = inspect.signature(method)
-    # Only check if the method has more than one parameter
-    if len(sig.parameters) > 1 and ":param" not in source:
-        return "does not have a ':param' in its docstring"
-
-
-def _validate_has_no_return_none_in_docstring(method) -> str | None:
-    source = inspect.getsource(method)
-    return_type = method.__annotations__.get('return')
-    if return_type is None and ":returns: None" in source:
-        return "has a ':returns: None' in its docstring which is redundant for None-returning functions"
-
-
-def _validate_docstring_lines_end_with_dot(method) -> str | None:
-    docstring = inspect.getdoc(method)
-    if docstring is None:
-        return None
-
-    lines = docstring.split('\n')
-    for line in lines:
-        line = line.strip()
-        if line and not any(line.endswith(char) for char in '.:{}[]()",'):
-            return f"docstring line '{line}' does not end with a valid character: . : {{ }} [ ] ( ) , \""
-
+        return "does not return None"


 _VALIDATORS = {
     "GET": [
         _validate_api_method_return_type,
-        _validate_list_parameters_contain_data,
-        _validate_api_method_doesnt_return_list,
-        _validate_has_ellipsis,
-        _validate_has_return_in_docstring,
-        _validate_has_params_in_docstring,
-        _validate_docstring_lines_end_with_dot,
     ],
     "DELETE": [
         _validate_api_delete_method_returns_none,
-        _validate_has_ellipsis,
-        _validate_has_return_in_docstring,
-        _validate_has_params_in_docstring,
-        _validate_has_no_return_none_in_docstring
-    ],
-    "POST": [
-        _validate_has_ellipsis,
-        _validate_has_return_in_docstring,
-        _validate_has_params_in_docstring,
-        _validate_has_no_return_none_in_docstring,
-        _validate_docstring_lines_end_with_dot,
     ],
 }
@@ -2,14 +2,6 @@

 Here's a collection of comprehensive guides, examples, and resources for building AI applications with Llama Stack. For the complete documentation, visit our [ReadTheDocs page](https://llama-stack.readthedocs.io/en/latest/index.html).

-## Render locally
-
-From the llama-stack root directory, run the following command to render the docs locally:
-```bash
-uv run --group docs sphinx-autobuild docs/source docs/build/html --write-all
-```
-You can open up the docs in your browser at http://localhost:8000
-
 ## Content

 Try out Llama Stack's capabilities through our detailed Jupyter notebooks:
docs/requirements.txt (normal file, 14 lines)
@@ -0,0 +1,14 @@
+sphinx==8.1.3
+myst-parser
+linkify
+-e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
+sphinx-rtd-theme>=1.0.0
+sphinx-pdj-theme
+sphinx-copybutton
+sphinx-tabs
+sphinx-design
+sphinxcontrib-openapi
+sphinxcontrib-redoc
+sphinxcontrib-mermaid
+sphinxcontrib-video
+tomli
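These pins back the Sphinx toolchain that also appears elsewhere in this diff (the Sphinx command file and the removed "Render locally" instructions above). A build sketch, assuming the docs/source and docs/build/html paths from that removed README section still apply:

```bash
# Sketch: install the docs toolchain and build the HTML docs.
pip install -r docs/requirements.txt
sphinx-build docs/source docs/build/html   # source/output paths assumed from the README snippet above
```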
@@ -1,9 +1,6 @@
-# Agents
+# Llama Stack Agent Framework

-An Agent in Llama Stack is a powerful abstraction that allows you to build complex AI applications.
-
-The Llama Stack agent framework is built on a modular architecture that allows for flexible and powerful AI
-applications. This document explains the key components and how they work together.
+The Llama Stack agent framework is built on a modular architecture that allows for flexible and powerful AI applications. This document explains the key components and how they work together.

 ## Core Concepts

@@ -1,10 +1,6 @@
 ## Agent Execution Loop

-Agents are the heart of Llama Stack applications. They combine inference, memory, safety, and tool usage into coherent
-workflows. At its core, an agent follows a sophisticated execution loop that enables multi-step reasoning, tool usage,
-and safety checks.
+Agents are the heart of complex AI applications. They combine inference, memory, safety, and tool usage into coherent workflows. At its core, an agent follows a sophisticated execution loop that enables multi-step reasoning, tool usage, and safety checks.

-### Steps in the Agent Workflow
-
 Each agent turn follows these key steps:

@@ -68,10 +64,7 @@ sequenceDiagram
     S->>U: 5. Final Response
 ```

-Each step in this process can be monitored and controlled through configurations.
+Each step in this process can be monitored and controlled through configurations. Here's an example that demonstrates monitoring the agent's execution:

-### Agent Execution Loop Example
-Here's an example that demonstrates monitoring the agent's execution:
-
 ```python
 from llama_stack_client import LlamaStackClient, Agent, AgentEventLogger
@@ -1,4 +1,4 @@
-# Building AI Applications (Examples)
+# Building AI Applications

 Llama Stack provides all the building blocks needed to create sophisticated AI applications.

@@ -8,9 +8,9 @@ The best way to get started is to look at this notebook which walks through the
 Here are some key topics that will help you build effective agents:

-- **[RAG (Retrieval-Augmented Generation)](rag)**: Learn how to enhance your agents with external knowledge through retrieval mechanisms.
 - **[Agent](agent)**: Understand the components and design patterns of the Llama Stack agent framework.
 - **[Agent Execution Loop](agent_execution_loop)**: Understand how agents process information, make decisions, and execute actions in a continuous loop.
+- **[RAG (Retrieval-Augmented Generation)](rag)**: Learn how to enhance your agents with external knowledge through retrieval mechanisms.
 - **[Tools](tools)**: Extend your agents' capabilities by integrating with external tools and APIs.
 - **[Evals](evals)**: Evaluate your agents' effectiveness and identify areas for improvement.
 - **[Telemetry](telemetry)**: Monitor and analyze your agents' performance and behavior.
@@ -20,11 +20,12 @@ Here are some key topics that will help you build effective agents:
 :hidden:
 :maxdepth: 1

-rag
 agent
 agent_execution_loop
+rag
 tools
-evals
 telemetry
+evals
+advanced_agent_patterns
 safety
 ```
@@ -1,11 +1,11 @@
-## Retrieval Augmented Generation (RAG)
+## Using Retrieval Augmented Generation (RAG)

 RAG enables your applications to reference and recall information from previous interactions or external documents.

 Llama Stack organizes the APIs that enable RAG into three layers:
-1. The lowermost APIs deal with raw storage and retrieval. These include Vector IO, KeyValue IO (coming soon) and Relational IO (also coming soon.).
-2. The next is the "Rag Tool", a first-class tool as part of the [Tools API](tools.md) that allows you to ingest documents (from URLs, files, etc) with various chunking strategies and query them smartly.
-3. Finally, it all comes together with the top-level ["Agents" API](agent.md) that allows you to create agents that can use the tools to answer questions, perform tasks, and more.
+- the lowermost APIs deal with raw storage and retrieval. These include Vector IO, KeyValue IO (coming soon) and Relational IO (also coming soon.)
+- next is the "Rag Tool", a first-class tool as part of the Tools API that allows you to ingest documents (from URLs, files, etc) with various chunking strategies and query them smartly.
+- finally, it all comes together with the top-level "Agents" API that allows you to create agents that can use the tools to answer questions, perform tasks, and more.

 <img src="rag.png" alt="RAG System" width="50%">

@@ -17,19 +17,14 @@ We may add more storage types like Graph IO in the future.

 ### Setting up Vector DBs

-For this guide, we will use [Ollama](https://ollama.com/) as the inference provider.
-Ollama is an LLM runtime that allows you to run Llama models locally.
-
 Here's how to set up a vector database for RAG:

 ```python
 # Create http client
-import os
 from llama_stack_client import LlamaStackClient

 client = LlamaStackClient(base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}")


 # Register a vector db
 vector_db_id = "my_documents"
 response = client.vector_dbs.register(
@@ -38,53 +33,17 @@ response = client.vector_dbs.register(
     embedding_dimension=384,
     provider_id="faiss",
 )
-```
-
-### Ingesting Documents
-You can ingest documents into the vector database using two methods: directly inserting pre-chunked
-documents or using the RAG Tool.
-```python
 # You can insert a pre-chunked document directly into the vector db
 chunks = [
     {
+        "document_id": "doc1",
         "content": "Your document text here",
         "mime_type": "text/plain",
-        "metadata": {
-            "document_id": "doc1",
-            "author": "Jane Doe",
-        },
     },
 ]
 client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks)
-```
-
-#### Using Precomputed Embeddings
-If you decide to precompute embeddings for your documents, you can insert them directly into the vector database by
-including the embedding vectors in the chunk data. This is useful if you have a separate embedding service or if you
-want to customize the ingestion process.
-```python
-chunks_with_embeddings = [
-    {
-        "content": "First chunk of text",
-        "mime_type": "text/plain",
-        "embedding": [0.1, 0.2, 0.3, ...],  # Your precomputed embedding vector
-        "metadata": {"document_id": "doc1", "section": "introduction"},
-    },
-    {
-        "content": "Second chunk of text",
-        "mime_type": "text/plain",
-        "embedding": [0.2, 0.3, 0.4, ...],  # Your precomputed embedding vector
-        "metadata": {"document_id": "doc1", "section": "methodology"},
-    },
-]
-client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks_with_embeddings)
-```
-When providing precomputed embeddings, ensure the embedding dimension matches the embedding_dimension specified when
-registering the vector database.
-
-### Retrieval
-You can query the vector database to retrieve documents based on their embeddings.
-```python
 # You can then query for these chunks
 chunks_response = client.vector_io.query(
     vector_db_id=vector_db_id, query="What do you know about..."
@@ -93,9 +52,7 @@ chunks_response = client.vector_io.query(

 ### Using the RAG Tool

-A better way to ingest documents is to use the RAG Tool. This tool allows you to ingest documents from URLs, files, etc.
-and automatically chunks them into smaller pieces. More examples for how to format a RAGDocument can be found in the
-[appendix](#more-ragdocument-examples).
+A better way to ingest documents is to use the RAG Tool. This tool allows you to ingest documents from URLs, files, etc. and automatically chunks them into smaller pieces.

 ```python
 from llama_stack_client import RAGDocument
@@ -124,17 +81,6 @@ results = client.tool_runtime.rag_tool.query(
 )
 ```

-You can configure how the RAG tool adds metadata to the context if you find it useful for your application. Simply add:
-```python
-# Query documents
-results = client.tool_runtime.rag_tool.query(
-    vector_db_ids=[vector_db_id],
-    content="What do you know about...",
-    query_config={
-        "chunk_template": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n",
-    },
-)
-```
 ### Building RAG-Enhanced Agents

 One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example:
@@ -152,12 +98,6 @@ agent = Agent(
             "name": "builtin::rag/knowledge_search",
             "args": {
                 "vector_db_ids": [vector_db_id],
-                # Defaults
-                "query_config": {
-                    "chunk_size_in_tokens": 512,
-                    "chunk_overlap_in_tokens": 0,
-                    "chunk_template": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n",
-                },
             },
         }
     ],
@@ -222,38 +162,3 @@ for vector_db_id in client.vector_dbs.list():
     print(f"Unregistering vector database: {vector_db_id.identifier}")
     client.vector_dbs.unregister(vector_db_id=vector_db_id.identifier)
 ```
-
-### Appendix
-
-#### More RAGDocument Examples
-```python
-from llama_stack_client import RAGDocument
-import base64
-
-RAGDocument(document_id="num-0", content={"uri": "file://path/to/file"})
-RAGDocument(document_id="num-1", content="plain text")
-RAGDocument(
-    document_id="num-2",
-    content={
-        "type": "text",
-        "text": "plain text input",
-    },  # for inputs that should be treated as text explicitly
-)
-RAGDocument(
-    document_id="num-3",
-    content={
-        "type": "image",
-        "image": {"url": {"uri": "https://mywebsite.com/image.jpg"}},
-    },
-)
-B64_ENCODED_IMAGE = base64.b64encode(
-    requests.get(
-        "https://raw.githubusercontent.com/meta-llama/llama-stack/refs/heads/main/docs/_static/llama-stack.png"
-    ).content
-)
-RAGDocuemnt(
-    document_id="num-4",
-    content={"type": "image", "image": {"data": B64_ENCODED_IMAGE}},
-)
-```
-for more strongly typed interaction use the typed dicts found [here](https://github.com/meta-llama/llama-stack-client-python/blob/38cd91c9e396f2be0bec1ee96a19771582ba6f17/src/llama_stack_client/types/shared_params/document.py).
Some files were not shown because too many files have changed in this diff.